[PATCH v1] RISC-V: Support {U}INT64 to FP16 auto-vectorization

2023-09-27 Thread pan2 . li
From: Pan Li 

This patch would like to support the auto-vectorization from
the INT64 to FP16. We take the following steps for the conversion.

* INT64 to FP32.
* FP32 to FP16.

Given sample code as below:
void
test_func (int64_t * __restrict a, _Float16 *b, unsigned n)
{
  for (unsigned i = 0; i < n; i++)
b[i] = (_Float16) (a[i]);
}

Before this patch:
test.c:6:26: missed: couldn't vectorize loop
test.c:6:26: missed: not vectorized: unsupported data-type
ld  a0,0(s0)
call__floatdihf
fsh fa0,0(s1)
addis0,s0,8
addis1,s1,2
bne s2,s0,.L3
ld  ra,24(sp)
ld  s0,16(sp)
ld  s1,8(sp)
ld  s2,0(sp)
addisp,sp,32

After this patch:
vsetvli a5,a2,e8,mf8,ta,ma
vle64.v v1,0(a0)
vsetvli a4,zero,e32,mf2,ta,ma
vfncvt.f.x.wv1,v1
vsetvli zero,zero,e16,mf4,ta,ma
vfncvt.f.f.wv1,v1
vsetvli zero,a2,e16,mf4,ta,ma
vse16.v v1,0(a1)

Please note VLS mode is also involved in this patch and covered by the
test cases.

PR target/111506

gcc/ChangeLog:

* config/riscv/autovec.md (2):
* config/riscv/vector-iterators.md:

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/conversions/vfncvt-itof-rv32gcv.c:
Adjust checker.
* gcc.target/riscv/rvv/autovec/conversions/vfncvt-itof-rv64gcv.c:
Ditto.
* gcc.target/riscv/rvv/autovec/unop/cvt-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/cvt-1.c: New test.
* gcc.target/riscv/rvv/autovec/vls/cvt-0.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   | 24 ++
 gcc/config/riscv/vector-iterators.md  | 38 +++
 .../autovec/conversions/vfncvt-itof-rv32gcv.c |  5 +-
 .../autovec/conversions/vfncvt-itof-rv64gcv.c |  5 +-
 .../gcc.target/riscv/rvv/autovec/unop/cvt-0.c | 21 +
 .../gcc.target/riscv/rvv/autovec/unop/cvt-1.c | 22 +
 .../gcc.target/riscv/rvv/autovec/vls/cvt-0.c  | 47 +++
 7 files changed, 158 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/cvt-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/cvt-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/cvt-0.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index cd0cbdd2889..6dd3b96a423 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -974,6 +974,30 @@ (define_insn_and_split "2"
 }
 [(set_attr "type" "vfncvtitof")])
 
+;; This operation can be performed in the loop vectorizer but unfortunately
+;; not applicable for now. We can remove this pattern after loop vectorizer
+;; is able to take care of INT64 to FP16 conversion.
+(define_insn_and_split "2"
+  [(set (match_operand:  0 "register_operand")
+   (any_float:
+ (match_operand:VWWCONVERTI 1 "register_operand")))]
+  "TARGET_VECTOR && TARGET_ZVFH && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+rtx single = gen_reg_rtx (mode); /* Get vector SF mode.  */
+
+/* Step-1, INT64 => FP32.  */
+emit_insn (gen_2 (single, operands[1]));
+/* Step-2, FP32 => FP16.  */
+emit_insn (gen_trunc2 (operands[0], single));
+
+DONE;
+  }
+  [(set_attr "type" "vfncvtitof")]
+)
+
 ;; =
 ;; == Unary arithmetic
 ;; =
diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index b6cd872eb42..c9a7344b1bc 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -1247,6 +1247,24 @@ (define_mode_iterator VWCONVERTI [
   (V512DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && 
TARGET_VECTOR_ELEN_FP_32 && TARGET_MIN_VLEN >= 4096")
 ])
 
+(define_mode_iterator VWWCONVERTI [
+  (RVVM8DI "TARGET_VECTOR_ELEN_64 && TARGET_ZVFH")
+  (RVVM4DI "TARGET_VECTOR_ELEN_64 && TARGET_ZVFH")
+  (RVVM2DI "TARGET_VECTOR_ELEN_64 && TARGET_ZVFH")
+  (RVVM1DI "TARGET_VECTOR_ELEN_64 && TARGET_ZVFH")
+
+  (V1DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH")
+  (V2DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH")
+  (V4DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH")
+  (V8DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH && 
TARGET_MIN_VLEN >= 64")
+  (V16DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH && 
TARGET_MIN_VLEN >= 128")
+  (V32DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH && 
TARGET_MIN_VLEN >= 256")
+  (V64DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH && 
TARGET_MIN_VLEN >= 512")
+  (V128DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH && 
TARGET_MIN_VLEN >= 1024")
+  (V256DI "TARGET_VECTOR_VLS && TARGET_VECTOR_ELEN_64 && TARGET_ZVFH && 

Re: [PATCH 00/18] Support -mevex512 for AVX512

2023-09-27 Thread ZiNgA BuRgA

That sounds about right.  The code I had in mind would perhaps look like:


#if defined(__AVX512BW__) && defined(__AVX512VL__)
    #if defined(__EVEX256__) && !defined(__EVEX512__)
    // compiled code is AVX10.1/256 and AVX512 compatible
    #else
    // compiled code is only AVX512 compatible
    #endif

    // some code which only uses 256b instructions
    __m256i...
#endif


The '__EVEX256__' define would avoid needing to check compiler versions.
Hopefully you can align it with whatever Clang does: 
https://discourse.llvm.org/t/rfc-design-for-avx10-feature-support/72661/18


Thanks!

On 28/09/2023 12:26 pm, Hu, Lin1 wrote:

Hi,

Thanks for your reply.

I'd like to verify that our understanding of your requirements is correct, and 
that __EVEX256__ can be considered a default macro to determine whether the 
compiler supports the __EVEX***__ series of switches.

For example:

I have a segment of code like:
#if defined(__EVEX512__):
__mm512.*__;
#else
__mm256.*__;
#endif

But __EVEX512__ being undefined doesn't mean I only need 256bit; maybe I use 
gcc-13, so I can still use 512bit.

So the code should be:
#if defined(__EVEX512__):
__mm512.*__;
#elif defined(__EVEX256__):
__mm256.*__;
#else
__mm512.*__;
#endif

If we understand correctly, we'll consider the request. But since we're about 
to have a vacation, follow-up replies may be a bit slower.

BRs,
Lin

-Original Message-
From: ZiNgA BuRgA 
Sent: Thursday, September 28, 2023 8:32 AM
To: Hu, Lin1 ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH 00/18] Support -mevex512 for AVX512

Thanks for the new patch!

I see that there's a new __EVEX512__ define.  Will there be some __EVEX256__ 
(or maybe some max EVEX width) define, so that code can detect whether the 
compiler supports AVX10.1/256 without resorting to version checks?






RE: [PATCH 00/18] Support -mevex512 for AVX512

2023-09-27 Thread Hu, Lin1
Hi, 

Thanks for your reply.

I'd like to verify that our understanding of your requirements is correct, and 
that __EVEX256__ can be considered a default macro to determine whether the 
compiler supports the __EVEX***__ series of switches. 

For example:

I have a segment of code like:
#if defined(__EVEX512__):
__mm512.*__;
#else
__mm256.*__;
#endif

But __EVEX512__ being undefined doesn't mean I only need 256bit; maybe I use 
gcc-13, so I can still use 512bit.

So the code should be:
#if defined(__EVEX512__):
__mm512.*__;
#elif defined(__EVEX256__):
__mm256.*__;
#else
__mm512.*__;
#endif

If we understand correctly, we'll consider the request. But since we're about 
to have a vacation, follow-up replies may be a bit slower.

BRs,
Lin

-Original Message-
From: ZiNgA BuRgA  
Sent: Thursday, September 28, 2023 8:32 AM
To: Hu, Lin1 ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH 00/18] Support -mevex512 for AVX512

Thanks for the new patch!

I see that there's a new __EVEX512__ define.  Will there be some __EVEX256__ 
(or maybe some max EVEX width) define, so that code can detect whether the 
compiler supports AVX10.1/256 without resorting to version checks?




Re: Re: [PATCH v2] RISC-V: Bugfix for RTL check[PR111533]

2023-09-27 Thread Li Xu
Committed, thanks juzhe.
--
Li Xu
>LGTM. Thanks for fixing it.
>
>
>
>juzhe.zh...@rivai.ai
>
>From: Li Xu
>Date: 2023-09-28 09:33
>To: gcc-patches
>CC: kito.cheng; palmer; juzhe.zhong; xuli
>Subject: [PATCH v2] RISC-V: Bugfix for RTL check[PR111533]
>From: xuli 
>
>Consider the following situation:
>BB5: local_dem(RVV Insn 1, AVL(reg zero))
>RVV Insn 1: vmv.s.x, AVL (const_int 1)
>RVV Insn 2: vredsum.vs, AVL(reg zero)
>
>vmv.s.x has vl operand, the following code will get
>avl (cosnt_int) from RVV Insn 1.
>rtx avl = has_vl_op (insn->rtl ()) ? get_vl (insn->rtl ())
>   : dem.get_avl ();
>
>If use REGNO for const_int, the compiler will crash:
>
>during RTL pass: vsetvl
>res_debug.c: In function '__dn_count_labels':
>res_debug.c:1050:1: internal compiler error: RTL check: expected code 'reg',
>have 'const_int' in rhs_regno, at rtl.h:1934
>1050 | }
>  | ^
>0x8fb169 rtl_check_failed_code1(rtx_def const*, rtx_code, char const*, int, 
>char const*)
>../.././gcc/gcc/rtl.cc:770
>0x1399818 rhs_regno(rtx_def const*)
>../.././gcc/gcc/rtl.h:1934
>0x1399818 anticipatable_occurrence_p
>../.././gcc/gcc/config/riscv/riscv-vsetvl.cc:348
>
>So in this case avl should be obtained from dem.
>
>Another issue is caused by the following code:
>HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;
>
>during RTL pass: expand
>../../.././gcc/libgfortran/generated/matmul_c4.c: In function 'matmul_c4':
>../../.././gcc/libgfortran/generated/matmul_c4.c:2906:39: internal compiler 
>error: RTL check:
>expected code 'const_int', have 'const_poly_int' in expand_const_vector,
>at config/riscv/riscv-v.cc:1149
>
>The builder.elt (i) can be either const_int or const_poly_int.
>
>PR target/111533
>
>gcc/ChangeLog:
>
>* config/riscv/riscv-v.cc (expand_const_vector): Fix bug.
>* config/riscv/riscv-vsetvl.cc (anticipatable_occurrence_p): Fix bug.
>
>gcc/testsuite/ChangeLog:
>
>* gcc.target/riscv/rvv/base/pr111533-1.c: New test.
>* gcc.target/riscv/rvv/base/pr111533-2.c: New test.
>---
>gcc/config/riscv/riscv-v.cc   |  5 ++--
>gcc/config/riscv/riscv-vsetvl.cc  |  3 +-
>.../gcc.target/riscv/rvv/base/pr111533-1.c    | 15 ++
>.../gcc.target/riscv/rvv/base/pr111533-2.c    | 29 +++
>4 files changed, 48 insertions(+), 4 deletions(-)
>create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
>create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
>
>diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
>index 359fb2ced8b..26700cfc732 100644
>--- a/gcc/config/riscv/riscv-v.cc
>+++ b/gcc/config/riscv/riscv-v.cc
>@@ -1149,8 +1149,9 @@ expand_const_vector (rtx target, rtx src)
>  for (unsigned int i = 0; i < v.npatterns (); ++i)
>{
>  /* Calculate the diff between the target sequence and
>-  vid sequence.  */
>-   HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;
>+  vid sequence.  The elt (i) can be either const_int or
>+  const_poly_int. */
>+   poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
>  v.quick_push (gen_int_mode (diff, v.inner_mode ()));
>}
>  /* Step 2: Generate result = VID + diff.  */
>diff --git a/gcc/config/riscv/riscv-vsetvl.cc 
>b/gcc/config/riscv/riscv-vsetvl.cc
>index 7af33e7ea6f..af8c31d873c 100644
>--- a/gcc/config/riscv/riscv-vsetvl.cc
>+++ b/gcc/config/riscv/riscv-vsetvl.cc
>@@ -307,8 +307,7 @@ anticipatable_occurrence_p (const bb_info *bb, const 
>vector_insn_info dem)
>   if (dem.has_avl_reg ())
> {
>   /* rs1 (avl) are not modified in the basic block prior to the VSETVL.  
>*/
>-  rtx avl
>- = has_vl_op (insn->rtl ()) ? get_vl (insn->rtl ()) : dem.get_avl ();
>+  rtx avl = dem.get_avl_or_vl_reg ();
>   if (dem.dirty_p ())
>{
>  gcc_assert (!vsetvl_insn_p (insn->rtl ()));
>diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c 
>b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
>new file mode 100644
>index 000..aba26dfac89
>--- /dev/null
>+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
>@@ -0,0 +1,15 @@
>+/* { dg-do compile } */
>+/* { dg-options "-march=rv64gcv -mabi=lp64d -O2 -ffast-math -ftree-vectorize" 
>} */
>+
>+#include 
>+
>+typedef _Complex float GFC_COMPLEX_4;
>+
>+void
>+test (GFC_COMPLEX_4 *a, GFC_COMPLEX_4 *b, GFC_COMPLEX_4 c, ptrdiff_t i, 
>ptrdiff_t j)
>+{
>+  ptrdiff_t l;
>+  for (l = 0; l <= i; ++l)
>+    c += b[l] * a[j];
>+  b[j] = c;
>+}
>diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c 
>b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
>new file mode 100644
>index 000..a4d2011b74b
>--- /dev/null
>+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
>@@ -0,0 +1,29 @@
>+/* { dg-do compile } */
>+/* { dg-options "-march=rv64gcv -mabi=lp64d -O2" } */
>+
>+#include 
>+
>+/* Return the number of DNS hierarchy levels in the name. */
>+int
>+test (const char *name) {
>+ int i, len, count;
>+
>+ len = strlen(name);
>+ for (i = 0, count = 0; i < len; i++) {

Re: [PATCH V4 2/2] rs6000: use mtvsrws to move sf from si p9

2023-09-27 Thread Jiufu Guo


Hi,

"Kewen.Lin"  writes:

> Hi Jeff,
>
> on 2023/8/30 15:43, Jiufu Guo wrote:
>> Hi,
>> 
>> As mentioned in PR108338, on p9, we could use mtvsrws to implement
>> the bitcast from SI to SF (or lowpart DI to SF).
>> 
>> For code:
>>   *(long long*)buff = di;
>>   float f = *(float*)(buff);
>> 
>> "sldi 9,3,32 ; mtvsrd 1,9 ; xscvspdpn 1,1" is generated.
>> A better one would be "mtvsrws 1,3 ; xscvspdpn 1,1".
>> 
>> Compare with previous patch:
>> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623533.html
>> "highpart DI-->SF" is put to a separate patch.
>> 
>> Pass bootstrap and regression on ppc64{,le}.
>> Is this ok for trunk?
>> 
>> BR,
>> Jeff (Jiufu Guo)
>> 
> Nit: Missing a PR marker line.
Ok, this patch would share the PR108338.
>
>> gcc/ChangeLog:
>> 
>>  * config/rs6000/rs6000.md (movsf_from_si): Update to generate mtvsrws
>>  for P9.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>  * gcc.target/powerpc/pr108338.c: Updated to check mtvsrws for p9.
>> 
>> ---
>>  gcc/config/rs6000/rs6000.md | 25 -
>>  gcc/testsuite/gcc.target/powerpc/pr108338.c |  6 +++--
>>  2 files changed, 23 insertions(+), 8 deletions(-)
>> 
>> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
>> index 
>> 8c92cbf976de915136ad5dba24e69a363d21438d..c03e677bca79e8fb1acb276d07d0acfae009f6d8
>>  100644
>> --- a/gcc/config/rs6000/rs6000.md
>> +++ b/gcc/config/rs6000/rs6000.md
>> @@ -8280,13 +8280,26 @@ (define_insn_and_split "movsf_from_si"
>>  {
>>rtx op0 = operands[0];
>>rtx op1 = operands[1];
>> -  rtx op2 = operands[2];
>> -  rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
>> 
>> -  /* Move SF value to upper 32-bits for xscvspdpn.  */
>> -  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
>> -  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
>> -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
>> +  /* Move lowpart 32-bits from register for SFmode.  */
>> +  if (TARGET_P9_VECTOR)
>> +{
>> +  /* Using mtvsrws;xscvspdpn.  */
>> +  rtx op0_v = gen_rtx_REG (V4SImode, REGNO (op0));
>> +  emit_insn (gen_vsx_splat_v4si (op0_v, op1));
>> +  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
>> +}
>> +  else
>> +{
>> +  rtx op2 = operands[2];
>> +  rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
>> +
>> +  /* Using ashl;mtvsrd;xscvspdpn.  */
>
> Nit: Use sldi instead of ashl as the others are actual
> mnemonics but ashl isn't.
Oh, yes, thanks for your insightful review!
>
>> +  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
>> +  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
>> +  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
>> +}
>> +
>>DONE;
>>  }
>>[(set_attr "length"
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108338.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr108338.c
>> index 
>> 6db65595343c2407fc32f68f5f52a1f7196c371d..0565e5254ed0a8cc579cf505a3f865426dcf62ae
>>  100644
>> --- a/gcc/testsuite/gcc.target/powerpc/pr108338.c
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108338.c
>> @@ -19,9 +19,11 @@ float  __attribute__ ((noipa)) sf_from_di_off4 (long long 
>> l)
>> 
>>  /* Under lp64, parameter 'l' is in one DI reg, then bitcast sub DI to SF. */
>>  /* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 { target { lp64 && 
>> has_arch_pwr8 } } } } */
>> -/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { lp64 && 
>> has_arch_pwr8 } } } } */
>> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { lp64 && { 
>> has_arch_pwr8 && { ! has_arch_pwr9 } } } } } } */
>> +/* { dg-final { scan-assembler-times {\msldi\M} 1 { target { lp64 && { 
>> has_arch_pwr8 && { ! has_arch_pwr9 } } } } } } */
>> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 1 { target { lp64 && 
>> has_arch_pwr9 } } } } */
>> +/* { dg-final { scan-assembler-times {\mmtvsrws\M} 1 { target { lp64 && 
>> has_arch_pwr9 } } } } */
>>  /* { dg-final { scan-assembler-times {\mrldicr\M} 1 { target { lp64 && 
>> has_arch_pwr8 } } } } */
>> -/* { dg-final { scan-assembler-times {\msldi\M} 1 { target { lp64 && 
>> has_arch_pwr8 } } } } */
>> 
>
> This part might need a fresh as the comments to patch 1/2.
Yes, thanks!
>
> The others look good to me, thanks!

BR,
Jeff (Jiufu Guo)
>
> BR,
> Kewen


Re: [PATCH v2] RISC-V: Bugfix for RTL check[PR111533]

2023-09-27 Thread juzhe.zh...@rivai.ai
LGTM. Thanks for fixing it.



juzhe.zh...@rivai.ai
 
From: Li Xu
Date: 2023-09-28 09:33
To: gcc-patches
CC: kito.cheng; palmer; juzhe.zhong; xuli
Subject: [PATCH v2] RISC-V: Bugfix for RTL check[PR111533]
From: xuli 
 
Consider the following situation:
BB5: local_dem(RVV Insn 1, AVL(reg zero))
RVV Insn 1: vmv.s.x, AVL (const_int 1)
RVV Insn 2: vredsum.vs, AVL(reg zero)
 
vmv.s.x has vl operand, the following code will get
avl (const_int) from RVV Insn 1.
rtx avl = has_vl_op (insn->rtl ()) ? get_vl (insn->rtl ())
   : dem.get_avl ();
 
If use REGNO for const_int, the compiler will crash:
 
during RTL pass: vsetvl
res_debug.c: In function '__dn_count_labels':
res_debug.c:1050:1: internal compiler error: RTL check: expected code 'reg',
have 'const_int' in rhs_regno, at rtl.h:1934
1050 | }
  | ^
0x8fb169 rtl_check_failed_code1(rtx_def const*, rtx_code, char const*, int, 
char const*)
../.././gcc/gcc/rtl.cc:770
0x1399818 rhs_regno(rtx_def const*)
../.././gcc/gcc/rtl.h:1934
0x1399818 anticipatable_occurrence_p
../.././gcc/gcc/config/riscv/riscv-vsetvl.cc:348
 
So in this case avl should be obtained from dem.
 
Another issue is caused by the following code:
HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;
 
during RTL pass: expand
../../.././gcc/libgfortran/generated/matmul_c4.c: In function 'matmul_c4':
../../.././gcc/libgfortran/generated/matmul_c4.c:2906:39: internal compiler 
error: RTL check:
expected code 'const_int', have 'const_poly_int' in expand_const_vector,
at config/riscv/riscv-v.cc:1149
 
The builder.elt (i) can be either const_int or const_poly_int.
 
PR target/111533
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (expand_const_vector): Fix bug.
* config/riscv/riscv-vsetvl.cc (anticipatable_occurrence_p): Fix bug.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/pr111533-1.c: New test.
* gcc.target/riscv/rvv/base/pr111533-2.c: New test.
---
gcc/config/riscv/riscv-v.cc   |  5 ++--
gcc/config/riscv/riscv-vsetvl.cc  |  3 +-
.../gcc.target/riscv/rvv/base/pr111533-1.c| 15 ++
.../gcc.target/riscv/rvv/base/pr111533-2.c| 29 +++
4 files changed, 48 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 359fb2ced8b..26700cfc732 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1149,8 +1149,9 @@ expand_const_vector (rtx target, rtx src)
  for (unsigned int i = 0; i < v.npatterns (); ++i)
{
  /* Calculate the diff between the target sequence and
-  vid sequence.  */
-   HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;
+  vid sequence.  The elt (i) can be either const_int or
+  const_poly_int. */
+   poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
  v.quick_push (gen_int_mode (diff, v.inner_mode ()));
}
  /* Step 2: Generate result = VID + diff.  */
diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 7af33e7ea6f..af8c31d873c 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -307,8 +307,7 @@ anticipatable_occurrence_p (const bb_info *bb, const 
vector_insn_info dem)
   if (dem.has_avl_reg ())
 {
   /* rs1 (avl) are not modified in the basic block prior to the VSETVL.  */
-  rtx avl
- = has_vl_op (insn->rtl ()) ? get_vl (insn->rtl ()) : dem.get_avl ();
+  rtx avl = dem.get_avl_or_vl_reg ();
   if (dem.dirty_p ())
{
  gcc_assert (!vsetvl_insn_p (insn->rtl ()));
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
new file mode 100644
index 000..aba26dfac89
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O2 -ffast-math -ftree-vectorize" 
} */
+
+#include 
+
+typedef _Complex float GFC_COMPLEX_4;
+
+void
+test (GFC_COMPLEX_4 *a, GFC_COMPLEX_4 *b, GFC_COMPLEX_4 c, ptrdiff_t i, 
ptrdiff_t j)
+{
+  ptrdiff_t l;
+  for (l = 0; l <= i; ++l)
+c += b[l] * a[j];
+  b[j] = c;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
new file mode 100644
index 000..a4d2011b74b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O2" } */
+
+#include 
+
+/* Return the number of DNS hierarchy levels in the name. */
+int
+test (const char *name) {
+ int i, len, count;
+
+ len = strlen(name);
+ for (i = 0, count = 0; i < len; i++) {
+ /* XXX need to check for \. or use named's nlabels(). */
+ if (name[i] == '.')
+ count++;
+ }
+
+ /* don't count initial wildcard */
+ if (name[0] == '*')
+ if (count)
+ count--;
+

Re: [PATCH V4 1/2] rs6000: optimize moving to sf from highpart di

2023-09-27 Thread Jiufu Guo


Hi,

"Kewen.Lin"  writes:

> Hi Jeff,
>
> on 2023/8/30 15:43, Jiufu Guo wrote:
>> Hi,
>> 
>> Currently, we have the pattern "movsf_from_si2" which was trying
>> to support moving high part DI to SF.
>> 
>> The pattern looks like: XX:SF=bitcast:SF(subreg(YY:DI>>32),0)
>> It only accepts the "ashiftrt" for ">>", but "lshiftrt" is also ok.
>> And the offset of "subreg" is hard code 0, which only works for LE.
>> 
>> "movsf_from_si2" is updated to cover BE for "subreg", and cover
>> the logical shift for ":DI>>32".
>> 
>> Pass bootstrap and regression on ppc64{,le}.
>> Is this ok for trunk?
>> 
>> BR,
>> Jeff (Jiufu Guo)
>> 
>>  PR target/108338
>> 
>> gcc/ChangeLog:
>> 
>>  * config/rs6000/predicates.md (lowpart_subreg_operator): New
>>  define_predicate.
>>  * config/rs6000/rs6000.md (any_rshift): New code_iterator.
>>  (movsf_from_si2): Rename to ...
>>  (movsf_from_si2_): ... this.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>  * gcc.target/powerpc/pr108338.c: New test.
>> 
>> ---
>>  gcc/config/rs6000/predicates.md |  5 +++
>>  gcc/config/rs6000/rs6000.md | 11 +++---
>>  gcc/testsuite/gcc.target/powerpc/pr108338.c | 40 +
>>  3 files changed, 51 insertions(+), 5 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108338.c
>> 
>> diff --git a/gcc/config/rs6000/predicates.md 
>> b/gcc/config/rs6000/predicates.md
>> index 
>> 3552d908e9d149a30993e3e6568466de537336be..e25b3b4864f681d47e9d5c2eb88bcde0aea6d17b
>>  100644
>> --- a/gcc/config/rs6000/predicates.md
>> +++ b/gcc/config/rs6000/predicates.md
>> @@ -2098,3 +2098,8 @@ (define_predicate "macho_pic_address"
>>else
>>  return false;
>>  })
>> +
>> +(define_predicate "lowpart_subreg_operator"
>> +  (and (match_code "subreg")
>> +   (match_test "subreg_lowpart_offset (mode, GET_MODE (SUBREG_REG (op)))
>> +== SUBREG_BYTE (op)")))
>> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
>> index 
>> 1a9a7b1a47918f39fc91038607f21a8ba9a2e740..8c92cbf976de915136ad5dba24e69a363d21438d
>>  100644
>> --- a/gcc/config/rs6000/rs6000.md
>> +++ b/gcc/config/rs6000/rs6000.md
>> @@ -8299,18 +8299,19 @@ (define_insn_and_split "movsf_from_si"
>>  "*,  *, p9v,   p8v,   *, *,
>>   p8v,p8v,   p8v,   *")])
>> 
>> +(define_code_iterator any_rshift [ashiftrt lshiftrt])
>
> Nit: Could we name this as any_shiftrt instead and move this close to the
> existing any_* code_iterator?
ok, thanks!
>
>> +
>>  ;; For extracting high part element from DImode register like:
>>  ;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>  ;; split it before reload with "and mask" to avoid generating shift right
>>  ;; 32 bit then shift left 32 bit.
>> -(define_insn_and_split "movsf_from_si2"
>> +(define_insn_and_split "movsf_from_si2_"
>>[(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>>  (unspec:SF
>> - [(subreg:SI
>> -   (ashiftrt:DI
>> + [(match_operator:SI 3 "lowpart_subreg_operator"
>> +   [(any_rshift:DI
>>  (match_operand:DI 1 "input_operand" "r")
>> -(const_int 32))
>> -   0)]
>> +(const_int 32))])]
>>   UNSPEC_SF_FROM_SI))
>>(clobber (match_scratch:DI 2 "=r"))]
>>"TARGET_NO_SF_SUBREG"
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108338.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr108338.c
>> new file mode 100644
>> index 
>> ..6db65595343c2407fc32f68f5f52a1f7196c371d
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108338.c
>> @@ -0,0 +1,40 @@
>> +// { dg-do run }
>> +// { dg-options "-O2 -save-temps" }
>
> I think we need to check effective target hard_float to ensure
> the expected assembly?
Thanks for pointing out this!
>
>> +
>> +float __attribute__ ((noipa)) sf_from_di_off0 (long long l)
>> +{
>> +  char buff[16];
>> +  *(long long*)buff = l;
>> +  float f = *(float*)(buff);
>> +  return f;
>> +}
>> +
>> +float  __attribute__ ((noipa)) sf_from_di_off4 (long long l)
>> +{
>> +  char buff[16];
>> +  *(long long*)buff = l;
>> +  float f = *(float*)(buff + 4);
>> +  return f; 
>> +}
>> +
>
> IIUC, this patch is to deal with high 32-bit, but why you proposed
> two functions is due to endianness difference, then could we use
> endianness macro like __LITTLE_ENDIAN__ to simplify the corresponding
> offset value (0 on BE, 4 on LE)?  so that we have only function and
> IMHO it's more focused.
Yes, this patch is for high part of DI, next patch is for low part of
DI.  Great suggestion, thanks!
>
>> +/* Under lp64, parameter 'l' is in one DI reg, then bitcast sub DI to SF. */
>> +/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 { target { lp64 && 
>> has_arch_pwr8 } } } } */
>> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { lp64 && 
>> has_arch_pwr8 } } } } */
>> +/* { dg-final { scan-assembler-times 

[PATCH v2] RISC-V: Bugfix for RTL check[PR111533]

2023-09-27 Thread Li Xu
From: xuli 

Consider the following situation:
BB5: local_dem(RVV Insn 1, AVL(reg zero))
RVV Insn 1: vmv.s.x, AVL (const_int 1)
RVV Insn 2: vredsum.vs, AVL(reg zero)

vmv.s.x has vl operand, the following code will get
avl (const_int) from RVV Insn 1.
rtx avl = has_vl_op (insn->rtl ()) ? get_vl (insn->rtl ())
   : dem.get_avl ();

If use REGNO for const_int, the compiler will crash:

during RTL pass: vsetvl
res_debug.c: In function '__dn_count_labels':
res_debug.c:1050:1: internal compiler error: RTL check: expected code 'reg',
have 'const_int' in rhs_regno, at rtl.h:1934
 1050 | }
  | ^
0x8fb169 rtl_check_failed_code1(rtx_def const*, rtx_code, char const*, int, 
char const*)
../.././gcc/gcc/rtl.cc:770
0x1399818 rhs_regno(rtx_def const*)
../.././gcc/gcc/rtl.h:1934
0x1399818 anticipatable_occurrence_p
../.././gcc/gcc/config/riscv/riscv-vsetvl.cc:348

So in this case avl should be obtained from dem.

Another issue is caused by the following code:
HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;

during RTL pass: expand
../../.././gcc/libgfortran/generated/matmul_c4.c: In function 'matmul_c4':
../../.././gcc/libgfortran/generated/matmul_c4.c:2906:39: internal compiler 
error: RTL check:
expected code 'const_int', have 'const_poly_int' in expand_const_vector,
at config/riscv/riscv-v.cc:1149

The builder.elt (i) can be either const_int or const_poly_int.

PR target/111533

gcc/ChangeLog:

* config/riscv/riscv-v.cc (expand_const_vector): Fix bug.
* config/riscv/riscv-vsetvl.cc (anticipatable_occurrence_p): Fix bug.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/pr111533-1.c: New test.
* gcc.target/riscv/rvv/base/pr111533-2.c: New test.
---
 gcc/config/riscv/riscv-v.cc   |  5 ++--
 gcc/config/riscv/riscv-vsetvl.cc  |  3 +-
 .../gcc.target/riscv/rvv/base/pr111533-1.c| 15 ++
 .../gcc.target/riscv/rvv/base/pr111533-2.c| 29 +++
 4 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 359fb2ced8b..26700cfc732 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -1149,8 +1149,9 @@ expand_const_vector (rtx target, rtx src)
  for (unsigned int i = 0; i < v.npatterns (); ++i)
{
  /* Calculate the diff between the target sequence and
-vid sequence.  */
- HOST_WIDE_INT diff = INTVAL (builder.elt (i)) - i;
+vid sequence.  The elt (i) can be either const_int or
+const_poly_int. */
+ poly_int64 diff = rtx_to_poly_int64 (builder.elt (i)) - i;
  v.quick_push (gen_int_mode (diff, v.inner_mode ()));
}
  /* Step 2: Generate result = VID + diff.  */
diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 7af33e7ea6f..af8c31d873c 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -307,8 +307,7 @@ anticipatable_occurrence_p (const bb_info *bb, const 
vector_insn_info dem)
   if (dem.has_avl_reg ())
 {
   /* rs1 (avl) are not modified in the basic block prior to the VSETVL.  */
-  rtx avl
-   = has_vl_op (insn->rtl ()) ? get_vl (insn->rtl ()) : dem.get_avl ();
+  rtx avl = dem.get_avl_or_vl_reg ();
   if (dem.dirty_p ())
{
  gcc_assert (!vsetvl_insn_p (insn->rtl ()));
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
new file mode 100644
index 000..aba26dfac89
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O2 -ffast-math -ftree-vectorize" 
} */
+
+#include 
+
+typedef _Complex float GFC_COMPLEX_4;
+
+void
+test (GFC_COMPLEX_4 *a, GFC_COMPLEX_4 *b, GFC_COMPLEX_4 c, ptrdiff_t i, 
ptrdiff_t j)
+{
+  ptrdiff_t l;
+  for (l = 0; l <= i; ++l)
+c += b[l] * a[j];
+  b[j] = c;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
new file mode 100644
index 000..a4d2011b74b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr111533-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O2" } */
+
+#include <string.h>
+
+/* Return the number of DNS hierarchy levels in the name. */
+int
+test (const char *name) {
+   int i, len, count;
+
+   len = strlen(name);
+   for (i = 0, count = 0; i < len; i++) {
+   /* XXX need to check for \. or use named's nlabels(). */
+   if (name[i] == '.')
+   

RE: [PATCH v1] Mode-Switching: Add optional EMIT_AFTER hook

2023-09-27 Thread Li, Pan2
Almost forget about this patch, sorry for disturbing and kindly ping again.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of Li, Pan2 via Gcc-patches
Sent: Monday, September 11, 2023 4:37 PM
To: Jeff Law ; gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; Wang, Yanzhang ; 
kito.ch...@gmail.com
Subject: RE: [PATCH v1] Mode-Switching: Add optional EMIT_AFTER hook

Hi Jeff,

Kindly ping for the Patch V2 as below.

https://gcc.gnu.org/pipermail/gcc-patches/2023-August/628508.html

Pan

-Original Message-
From: Li, Pan2  
Sent: Friday, August 25, 2023 8:45 PM
To: Li, Pan2 ; Jeff Law ; 
gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; Wang, Yanzhang ; 
kito.ch...@gmail.com
Subject: RE: [PATCH v1] Mode-Switching: Add optional EMIT_AFTER hook

Hi Jeff,

> You might also peek at the RTL gcse/pre code which is also LCM based and 
> has the same class of problems.

I found a similar approach to take care of this in gcse.cc/pre_edge_insert with 
some comments as below.

  /* We can't insert anything on an abnormal and
   critical edge, so we insert the insn at the end of
   the previous block. There are several alternatives
   detailed in Morgans book P277 (sec 10.5) for
   handling this situation.  This one is easiest for
   now.  */

if (eg->flags & EDGE_ABNORMAL)
  insert_insn_end_basic_block (index_map[j], bb);
else
  {
  insn = process_insert_insn (index_map[j]);
  insert_insn_on_edge (insn, eg);
  }

It looks the insert_insn_end_basic_block is designed to handle the ABNORMAL 
edge by inserting at end of previous block from the comments.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of Li, Pan2 via Gcc-patches
Sent: Thursday, August 24, 2023 12:54 PM
To: Jeff Law ; gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; Wang, Yanzhang ; 
kito.ch...@gmail.com
Subject: RE: [PATCH v1] Mode-Switching: Add optional EMIT_AFTER hook

Thanks Jeff.

> That implies a save/restore pair around the call (possibly optimized so 
> that we minimize the number of save/restores).  I would have expected 
> x86 to already be doing this.  But maybe there's some ABI thing around 
> mmx vs x86 state that allows it to be avoided

Very similar to save/restore but optional.
If no static rounding mode intrinsic is used here, it is unnecessary to add 
save/restore
pair around the call. I bet mode-switching take care of this already.

Pan

-Original Message-
From: Jeff Law  
Sent: Thursday, August 24, 2023 7:27 AM
To: Li, Pan2 ; gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; Wang, Yanzhang ; 
kito.ch...@gmail.com
Subject: Re: [PATCH v1] Mode-Switching: Add optional EMIT_AFTER hook



On 8/23/23 08:54, Li, Pan2 wrote:
> Thanks Jeff for comments.
> 
>> Understood.  So the natural question is why does x86/sh not need this
>> for its mode switching?   Don't all the same issues exist on those
>> targets as well?
> 
> AFAIK, it comes from the different design principle between the risc-v and 
> x86/arm intrinsic API.
> The risc-v rvv FP rounding mode intrinsic API has one abstract level above 
> the insn itself, while
> the x86/arm only indicates the semantics of the insn.
> 
> For example, if one vector instruction VFADD doesn't have static rounding 
> mode (aka encoding rm in insn),
> there is no such intrinsic API containing a rounding mode argument in x86/arm. 
> While the risc-v fp
> vector intrinsic will always have static rounding mode API if the frm is 
> honored.
> 
> In short, the risc-v intrinsic API is closer to the end-user, while the 
> x86/arm intrinsic API is closer to the insn itself.
OK, but I'm still struggling to see how the distinction is important 
here.  Ultimately there's a state at a call site.  We need to make sure 
that state from the current function doesn't impact the callee and we 
need to make sure that the callee doesn't impact the state in the caller.

That implies a save/restore pair around the call (possibly optimized so 
that we minimize the number of save/restores).  I would have expected 
x86 to already be doing this.  But maybe there's some ABI thing around 
mmx vs x86 state that allows it to be avoided

> 
> For the rest part, will have a try based on your suggestion soon as I am in 
> the middle of something.
No problem.  Get to it when you can.  I think it affects you more than 
me :-)

jeff


Re: [PATCH 00/18] Support -mevex512 for AVX512

2023-09-27 Thread ZiNgA BuRgA

Thanks for the new patch!

I see that there's a new __EVEX512__ define.  Will there be some 
__EVEX256__ (or maybe some max EVEX width) define, so that code can 
detect whether the compiler supports AVX10.1/256 without resorting to 
version checks?





Re: [PATCH v3 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-09-27 Thread Waffl3x
Not to worry, I'm currently going through that process with the FSF, it
was confirmed that a pseudonym should be just fine. I don't know how
long the process takes but my goal is to get this in for GCC14, and
surely this won't take more than a month. One can only hope anyway.

On 2023-09-27 04:43 p.m., Hans-Peter Nilsson wrote:
>> Date: Tue, 26 Sep 2023 01:56:55 +
>> From: waffl3x 
> 
>> Signed-off-by: waffl3x 
> 
> I think I've read that you have to put your actual name in
> the DCO; using an alias (presumably) as above would be
> wrong.
> 
> Ah, it's on https://gcc.gnu.org/dco.html - the *second* DCO
> link; under "Signed-off-by", on
> https://gcc.gnu.org/contribute.html! "sorry, no pseudonyms
> or anonymous contributions".
> 
> (Also, from Some Source I Don't Remember: using an alias if
> you have FSF papers in place is ok; you can use a pseudonym
> if FSF can match it to papers on file that have your actual
> name or something to that effect.)
> 
> brgds, H-P



Re: committed [RISC-V]: Harden test scan patterns

2023-09-27 Thread Vineet Gupta




On 9/27/23 13:14, Jeff Law wrote:

It would help to describe how these patterns were under specified so
that folks don't continue to make the same mistake as new tests get 
added.


dg-final scan-assembler, scan-assembler-not, and scan-assembler-times
use a tcl regular expression (often referred to abbreviated as RE), as
described in https://www.tcl.tk/man/tcl8.4/TclCmd/re_syntax.html .

If your RE is not specific enough, it can match LTO information that the
compiler places into its assembly output when the relevant options are
provided, which is common when running tests where the test harness
iterates over a number of optimization option combinations.
Note that '.' is an atom that can match any character.  If you want to
match a dot specifically, you have to escape it with a backslash: '\.' .
When you are matching an instruction mnemonic, an effective way to
avoid matching in LTO information is to enforce matching of word start
(\m) and/or word end (\M) .
Note also that the backslash has to be quoted.  If the RE is enclosed in
'"' quotes, extra backslashes are needed.  That is not necessary when it
is enclosed in curly braces.

For example, "ld.w" will be matched in:

.ascii "h\227\022\212ld@w\251jr\254'\320\255vwj\252\026\016\364"

If you write {\mld\.w\M} instead, you avoid this problem.
OK.  So that naturally leads to the question, why aren't others seeing 
this, both in the RISC-V world and more generally.  I'm not aware of 
any case where I've run the testsuite and tripped over this issue, nor 
am I aware of anyone else tripping over it.


Actually I did run into it. See commit ecfa870ff29d979bd2c ("RISC-V: 
optim const DF +0.0 store to mem [PR/110748]") where a false failure was 
triggered due to these random LTO strings and needed adjusting.


-/* { dg-final { scan-assembler-not "sw" } } */
-/* { dg-final { scan-assembler-not "fld" } } */
-/* { dg-final { scan-assembler-not "fsd" } } */
-/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "\tsw\t" } } */
+/* { dg-final { scan-assembler-not "\tfld\t" } } */
+/* { dg-final { scan-assembler-not "\tfsd\t" } } */
+/* { dg-final { scan-assembler-not "\tlw\t" } } */



Re: [PATCH v3 1/2] c++: Initial support for P0847R7 (Deducing This) [PR102609]

2023-09-27 Thread Hans-Peter Nilsson
> Date: Tue, 26 Sep 2023 01:56:55 +
> From: waffl3x 

> Signed-off-by: waffl3x 

I think I've read that you have to put your actual name in
the DCO; using an alias (presumably) as above would be
wrong.

Ah, it's on https://gcc.gnu.org/dco.html - the *second* DCO
link; under "Signed-off-by", on
https://gcc.gnu.org/contribute.html! "sorry, no pseudonyms
or anonymous contributions".

(Also, from Some Source I Don't Remember: using an alias if
you have FSF papers in place is ok; you can use a pseudonym
if FSF can match it to papers on file that have your actual
name or something to that effect.)

brgds, H-P


Re: [PATCH 1/2] match.pd: Support combine cond_len_op + vec_cond similar to cond_op

2023-09-27 Thread Jeff Law




On 9/20/23 07:09, Lehua Ding wrote:

This patch adds combine cond_len_op and vec_cond to cond_len_op like
cond_op.

gcc/ChangeLog:

* gimple-match.h (gimple_match_op::gimple_match_op):
Add interfaces for more arguments.
(gimple_match_op::set_op): Add interfaces for more arguments.
* match.pd: Add support of combining cond_len_op + vec_cond

OK
jeff


Re: [PATCH 2/2] RISC-V: Add assert of the number of vmerge in autovec cond testcases

2023-09-27 Thread Jeff Law




On 9/20/23 07:09, Lehua Ding wrote:

This patch makes cond autovec testcase checks more restrict.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/cond/cond_arith-1.c:
Assert of the number of vmerge.
* gcc.target/riscv/rvv/autovec/cond/cond_arith-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_arith-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_arith-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_arith-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_arith-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_arith-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_arith-8.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv32-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv64-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2float-rv64-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-rv32-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-rv32-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-rv64-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_float2int-rv64-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv32-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv32-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv64-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2float-rv64-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-rv32-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-rv32-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-rv64-1.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_convert_int2int-rv64-2.c:
Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_copysign-rv32gcv.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_copysign-rv64gcv.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fadd-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fadd-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fadd-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fadd-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma-8.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fma_fnma_run-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmax-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmin-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fms_fnms-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fms_fnms-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fms_fnms-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fms_fnms-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fms_fnms-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fms_fnms-6.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmul-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmul-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmul-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_fmul-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_logical_min_max-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_logical_min_max-2.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_logical_min_max-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_logical_min_max-4.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_logical_min_max-5.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_shift-1.c: Ditto.
* gcc.target/riscv/rvv/autovec/cond/cond_shift-2.c: Ditto.
* 

Re: committed [RISC-V]: Harden test scan patterns

2023-09-27 Thread Andrew Pinski
On Wed, Sep 27, 2023 at 1:14 PM Jeff Law  wrote:
>
>
>
> On 9/27/23 12:22, Joern Rennecke wrote:
> > On Wed, 27 Sept 2023 at 18:22, Jeff Law  wrote:
> >
> >> It would help to describe how these patterns were under specified so
> >> that folks don't continue to make the same mistake as new tests get added.
> >
> > dg-final scan-assembler, scan-assembler-not, and scan-assembler-times
> > use a tcl regular expression (often referred to abbreviated as RE), as
> > described in https://www.tcl.tk/man/tcl8.4/TclCmd/re_syntax.html .
> >
> > If your RE is not specific enough, it can match LTO information that the
> > compiler places into its assembly output when the relevant options are
> > provided, which is common when running tests where the test harness
> > iterates over a number of optimization option combinations.
> > Note that '.' is an atom that can match any character.  If you want to
> > match a dot specifically, you have to escape it with a backslash: '\.' .
> > When you are matching an instruction mnemonic, an effective way to
> > avoid matching in LTO information is to enforce matching of word start
> > (\m) and/or word end (\M) .
> > Note also that the backslash has to be quoted.  If the RE is enclosed in
> > '"' quotes, extra backslashes are needed.  That is not necessary when it
> > is enclosed in curly braces.
> >
> > For example, "ld.w" will be matched in:
> >
> > .ascii  "h\227\022\212ld@w\251jr\254'\320\255vwj\252\026\016\364"
> >
> > If you write {\mld\.w\M} instead, you avoid this problem.
> OK.  So that naturally leads to the question, why aren't others seeing
> this, both in the RISC-V world and more generally.  I'm not aware of any
> case where I've run the testsuite and tripped over this issue, nor am I
> aware of anyone else tripping over it.

I can answer the more generally part. Most other testcases if not all
scan-assembler tries to add spaces that allow not to catch things out
of place. Or even file/directory names.
The documentation at
https://gcc.gnu.org/onlinedocs/gccint/Final-Actions.html (and/or
https://gcc.gnu.org/wiki/HowToPrepareATestcase) definitely could be
expanded to make a mention of these gotchas really.

Thanks,
Andrew

>
> Jeff


Re: [Committed] RISC-V: Fix mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread Jeff Law




On 9/26/23 17:08, 钟居哲 wrote:

Hi, Jeff.

I removed mem-to-mem patterns as you suggested that means we don't have 
scalar move optimization for small size vector modes.

Is it ok for trunk?
Since it is a bug fix patch, I hope we can land it soon. We may will 
find another way to optimize small size vector mode mem-to-mem.

It's OK with me.

jeff


Re: [PATCH] RISC-V/testsuite: Fix ILP32 RVV failures from missing

2023-09-27 Thread Jeff Law




On 9/27/23 11:32, Palmer Dabbelt wrote:



IMO this is one of those places where we should just be as normal as 
possible.  So if the other big ports allow system headers then we 
should, otherwise we should move everyone over to testing in some way 
we'll catch these before commit.
Exactly.  I think the dance we've been doing with stdint-gcc.h is a bit 
silly, but I haven't pushed on it at all.


No other port does anything similar.  When they need stdint.h, they 
include it.  It does mean you have to have the appropriate headers 
installed for each multilib configuration, but that's the way every 
other port handles this problem.  There's no good reason I'm aware of 
for RISC-V to be different.


jeff


Re: RFA: [RISC-V] Replace riscv_vector with riscv_v in target selector clauses. (Followup-patch for RISCV test infrastructure for d / v / zfh extensions)

2023-09-27 Thread Jeff Law




On 9/27/23 11:48, Joern Rennecke wrote:

Regression tested for:
 riscv-sim
 
riscv-sim/-march=rv32gcv_zfh/-mabi=ilp32d/-ftree-vectorize/--param=riscv-autovec-preference=scalable
 riscv-sim/-march=rv32imac/-mabi=ilp32
 
riscv-sim/-march=rv64gcv_zfh_zvfh_zba_zbb_zbc_zicond_zicboz_zawrs/-mabi=lp64d/-ftree-vectorize/--param=riscv-autovec-preference=scalable
 riscv-sim/-march=rv64imac/-mabi=lp64


riscv_v-ector-.txt

Replace riscv_vector with riscv_v in target selector clauses.

# after deleting check_effective_target_riscv_vector:
$ grep -rl '[^"<]riscv_vector\>[^.]' > file-list
# edit file-list to remove ChangeLog and *.orig
$ cat edcmds
g/riscv_vector[^.]/s/\([^"<]riscv_v\)ector\>\([^.]\)/\1\2/g
w
q
$ sed 's/.*/ed & < edcmds/' < file-list > tmp
$ source tmp
$ git checkout gcc.target/riscv/predef-19.c
$ git checkout gcc.target/riscv/predef-18.c gcc.target/riscv/predef-20.c

2023-09-27  Joern Rennecke

gcc/testsuite/
 * lib/target-supports.exp (check_effective_target_riscv_vector): 
Delete.  Changed all users to use *riscv_v instead.


OK after wrapping the ChangeLog entry appropriately.

jeff


Re: RISC-V: Added support for CRC.

2023-09-27 Thread Jeff Law




On 9/26/23 12:56, Joern Rennecke wrote:




What ultimately pushed us to keep moving forward on this effort was
discovering numerous CRC loop implementations out in the wild, including
4 implementations (IIRC) in the kernel itself.


I have always assumed that such must exist (CRCs are useful for a number
of problems, and besides, they wouldn't have been included in coremark as
a part of the workload if they were not relevant), but it is good to have
confirmation, and even better to have code that can detect and analyse a
entire class of idioms that is in such widespread use.
I was personally surprised at how many we found.   While there were a 
bunch of table generation routines which obviously aren't at all 
interesting, there were enough in the cases we analyzed that it 
convinced me this wasn't just catering to a benchmark.




This still leaves room for further improvements, like detecting fully-unrolled
code, table lookup, or slice-by-N, and replacing them with better
target-optimized code where this is indicated by the optimization flags to
save execution time or code/rodata size.  Not something we have to tackle
now, but just because we don't do it now, doesn't mean we couldn't address
these in the future if that appears worthwhile.
Absolutely.  In fact, I have encouraged Mariam to capture some of the 
cases we can't currently handle in the testsuite, essentially building a 
bit of a TODO list.






I can easily see creating a clmul RTL opcode for targets which support
it and hoisting the clmul vs lookup table selection into generic code.
I'm still pondering if we're likely to ever see cases where we want a
vector clmul intrinsic or support in the autovectorizer for clmul.
We've certainly got targets with vector clmul in the ISA, the question
is using it.


If we aim for optimal code, I think it more likely that we want to detect a
block CRC computation, and have a target expander decide to do that
with inline code or a library call that uses vectorized clmul.  At the time
we add such block-CRC expansion code, it also makes sense to add a
builtin for block CRC so that new GNU C programs can directly request
that functionality without having to go through the cargo cult of matching
a supported idiom.
And I think we've got a structure which largely allows further 
improvements, both in the recognition/validation step and in the code 
expansion step.




Now, the library might be written in GNU C, and for that it might be useful
to have a vector clmul intrinsic so that we can express this algorithm more
easily.
Agreed.  It's also worth noting that LLVM has a clmul in their IL and I 
suspect they expose it via a builtin/intrinsic.  I'd expect we'll 
ultimately end up in the same boat.





Probably the biggest task in that space right now is to see if we can
avoid the symbolic execution engine by re-using ranger.


I'll be interested to see what you'll come up with, but if reverting to the
symbolic execution engine, the compile time cost isn't much if you only
use it for a proper match.  So whatever heuristics are used before deciding
to use the engine matter.  Can all the cases detected by the engine be
recognized as a loop with a reduction?  We might use different heuristics
for different optimization levels, i.e. allow more false negatives at -O1,
and more false positives at -O2 / -fexpensive-optimizations.
It's mostly a desire not to add (yet another) symbolic execution engine 
to GCC.  We've already got the engine for CCP as well as symbolic 
execution capabilities in Ranger.  I'd like to avoid adding another if 
we can do so.


For a LFSR validation step we need to track 4 potential states for each 
bit in an object.  0, 1, x, !x where "x" is the state of the bit from a 
different object.  If it was just tracking 0, 1, x, !x for an entire 
object, Ranger is probably already do that.  But we need to do it for 
each bit within an object.


We haven't seen compile-time cost be a real issue.  But we also haven't 
looked too closely at that problem.




While I concur that we want existing code to be able to take advantage of
this optimization by gcc recognizing whatever method the code uses to
compute CRC (within reason, we are still bound by the laws of
computability and resource constraints for the compilation), I would
like to stress that I don't think the builtin will lose its value over time.
It can be used in tests to make sure we exercise the code generation for the
internal function.  It can be used in new GNU C code to make it easier to
do a CRC computation without worrying about the implementation.  If
an accord with other major compiler suppliers (e.g. the LLVM community)
is reached, it might even see more widespread use.
Which somewhat dovetails with Alex's position -- namely that it's not 
that value.  Hence the de-emphasis on this approach.  We'll continue to 
focus on the end-to-end solution.


We may still want a clmul as an RTL opcode and builtins to utilize it.




Which is 

Re: RISC-V sign extension query

2023-09-27 Thread Jeff Law




On 9/27/23 00:29, Vineet Gupta wrote:

Hi Jeff,




We touched upon this in our airport "rendezvous". I'm curious if you 
have the wip bits lying around - (a) to get a feel for how this could be 
done and (b) to see why REE and/or similar construct in CSE don't work 
as expected.

Not in any usable form.  Just several variants that didn't work ;-)

They don't work with REE because I'd forgotten a key point.  REE doesn't 
look for lexical redundancies like you'd see with CSE/PRE.  ie, given a 
pair of identical sign extensions in the IL, REE doesn't eliminate one.


Instead REE is focused on cases where we can transform an existing 
operation such as a load into an extending load to eliminate a 
subsequent extension.


This leads to a couple thoughts.

First, we ought to be able to use the same concept, but instead of 
putting something like this into the IL to express the extension done by 
the caller


(set (reg:DI a0) (sign_extend:DI (reg:SI a0)))

Instead we probably want to insert this as a dummy into the IL

(set (reg:SI a0) (mem:SI (sp))

If this is followed by a sign extension, then it'll get turned into

(set (reg:DI a0) (sign_extend:DI (mem:SI (sp)))

And the subsequent extension will get removed.  And since we've tracked 
the dummy, we can just eliminate the dummy as well.  I'm a bit worried 
about how this plays with the copy_needed bits in REE.


This should at least tell us how often there's an extension of an 
incoming argument that can be trivially eliminated.  I'm not sure it's 
the best place to eliminate the extensions though.  Leading to




We should make sure that CSE/PRE are properly identifying and 
eliminating lexical redundancies.   I wouldn't be surprised if the class 
of problems we're chasing are better detected and optimized by CSE/PRE 
since those can work on an extended block or global basis respectively.


For CSE/PRE we'd want to insert something like

(set (reg:DI a0) (sign_extend:DI (reg:SI a0)))

Into the IL which should make any expressions like

(sign_extend:DI (reg:SI a0))

fully redundant in the IL, thus allowing CSE/PRE to eliminate them.

I've got a few things backed up from before the Cauldron, but expect to 
be able to poke at this some this week.


jeff


Re: committed [RISC-V]: Harden test scan patterns

2023-09-27 Thread Jeff Law




On 9/27/23 12:22, Joern Rennecke wrote:

On Wed, 27 Sept 2023 at 18:22, Jeff Law  wrote:


It would help to describe how these patterns were under specified so
that folks don't continue to make the same mistake as new tests get added.


dg-final scan-assembler, scan-assembler-not, and scan-assembler-times
use a tcl regular expression (often referred to abbreviated as RE), as
described in https://www.tcl.tk/man/tcl8.4/TclCmd/re_syntax.html .

If your RE is not specific enough, it can match LTO information that the
compiler places into its assembly output when the relevant options are
provided, which is common when running tests where the test harness
iterates over a number of optimization option combinations.
Note that '.' is an atom that can match any character.  If you want to
match a dot specifically, you have to escape it with a backslash: '\.' .
When you are matching an instruction mnemonic, an effective way to
avoid matching in LTO information is to enforce matching of word start
(\m) and/or word end (\M) .
Note also that the backslash has to be quoted.  If the RE is enclosed in
'"' quotes, extra backslashes are needed.  That is not necessary when it
is enclosed in curly braces.

For example, "ld.w" will be matched in:

.ascii  "h\227\022\212ld@w\251jr\254'\320\255vwj\252\026\016\364"

If you write {\mld\.w\M} instead, you avoid this problem.
OK.  So that naturally leads to the question, why aren't others seeing 
this, both in the RISC-V world and more generally.  I'm not aware of any 
case where I've run the testsuite and tripped over this issue, nor am I 
aware of anyone else tripping over it.


Jeff


Re: [committed] libstdc++: Add GDB printers for types

2023-09-27 Thread Jonathan Wakely
On Wed, 27 Sept 2023, 18:25 Tom Tromey via Libstdc++, 
wrote:

> >> I have fixes for most of the issues that are worth fixing (I didn't
> >> bother with line lengths -- FWIW in gdb we just run 'black' and don't
> >> worry about these details),
>
> Jonathan> I used autopep8 and committed the result as
> Jonathan> e08559271b2d797f658579ac8610dbf5e58bcfd8 so the line lengths
> Jonathan> should be OK now.
>
> Yeah, my patches are on top of that, but flake8 still complains, and I
> still see lines > 79 characters.  However maybe flake8 isn't the checker
> you want to use, or maybe you have something set up for a different line
> length?
>

I don't think I have anything set up for python formatting at all, I just
committed whatever autopep8 did with its default settings.

If that's suboptimal, we can consider other tools, if they're reliable and
easy to run.


> Jonathan> So the fix is to just change the string to '{} {}' which I've
> pushed
> Jonathan> as 1fab05a885a308c19cf42b72fd36805ddf27fdc8 now (also attached).
>
> Thank you.
>
> Tom
>


Re: [PATCH v2] ARM: Block predication on atomics [PR111235]

2023-09-27 Thread Wilco Dijkstra
Hi Ramana,

> Hope this helps.

Yes definitely!

>> Passes regress/bootstrap, OK for commit?
>
> Target ? armhf ? --with-arch , -with-fpu , -with-float parameters ?
> Please be specific.

I used --target=arm-none-linux-gnueabihf --host=arm-none-linux-gnueabihf
--build=arm-none-linux-gnueabihf --with-float=hard. However it seems that the
default armhf settings are incorrect. I shouldn't need the --with-float=hard 
since
that is obviously implied by armhf, and they should also imply armv7-a with 
vfpv3
according to documentation. It seems to get confused and skip some tests. I 
tried
using --with-fpu=auto, but that doesn't work at all, so in the end I forced it 
like:
--with-arch=armv8-a --with-fpu=neon-fp-armv8. With this it runs a few more 
tests.

> Since these patterns touch armv8m.baseline can you find all the
> testcases in the testsuite and ensure no change in code for
> armv8m.baseline as that's unpredicated already and this patch brings
> this in line with the same ? Does the testsuite already cover these
> arch variants and are you satisfied that the tests in the testsuite
> can catch / don't make any additional code changes to the other
> architectures affected by this ?

There are various v8-m(.base/.main) tests and they all pass. The generated
code is generally unchanged if there was no conditional execution. I made
the new UNSPEC_LDR/STR patterns support offsets so there is no difference
in generated code for relaxed loads/stores (since they used to use a plain
load/store which has an immediate offset).

>> * onfig/arm/sync.md (arm_atomic_load): Add new pattern.
>
> Nit: s/onfig/config

Fixed.

>> (atomic_load): Always expand atomic loads explicitly.
>> (atomic_store): Always expand atomic stores explicitly.
>
> Nit: Change message to :
> Switch patterns to define_expand.

Fixed.

> Largely looks ok though I cannot work out tonight if we need more v8-a
> or v8m-baseline specific tests for scan-assembler patterns.
>
> Clearly our testsuite doesn't catch it , so perhaps the OP could help
> validate this patch with their formal models to see if this fixes
> these set of issues and creates no new regressions ? Is that feasible
> to do ?

Disabling conditional execution avoids the issue. It's trivial to verify that
atomics can no longer be conditionally executed (no "%?"). When this is
committed, we can run the random testing again to confirm the issue
is no longer present.

> -(define_insn "atomic_load"
> -  [(set (match_operand:QHSI 0 "register_operand" "=r,r,l")
> +(define_insn "arm_atomic_load"
> +  [(set (match_operand:QHSI 0 "register_operand" "=r,l")
>  (unspec_volatile:QHSI
> -  [(match_operand:QHSI 1 "arm_sync_memory_operand" "Q,Q,Q")
> -   (match_operand:SI 2 "const_int_operand" "n,Pf,n")]  ;; model
> +  [(match_operand:QHSI 1 "memory_operand" "m,m")]
>
> Remind me again why is it safe to go from the Q constraint to the m
> constraint here and everywhere else you've done this ?

That's because the relaxed loads/stores use LDR/STR wrapped in an
UNSPEC. To avoid regressions we have to use 'm' so that an immediate
offset can be merged into the memory access.

>> -  VUNSPEC_LDA  ; Represent a store-register-acquire.
>> +  VUNSPEC_LDR  ; Represent a load-register-relaxed.
>> +  VUNSPEC_LDA  ; Represent a load-register-acquire.
>
> Nit: LDA before LDR ? Though I suspect this list can be alphabetically
> ordered at another point of time.

Swapped.

> There are new tests added for v7-a , what happens with the output for
> v8-a and the changes for ldacq and other such instructions ?

v7-a and v8-a generate the same instructions for relaxed load/store.
The acquire/release versions are identical except they are no longer
predicated. Basically the new patterns are not only significantly simpler,
they are now the same between the many ARM/Thumb-2/v7-a/v8-m/v8-a
combinations, so test coverage is much higher now. This is how these
patterns should have been designed all along.

v2 follows below.

Cheers,
Wilco


[PATCH v2] ARM: Block predication on atomics [PR111235]

The v7 memory ordering model allows reordering of conditional atomic
instructions.  To avoid this, make all atomic patterns unconditional.
Expand atomic loads and stores for all architectures so the memory access
can be wrapped into an UNSPEC.

gcc/ChangeLog/
PR target/111235
* config/arm/constraints.md: Remove Pf constraint.
* config/arm/sync.md (arm_atomic_load): Add new pattern.
(arm_atomic_load_acquire): Likewise.
(arm_atomic_store): Likewise.
(arm_atomic_store_release): Likewise.
(atomic_load): Switch patterns to define_expand.
(atomic_store): Likewise.
(arm_atomic_loaddi2_ldrd): Remove predication.
(arm_load_exclusive): Likewise.
(arm_load_acquire_exclusive): Likewise.
(arm_load_exclusivesi): Likewise.
(arm_load_acquire_exclusivesi): Likewise.

Re: committed [RISC-V]: Harden test scan patterns

2023-09-27 Thread Joern Rennecke
On Wed, 27 Sept 2023 at 18:22, Jeff Law  wrote:

> It would help to describe how these patterns were under specified so
> that folks don't continue to make the same mistake as new tests get added.

dg-final scan-assembler, scan-assembler-not, and scan-assembler-times
use a tcl regular expression (often referred to abbreviated as RE), as
described in https://www.tcl.tk/man/tcl8.4/TclCmd/re_syntax.html .

If your RE is not specific enough, it can match LTO information that the
compiler places into its assembly output when the relevant options are
provided, which is common when running tests where the test harness
iterates over a number of optimization option combinations.
Note that '.' is an atom that can match any character.  If you want to
match a dot specifically, you have to escape it with a backslash: '\.' .
When you are matching an instruction mnemonic, an effective way to
avoid matching in LTO information is to enforce matching of word start
(\m) and/or word end (\M) .
Note also that the backslash has to be quoted.  If the RE is enclosed in
'"' quotes, extra backslashes are needed.  That is not necessary when it
is enclosed in curly braces.

For example, "ld.w" will be matched in:

.ascii  "h\227\022\212ld@w\251jr\254'\320\255vwj\252\026\016\364"

If you write {\mld\.w\M} instead, you avoid this problem.

#

Where should this go?  Maybe somewhere in or linked from
https://gcc.gnu.org/codingconventions.html , Testsuite conventions?


Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread Toon Moene

On 9/27/23 19:31, Jeff Law wrote:



On 9/27/23 04:14, juzhe.zh...@rivai.ai wrote:

Since after removing mem-to-mem pattern.

program main
   integer, dimension(:,:), allocatable :: a, b
   integer, dimension(:), allocatable :: sh
   allocate (a(2,2))
   allocate (b(2,2))
   allocate (sh(3))
   a = 1
   b = cshift(a,sh)
end program main

This case will fail if we don't change the mov pattern.
Can you expand on this?  You didn't indicate the failure mode or any 
analysis behind the failure.


jeff


Note that this Fortran code has no defined behavior, because the sh 
array isn't given any values ...


Kind regards,

--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290
Saturnushof 14, 3738 XG  Maartensdijk, The Netherlands



Re: [PATCH v4] i386: Allow -mlarge-data-threshold with -mcmodel=large

2023-09-27 Thread Fangrui Song
On Wed, Sep 13, 2023 at 11:19 AM Fangrui Song  wrote:
>
> On Tue, Aug 22, 2023 at 12:19 AM Fangrui Song  wrote:
> >
> > On Tue, Aug 1, 2023 at 12:51 PM Fangrui Song  wrote:
> > >
> > > When using -mcmodel=medium, large data objects larger than the
> > > -mlarge-data-threshold threshold are placed into large data sections
> > > (.lrodata, .ldata, .lbss and some variants).  GNU ld and ld.lld 17 place
> > > .l* sections into separate output sections.  If small and medium code
> > > model object files are mixed, the .l* sections won't exert relocation
> > > overflow pressure on sections in object files built with -mcmodel=small.
> > >
> > > However, when using -mcmodel=large, -mlarge-data-threshold doesn't
> > > apply.  This means that the .rodata/.data/.bss sections may exert
> > > relocation overflow pressure on sections in -mcmodel=small object files.
> > >
> > > This patch allows -mcmodel=large to generate .l* sections and drops an
> > > unneeded documentation restriction that the value must be the same.
> > >
> > > Link: https://groups.google.com/g/x86-64-abi/c/jnQdJeabxiU
> > > ("Large data sections for the large code model")
> > >
> > > Signed-off-by: Fangrui Song 
> > >
> > > ---
> > > Changes from v1 
> > > (https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616947.html):
> > > * Clarify commit message. Add link to 
> > > https://groups.google.com/g/x86-64-abi/c/jnQdJeabxiU
> > >
> > > Changes from v2
> > > * Drop an unneeded limitation in the documentation.
> > >
> > > Changes from v3
> > > * Change scan-assembler directives to use \. to match literal .
> > > ---
> > >  gcc/config/i386/i386.cc| 15 +--
> > >  gcc/config/i386/i386.opt   |  2 +-
> > >  gcc/doc/invoke.texi|  6 +++---
> > >  gcc/testsuite/gcc.target/i386/large-data.c | 13 +
> > >  4 files changed, 26 insertions(+), 10 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/large-data.c
> > >
> > > [...]
> >
> > Ping:)
>
> Ping:) https://gcc.gnu.org/pipermail/gcc-patches/2023-August/625993.html
>
> (I don't have write access to gcc.)
>
>
> --
> 宋方睿

Ping? :) https://gcc.gnu.org/pipermail/gcc-patches/2023-August/625993.html

(I don't have write access to gcc.)


-- 
宋方睿


Re: [PATCH] RISC-V/testsuite: Fix ILP32 RVV failures from missing

2023-09-27 Thread Palmer Dabbelt

On Wed, 27 Sep 2023 10:28:55 PDT (-0700), jeffreya...@gmail.com wrote:



On 9/25/23 15:17, Maciej W. Rozycki wrote:

On Mon, 25 Sep 2023, Maciej W. Rozycki wrote:


  NB the use of this specific  header, still in place elsewhere,
seems gratuitous to me.  We don't need or indeed want to print anything in
the test cases (unless verifying something specific to the print facility)
and if we want to avoid minor code duplication (i.e. not to have explicit:

   if (...)
 __builtin_abort ();

replicated across test cases), we can easily implement this via a local
header, there's no need to pull in a complex system facility.


  Overall we ought not to require any system headers in compile tests and
then link and run tests need a functional target environment anyway.  So
maybe the use of  in run tests isn't as bad after all if not for
the -DNDEBUG peculiarity.  However I still think the less we depend in
verification on external components the better, that's one variable to
exclude.

Certainly we don't want extraneous #includes.   We can often avoid them
with a few judicious prototypes, like for abort ().

But we also need to get to the point where we can run tests which have
#include directives that reference system headers.  Many tests in the
various GCC testsuites have those directives and we don't want to be
continually trying to eradicate #includes from those tests.

The standard way to deal with this is single tree builds which are
deprecated or to have an install tree with the suitable multilib headers
and libraries.  The latter seems like the only viable solution to me.


IMO this is one of those places where we should just be as normal as 
possible.  So if the other big ports allow system headers then we 
should, otherwise we should move everyone over to testing in some way 
we'll catch these before commit.




jeff


Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread Jeff Law




On 9/27/23 04:14, juzhe.zh...@rivai.ai wrote:

Since after removing mem-to-mem pattern.

program main
   integer, dimension(:,:), allocatable :: a, b
   integer, dimension(:), allocatable :: sh
   allocate (a(2,2))
   allocate (b(2,2))
   allocate (sh(3))
   a = 1
   b = cshift(a,sh)
end program main

This case will fail if we don't change the mov pattern.
Can you expand on this?  You didn't indicate the failure mode or any 
analysis behind the failure.


jeff


Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread Jeff Law




On 9/27/23 03:38, juzhe.zh...@rivai.ai wrote:

 >> Why add `can_create_pseudo_p ()` here? this will split after reload,

but we forbid that pattern between reload and split2?


I have no idea. Some Fortran tests just need recognition of the
mem-to-mem pattern before RA

I don't know the reason.

But isn't that the key to understanding what's going on here?

There is nothing special about Fortran here.  Whatever problem this is 
working around will almost certainly show up again in other, 
non-Fortran, contexts.


There aren't enough details in here to really evaluate what's going on.

jeff


Re: [PATCH] RISC-V/testsuite: Fix ILP32 RVV failures from missing

2023-09-27 Thread Jeff Law




On 9/25/23 15:17, Maciej W. Rozycki wrote:

On Mon, 25 Sep 2023, Maciej W. Rozycki wrote:


  NB the use of this specific  header, still in place elsewhere,
seems gratuitous to me.  We don't need or indeed want to print anything in
the test cases (unless verifying something specific to the print facility)
and if we want to avoid minor code duplication (i.e. not to have explicit:

   if (...)
 __builtin_abort ();

replicated across test cases), we can easily implement this via a local
header, there's no need to pull in a complex system facility.


  Overall we ought not to require any system headers in compile tests and
then link and run tests need a functional target environment anyway.  So
maybe the use of  in run tests isn't as bad after all if not for
the -DNDEBUG peculiarity.  However I still think the less we depend in
verification on external components the better, that's one variable to
exclude.
Certainly we don't want extraneous #includes.   We can often avoid them 
with a few judicious prototypes, like for abort ().


But we also need to get to the point where we can run tests which have 
#include directives that reference system headers.  Many tests in the 
various GCC testsuites have those directives and we don't want to be 
continually trying to eradicate #includes from those tests.


The standard way to deal with this is single tree builds which are 
deprecated or to have an install tree with the suitable multilib headers 
and libraries.  The latter seems like the only viable solution to me.


jeff


Re: [committed] libstdc++: Add GDB printers for types

2023-09-27 Thread Tom Tromey
>> I have fixes for most of the issues that are worth fixing (I didn't
>> bother with line lengths -- FWIW in gdb we just run 'black' and don't
>> worry about these details),

Jonathan> I used autopep8 and committed the result as
Jonathan> e08559271b2d797f658579ac8610dbf5e58bcfd8 so the line lengths
Jonathan> should be OK now.

Yeah, my patches are on top of that, but flake8 still complains, and I
still see lines > 79 characters.  However maybe flake8 isn't the checker
you want to use, or maybe you have something set up for a different line
length?

Jonathan> So the fix is to just change the string to '{} {}' which I've pushed
Jonathan> as 1fab05a885a308c19cf42b72fd36805ddf27fdc8 now (also attached).

Thank you.

Tom


Re: committed [RISC-V]: Harden test scan patterns

2023-09-27 Thread Jeff Law




On 9/27/23 03:26, Joern Rennecke wrote:

I got tired of scan tests failing when they have an underspecified
pattern that matches LTO information, so I did a global replace for
the most common form of such scan patterns in the gcc.target/riscv
testsuite.

regression tested for:
 riscv-sim
 
riscv-sim/-march=rv32gcv_zfh/-mabi=ilp32d/-ftree-vectorize/--param=riscv-autovec-preference=scalable
 riscv-sim/-march=rv32imac/-mabi=ilp32
 
riscv-sim/-march=rv64gcv_zfh_zvfh_zba_zbb_zbc_zicond_zicboz_zawrs/-mabi=lp64d/-ftree-vectorize/--param=riscv-autovec-preferenc
e=scalable
 riscv-sim/-march=rv64imac/-mabi=lp64

Committed as obvious.
It would help to describe how these patterns were under specified so 
that folks don't continue to make the same mistake as new tests get added.


Jeff


Re: [committed] libstdc++: Add GDB printers for types

2023-09-27 Thread Jonathan Wakely
On Wed, 27 Sept 2023 at 16:37, Tom Tromey  wrote:
>
> > Jonathan Wakely via Gcc-patches  writes:
>
> Replying to a quite old email...
>
> I ran a Python linter on the libstdc++ pretty-printers.
>
> I have fixes for most of the issues that are worth fixing (I didn't
> bother with line lengths -- FWIW in gdb we just run 'black' and don't
> worry about these details),

I used autopep8 and committed the result as
e08559271b2d797f658579ac8610dbf5e58bcfd8 so the line lengths should be
OK now.

> but the patch I'm replying to had a problem
> that I didn't know how to fix:
>
> > +class StdChronoTimeZoneRulePrinter:
> [...]
> > +if kind == 0: # DayOfMonth
> > +start = '{} {}{}'.format(month, ordinal_day)
>
> flake8 points out that this call to format has three placeholders but
> only two arguments.

Oops, I think it was originally written like this:

'{} {}{}'.format(month, day, suffixes.get(day, 'th'))

but then I refactored it to:

ordinal_day = '{}{}'.format(day, suffixes.get(day, 'th'))
if kind == 0:  # DayOfMonth
start = '{} {}{}'.format(month, ordinal_day)

So the fix is to just change the string to '{} {}' which I've pushed
as 1fab05a885a308c19cf42b72fd36805ddf27fdc8 now (also attached).

These printers are for implementation details internal to the library,
which are never exposed to users. I added them because they made it
much easier to debug the implementation when stepping through library
functions, but that means there are no tests for them.

Thanks for finding this!
commit 1fab05a885a308c19cf42b72fd36805ddf27fdc8
Author: Jonathan Wakely 
Date:   Wed Sep 27 17:03:51 2023

libstdc++: Fix format string in StdChronoTimeZoneRulePrinter

libstdc++-v3/ChangeLog:

* python/libstdcxx/v6/printers.py (StdChronoTimeZoneRulePrinter):
Fix incorrect number of replacement fields.

diff --git a/libstdc++-v3/python/libstdcxx/v6/printers.py 
b/libstdc++-v3/python/libstdcxx/v6/printers.py
index c0056de2565..d60c8003a63 100644
--- a/libstdc++-v3/python/libstdcxx/v6/printers.py
+++ b/libstdc++-v3/python/libstdcxx/v6/printers.py
@@ -2215,7 +2215,7 @@ class StdChronoTimeZoneRulePrinter:
 day = on['day_of_month']
 ordinal_day = '{}{}'.format(day, suffixes.get(day, 'th'))
 if kind == 0:  # DayOfMonth
-start = '{} {}{}'.format(month, ordinal_day)
+start = '{} {}'.format(month, ordinal_day)
 else:
 weekday = weekdays[on['day_of_week']]
 if kind == 1:  # LastWeekDay


Re: [committed] libstdc++: Add GDB printers for types

2023-09-27 Thread Tom Tromey
> Jonathan Wakely via Gcc-patches  writes:

Replying to a quite old email...

I ran a Python linter on the libstdc++ pretty-printers.

I have fixes for most of the issues that are worth fixing (I didn't
bother with line lengths -- FWIW in gdb we just run 'black' and don't
worry about these details), but the patch I'm replying to had a problem
that I didn't know how to fix:

> +class StdChronoTimeZoneRulePrinter:
[...]
> +if kind == 0: # DayOfMonth
> +start = '{} {}{}'.format(month, ordinal_day)

flake8 points out that this call to format has three placeholders but
only two arguments.

Tom


Re: [PATCH 01/12] [contrib] validate_failures.py: Avoid testsuite aliasing

2023-09-27 Thread Maxim Kuvyrkov
Hi Bernhard,

Thanks, I meant to fix this, but forgot.

The underlying problem here is that we want to detect which sub-testsuites had 
failures.  Current regex doesn't match go's case because there is no "..." at 
the end: "Running foo" vs "Running foo ..." .

My preferred way of fixing this is to make go's testsuite print out "..." .  We 
have a similar patch for glibc [1].

[1] https://sourceware.org/pipermail/libc-alpha/2023-June/148702.html

--
Maxim Kuvyrkov
https://www.linaro.org

> On Sep 26, 2023, at 19:46, Bernhard Reutner-Fischer  
> wrote:
> 
> Hi Maxim!
> 
> On Mon, 5 Jun 2023 18:06:25 +0400
> Maxim Kuvyrkov via Gcc-patches  wrote:
> 
>>> On Jun 3, 2023, at 19:17, Jeff Law  wrote:
>>> 
>>> On 6/2/23 09:20, Maxim Kuvyrkov via Gcc-patches wrote:  
 This patch adds tracking of current testsuite "tool" and "exp"
 to the processing of .sum files.  This avoids aliasing between
 tests from different testsuites with same name+description.
 E.g., this is necessary for testsuite/c-c++-common, which is ran
 for both gcc and g++ "tools".
 This patch changes manifest format from ...
 
 FAIL: gcc_test
 FAIL: g++_test
 
 ... to ...
 
 === gcc tests ===
 Running gcc/foo.exp ...
 FAIL: gcc_test
 === gcc Summary ==
 === g++ tests ===
 Running g++/bar.exp ...
 FAIL: g++_test
 === g++ Summary ==
 .
 The new format uses same formatting as DejaGnu's .sum files
 to specify which "tool" and "exp" the test belongs to.  
>>> I think the series is fine.  You're not likely to hear from Diego or Doug I 
>>> suspect, I don't think either are involved in GNU stuff anymore.
>>> 
>> 
>> Thanks, Jeff.  I'll wait for a couple of days and will merge if there are no 
>> new comments.
> 
> Maxim, may i ask you to have a look at the following problem, please?
> 
> ISTM that your exp code does not work as expected for go, maybe you
> forgot to test the changes with go enabled?
> 
> Ever since your changes in summer i see the following:
> 
> gcc-14.mine$ 
> /scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py 
> --clean_build ../gcc-14.orig/
> Getting actual results from build directory .
> ./gcc/testsuite/go/go.sum
> ./gcc/testsuite/gcc/gcc.sum
> ./gcc/testsuite/objc/objc.sum
> ./gcc/testsuite/jit/jit.sum
> ./gcc/testsuite/gdc/gdc.sum
> ./gcc/testsuite/gnat/gnat.sum
> ./gcc/testsuite/ada/acats/acats.sum
> ./gcc/testsuite/g++/g++.sum
> ./gcc/testsuite/obj-c++/obj-c++.sum
> ./gcc/testsuite/rust/rust.sum
> ./gcc/testsuite/gfortran/gfortran.sum
> ./x86_64-pc-linux-gnu/libgomp/testsuite/libgomp.sum
> ./x86_64-pc-linux-gnu/libphobos/testsuite/libphobos.sum
> ./x86_64-pc-linux-gnu/libstdc++-v3/testsuite/libstdc++.sum
> ./x86_64-pc-linux-gnu/libffi/testsuite/libffi.sum
> ./x86_64-pc-linux-gnu/libitm/testsuite/libitm.sum
> ./x86_64-pc-linux-gnu/libgo/libgo.sum
> ./x86_64-pc-linux-gnu/libatomic/testsuite/libatomic.sum
> ./gotools/gotools.sum
> .sum file seems to be broken: tool="gotools", exp="None", summary_line="FAIL: 
> TestScript"
> Traceback (most recent call last):
>  File 
> "/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
> line 732, in 
>retval = Main(sys.argv)
>  File 
> "/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
> line 721, in Main
>retval = CompareBuilds()
>  File 
> "/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
> line 622, in CompareBuilds
>actual = GetResults(sum_files)
>  File 
> "/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
> line 466, in GetResults
>build_results.update(ParseSummary(sum_fname))
>  File 
> "/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
> line 405, in ParseSummary
>result = result_set.MakeTestResult(line, ordinal)
>  File 
> "/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
> line 239, in MakeTestResult
>return TestResult(summary_line, ordinal,
>  File 
> "/scratch/src/gcc-14.mine/contrib/testsuite-management/validate_failures.py", 
> line 151, in __init__
>raise
> RuntimeError: No active exception to reraise
> 
> 
> The problem seems to be that gotools.sum does not mention any ".exp"
> files.
> 
> $ grep "Running " gotools/gotools.sum 
> Running cmd/go
> Running runtime
> Running cgo
> Running carchive
> Running cmd/vet
> Running embed
> $ grep -c "\.exp" gotools/gotools.sum 
> 0
> 
> The .sum files looks like this:
> ---8<---
> Test Run By foo on Tue Sep 26 14:46:48 CEST 2023
> Native configuration is x86_64-foo-linux-gnu
> 
>=== gotools tests ===
> 
> Running cmd/go
> UNTESTED: TestAccidentalGitCheckout
> PASS: TestAlwaysLinkSysoFiles
> ...
> UNTESTED: TestParallelTest
> FAIL: TestScript
> ...
> ---8<---
> 
> May i ask you to have a look, please?
> 
> TIA,




[PATCH] ipa-utils: avoid generating uninitialized probabilities on merges.

2023-09-27 Thread Sergei Trofimovich
From: Sergei Trofimovich 

r14-3459-g0c78240fd7d519 "Check that passes do not forget to define profile"
exposed check failures in cases when gcc produces uninitialized profile
probabilities. In case of PR/111559 uninitialized profile is generated
by edges executed 0 times during profile:

__attribute__((noipa)) static void edge(void) {}

int p = 0;

__attribute__((noinline))
static void rule1(void) { if (p) edge(); }

__attribute__((noinline))
static void rule1_same(void) { if (p) edge(); }

__attribute__((noipa)) int main(void) {
rule1();
rule1_same();
}

$ gcc -O2 -fprofile-generate bug.c -o b -fopt-info
$ ./b
$ gcc -O2 -fprofile-use -fprofile-correction bug.c -o b -fopt-info

bug.c: In function 'rule1':
bug.c:6:13: error: probability of edge 3->4 not initialized
6 | static void rule1(void) { if (p) edge(); }
  | ^
during GIMPLE pass: fixup_cfg
bug.c:6:13: internal compiler error: verify_flow_info failed

The change conservatively ignores updates with uninitialized values and
uses initially assigned probabilities (`always` probability in case of
the example).

gcc/
PR/111283
PR/111559
* ipa-utils.cc (ipa_merge_profiles): Avoid producing
  uninitialized probabilities when merging counters with zero
  denominators.
---
 gcc/ipa-utils.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/ipa-utils.cc b/gcc/ipa-utils.cc
index 956c6294fd7..7c53ae9dd45 100644
--- a/gcc/ipa-utils.cc
+++ b/gcc/ipa-utils.cc
@@ -651,13 +651,17 @@ ipa_merge_profiles (struct cgraph_node *dst,
{
  edge srce = EDGE_SUCC (srcbb, i);
  edge dste = EDGE_SUCC (dstbb, i);
- dste->probability = 
+ profile_probability merged =
dste->probability * dstbb->count.ipa ().probability_in
 (dstbb->count.ipa ()
  + srccount.ipa ())
+ srce->probability * srcbb->count.ipa ().probability_in
 (dstbb->count.ipa ()
  + srccount.ipa ());
+ /* We produce uninitialized probabilities when
+denominator is zero: https://gcc.gnu.org/PR111559.  */
+ if (merged.initialized_p ())
+   dste->probability = merged;
}
  dstbb->count = dstbb->count.ipa () + srccount.ipa ();
}
-- 
2.42.0



[PATCH v4] c-family: Implement __has_feature and __has_extension [PR60512]

2023-09-27 Thread Alex Coplan
Hi,

This is a v4 patch to address Jason's feedback here:
https://gcc.gnu.org/pipermail/gcc-patches/2023-September/630911.html

w.r.t. v3 it just removes a comment now that some uncertainty around
cxx_binary_literals has been resolved, and updates the documentation as
suggested to point to the Clang docs.

--

This patch implements clang's __has_feature and __has_extension in GCC.
Currently the patch aims to implement all documented features (and some
undocumented ones) following the documentation at
https://clang.llvm.org/docs/LanguageExtensions.html with the exception
of the legacy features for C++ type traits.  These are omitted, since as
the clang documentation notes, __has_builtin is the correct "modern" way
to query for these (which GCC already implements).

Bootstrapped/regtested on aarch64-linux-gnu, bootstrapped on
x86_64-apple-darwin, darwin regtest in progress.  OK for trunk if
testing passes?

Thanks,
Alex

gcc/c-family/ChangeLog:

PR c++/60512
* c-common.cc (struct hf_feature_info): New.
(c_common_register_feature): New.
(init_has_feature): New.
(has_feature_p): New.
* c-common.h (c_common_has_feature): New.
(c_family_register_lang_features): New.
(c_common_register_feature): New.
(has_feature_p): New.
(c_register_features): New.
(cp_register_features): New.
* c-lex.cc (init_c_lex): Plumb through has_feature callback.
(c_common_has_builtin): Generalize and move common part ...
(c_common_lex_availability_macro): ... here.
(c_common_has_feature): New.
* c-ppoutput.cc (init_pp_output): Plumb through has_feature.

gcc/c/ChangeLog:

PR c++/60512
* c-lang.cc (c_family_register_lang_features): New.
* c-objc-common.cc (struct c_feature_info): New.
(c_register_features): New.

gcc/cp/ChangeLog:

PR c++/60512
* cp-lang.cc (c_family_register_lang_features): New.
* cp-objcp-common.cc (struct cp_feature_selector): New.
(cp_feature_selector::has_feature): New.
(struct cp_feature_info): New.
(cp_register_features): New.

gcc/ChangeLog:

PR c++/60512
* doc/cpp.texi: Document __has_{feature,extension}.

gcc/objc/ChangeLog:

PR c++/60512
* objc-act.cc (struct objc_feature_info): New.
(objc_nonfragile_abi_p): New.
(objc_common_register_features): New.
* objc-act.h (objc_common_register_features): New.
* objc-lang.cc (c_family_register_lang_features): New.

gcc/objcp/ChangeLog:

PR c++/60512
* objcp-lang.cc (c_family_register_lang_features): New.

libcpp/ChangeLog:

PR c++/60512
* include/cpplib.h (struct cpp_callbacks): Add has_feature.
(enum cpp_builtin_type): Add BT_HAS_{FEATURE,EXTENSION}.
* init.cc: Add __has_{feature,extension}.
* macro.cc (_cpp_builtin_macro_text): Handle
BT_HAS_{FEATURE,EXTENSION}.

gcc/testsuite/ChangeLog:

PR c++/60512
* c-c++-common/has-feature-common.c: New test.
* g++.dg/ext/has-feature.C: New test.
* gcc.dg/asan/has-feature-asan.c: New test.
* gcc.dg/has-feature.c: New test.
* gcc.dg/ubsan/has-feature-ubsan.c: New test.
* obj-c++.dg/has-feature.mm: New test.
* objc.dg/has-feature.m: New test.
diff --git a/gcc/c-family/c-common.cc b/gcc/c-family/c-common.cc
index aae57260097..1210953d33a 100644
--- a/gcc/c-family/c-common.cc
+++ b/gcc/c-family/c-common.cc
@@ -311,6 +311,43 @@ const struct fname_var_t fname_vars[] =
   {NULL, 0, 0},
 };
 
+/* Flags to restrict availability of generic features that
+   are known to __has_{feature,extension}.  */
+
+enum
+{
+  HF_FLAG_EXT = 1, /* Available only as an extension.  */
+  HF_FLAG_SANITIZE = 2, /* Availability depends on sanitizer flags.  */
+};
+
+/* Info for generic features which can be queried through
+   __has_{feature,extension}.  */
+
+struct hf_feature_info
+{
+  const char *ident;
+  unsigned flags;
+  unsigned mask;
+};
+
+/* Table of generic features which can be queried through
+   __has_{feature,extension}.  */
+
+static const hf_feature_info has_feature_table[] =
+{
+  { "address_sanitizer",   HF_FLAG_SANITIZE, SANITIZE_ADDRESS },
+  { "thread_sanitizer",HF_FLAG_SANITIZE, SANITIZE_THREAD },
+  { "leak_sanitizer",  HF_FLAG_SANITIZE, SANITIZE_LEAK },
+  { "hwaddress_sanitizer", HF_FLAG_SANITIZE, SANITIZE_HWADDRESS },
+  { "undefined_behavior_sanitizer", HF_FLAG_SANITIZE, SANITIZE_UNDEFINED },
+  { "attribute_deprecated_with_message",  0, 0 },
+  { "attribute_unavailable_with_message", 0, 0 },
+  { "enumerator_attributes", 0, 0 },
+  { "tls", 0, 0 },
+  { "gnu_asm_goto_with_outputs", HF_FLAG_EXT, 0 },
+  { "gnu_asm_goto_with_outputs_full",HF_FLAG_EXT, 0 }
+};
+
 /* Global visibility options.  */
 

Re: [PATCH] libstdc++: Ensure active union member is correctly set

2023-09-27 Thread Jonathan Wakely
On Sat, 23 Sept 2023 at 08:30, Nathaniel Shead via Libstdc++
 wrote:
>
> On Sat, Sep 23, 2023 at 07:40:48AM +0100, Jonathan Wakely wrote:
> > On Sat, 23 Sept 2023, 01:39 Nathaniel Shead via Libstdc++, <
> > libstd...@gcc.gnu.org> wrote:
> >
> > > Now that bootstrap has finished, I have gotten regressions in the
> > > following libstdc++ tests:
> > >
> > > Running libstdc++:libstdc++-dg/conformance.exp ...
> > > FAIL: 20_util/bitset/access/constexpr.cc -std=gnu++23 (test for excess
> > > errors)
> > > FAIL: 20_util/bitset/access/constexpr.cc -std=gnu++26 (test for excess
> > > errors)
> > > FAIL: 20_util/variant/constexpr.cc -std=gnu++20 (test for excess errors)
> > > FAIL: 20_util/variant/constexpr.cc -std=gnu++26 (test for excess errors)
> > > FAIL: 21_strings/basic_string/cons/char/constexpr.cc -std=gnu++20 (test
> > > for excess errors)
> > > FAIL: 21_strings/basic_string/cons/char/constexpr.cc -std=gnu++26 (test
> > > for excess errors)
> > > FAIL: 21_strings/basic_string/cons/wchar_t/constexpr.cc -std=gnu++20 (test
> > > for excess errors)
> > > FAIL: 21_strings/basic_string/cons/wchar_t/constexpr.cc -std=gnu++26 (test
> > > for excess errors)
> > > FAIL: 21_strings/basic_string/modifiers/swap/constexpr-wchar_t.cc
> > > -std=gnu++20 (test for excess errors)
> > > FAIL: 21_strings/basic_string/modifiers/swap/constexpr-wchar_t.cc
> > > -std=gnu++26 (test for excess errors)
> > > FAIL: 21_strings/basic_string/modifiers/swap/constexpr.cc -std=gnu++20
> > > (test for excess errors)
> > > FAIL: 21_strings/basic_string/modifiers/swap/constexpr.cc -std=gnu++26
> > > (test for excess errors)
> > > FAIL: std/ranges/adaptors/join_with/1.cc -std=gnu++23 (test for excess
> > > errors)
> > > UNRESOLVED: std/ranges/adaptors/join_with/1.cc -std=gnu++23 compilation
> > > failed to produce executable
> > > FAIL: std/ranges/adaptors/join_with/1.cc -std=gnu++26 (test for excess
> > > errors)
> > > UNRESOLVED: std/ranges/adaptors/join_with/1.cc -std=gnu++26 compilation
> > > failed to produce executable
> > >
> > > On investigation though it looks like the issue might be with libstdc++
> > > rather than the patch itself; running the failing tests using clang with
> > > libstdc++ also produces similar errors, and my reading of the code
> > > suggests that this is correct.
> > >
> > > What's the way forward here? Should I look at creating a patch to fix
> > > the libstdc++ issues before resubmitting this patch for the C++
> > > frontend? Or should I submit a version of this patch without the
> > > `std::construct_at` changes and wait till libstdc++ gets fixed for that?
> > >
> >
> > I think we should fix libstdc++. There are probably only a few places that
> > need a fix, which cause all those failures.
> >
> > I can help with those fixes. I'll look into it after the weekend.
> >
>
> Thanks. I did end up getting a chance to look at it earlier today, and
> with the following patch I had no regressions when applying the frontend
> changes. Bootstrapped and regtested on x86_64-pc-linux-gnu.
>
> -- >8 --
>
> This patch ensures that the union members for std::string and
> std::variant are always properly set when a change occurs.
>
> libstdc++-v3/ChangeLog:
>
> * include/bits/basic_string.h: (basic_string(basic_string&&)):
> Activate _M_local_buf when needed.
> (basic_string(basic_string&&, const _Alloc&)): Likewise.
> * include/bits/basic_string.tcc: (basic_string::swap): Likewise.
> * include/std/variant: (__detail::__variant::__construct_n): New.
> (__detail::__variant::__emplace): Use __construct_n.
>
> Signed-off-by: Nathaniel Shead 
> ---
>  libstdc++-v3/include/bits/basic_string.h   |  7 +++--
>  libstdc++-v3/include/bits/basic_string.tcc |  8 +++---
>  libstdc++-v3/include/std/variant   | 32 --
>  3 files changed, 38 insertions(+), 9 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/basic_string.h 
> b/libstdc++-v3/include/bits/basic_string.h
> index 09fd62afa66..7c342879827 100644
> --- a/libstdc++-v3/include/bits/basic_string.h
> +++ b/libstdc++-v3/include/bits/basic_string.h
> @@ -678,7 +678,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
>{
> if (__str._M_is_local())
>   {
> -   traits_type::copy(_M_local_buf, __str._M_local_buf,
> +   traits_type::copy(_M_use_local_data(), __str._M_local_buf,
>   __str.length() + 1);
>   }
> else
> @@ -691,7 +691,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
> // basic_stringbuf relies on writing into unallocated capacity so
> // we mess up the contents if we put a '\0' in the string.
> _M_length(__str.length());
> -   __str._M_data(__str._M_local_data());
> +   __str._M_data(__str._M_use_local_data());
> __str._M_set_length(0);
>}
>
> @@ -717,6 +717,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
>{
> if (__str._M_is_local())
>   {
> +   _M_use_local_data();

Let's add a 

Re: [PATCH v14 16/40] c, c++: Use 16 bits for all use of enum rid for more keyword space

2023-09-27 Thread Jason Merrill
On Tue, Sep 19, 2023 at 7:05 PM Ken Matsui 
wrote:

> On Tue, Sep 19, 2023 at 9:59 AM Jason Merrill  wrote:
> >
> > On 9/15/23 19:51, Ken Matsui via Gcc-patches wrote:
> > > Now that RID_MAX has reached 255, we need to update the bit sizes of
> every
> > > use of the enum rid from 8 to 16 to support more keywords.
> >
> > Sorry to bring this up so late, but this does raise the question of
> > whether we actually want to use keyword space for all these traits that
> > will probably be used approximately once in a C++ translation unit.  I
> > wonder if it would make sense to instead use e.g. RID_TRAIT for all of
> > them and use gperf to look up the specific trait from the identifier?
> >
>
> Thank you for your review. To use gperf, we might need to duplicate
> the list of all traits defined in cp-trait.def. Modifying the traits
> would require us to edit two files, but would it be acceptable?
>

I think the gperf input could be generated from the .def with a simple
script?

Jason


[committed] libstdc++: Prevent unwanted ADL in std::to_array [PR111512]

2023-09-27 Thread Jonathan Wakely
This is the fix for the release branches, where std::to_array is
implemented differently.

Tested x86_64-linux. Pushed to gcc-13 and gcc-12. Will push to gcc-11
after testing.

-- >8 --

Qualify the calls to the __to_array helper to prevent ADL, so we don't
try to complete associated classes.

libstdc++-v3/ChangeLog:

PR libstdc++/111511
PR c++/111512
* include/std/array (to_array): Qualify calls to __to_array.
* testsuite/23_containers/array/creation/111512.cc: New test.

(cherry picked from commit 77cf3773021b0a20d89623e09d620747a05588ec)
---
 libstdc++-v3/include/std/array|  4 +--
 .../23_containers/array/creation/111512.cc| 25 +++
 2 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/23_containers/array/creation/111512.cc

diff --git a/libstdc++-v3/include/std/array b/libstdc++-v3/include/std/array
index 70280c1beeb..97cca454ef9 100644
--- a/libstdc++-v3/include/std/array
+++ b/libstdc++-v3/include/std/array
@@ -436,7 +436,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static_assert(!is_array_v<_Tp>);
   static_assert(is_constructible_v<_Tp, _Tp&>);
   if constexpr (is_constructible_v<_Tp, _Tp&>)
-   return __to_array(__a, make_index_sequence<_Nm>{});
+   return std::__to_array(__a, make_index_sequence<_Nm>{});
   __builtin_unreachable(); // FIXME: see PR c++/91388
 }
 
@@ -449,7 +449,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static_assert(!is_array_v<_Tp>);
   static_assert(is_move_constructible_v<_Tp>);
   if constexpr (is_move_constructible_v<_Tp>)
-   return __to_array<1>(__a, make_index_sequence<_Nm>{});
+   return std::__to_array<1>(__a, make_index_sequence<_Nm>{});
   __builtin_unreachable(); // FIXME: see PR c++/91388
 }
 #endif // C++20
diff --git a/libstdc++-v3/testsuite/23_containers/array/creation/111512.cc 
b/libstdc++-v3/testsuite/23_containers/array/creation/111512.cc
new file mode 100644
index 000..f510480ae4b
--- /dev/null
+++ b/libstdc++-v3/testsuite/23_containers/array/creation/111512.cc
@@ -0,0 +1,25 @@
+// { dg-options "-std=gnu++20" }
+// { dg-do compile { target c++20 } }
+
+// Bug libstdc++/111511 - Incorrect ADL in std::to_array in GCC 11/12/13
+// Bug c++/111512 - GCC's __builtin_memcpy can trigger ADL
+
+#include 
+#include 
+
+struct incomplete;
+
+template
+struct holder {
+T t; // { dg-bogus "'holder::t' has incomplete type" }
+};
+
+// A complete type that cannot be used as an associated type for ADL.
+using adl_bomb = holder*;
+
+int main()
+{
+adl_bomb a[1]{};
+(void) std::to_array(a);
+(void) std::to_array(std::move(a));
+}
-- 
2.41.0



[pushed] Darwin, configure: Allow for an unrecognisable dsymutil [PR111610].

2023-09-27 Thread Iain Sandoe
tested on x86_64-darwin21 (native) and with crosses from x86_64-linux-gnu
to powerpc-darwin, i686-darwin, x86_64-darwin and with
--enable-languages=all to powerpc-apple-darwin9 (this built the front ends
OK - but there seems to be an unrelated config error in that even with
all-host, some language front end is triggering a build of target runtimes).

--- 8< ---

We had a catch-all configuration case for missing or unrecognised dsymutil
but it was setting the dsymutil source to "UNKNOWN" which is not usable in
this context (since it clashes with an existing enum).  We rename this to
DET_UNKNOWN (for Darwin External Toolchain).

PR target/111610

gcc/ChangeLog:

* configure: Regenerate.
* configure.ac: Rename the missing dsymutil case to "DET_UNKNOWN".

Signed-off-by: Iain Sandoe 
---
 gcc/configure| 2 +-
 gcc/configure.ac | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/configure b/gcc/configure
index 307a3e05bb3..c43bde8174b 100755
--- a/gcc/configure
+++ b/gcc/configure
@@ -30746,7 +30746,7 @@ $as_echo_n "checking dsymutil version 
\"$dsymutil_temp\"... " >&6; }
   dsymutil_kind=LLVM
   dsymutil_vers=`echo $dsymutil_temp | sed 's/.*LLVM\ version\ 
\([0-9\.]*\).*/\1/'`
 else
-  dsymutil_kind=UNKNOWN
+  dsymutil_kind=DET_UNKNOWN
   dsymutil_vers="0.0"
 fi
 dsymutil_major=`expr "$dsymutil_vers" : '\([0-9]*\)'`
diff --git a/gcc/configure.ac b/gcc/configure.ac
index 9b35c0ffea3..fb8e32f8ee5 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -6363,7 +6363,7 @@ if test x"$dsymutil_flag" = x"yes"; then
   dsymutil_kind=LLVM
   dsymutil_vers=`echo $dsymutil_temp | sed 's/.*LLVM\ version\ 
\([[0-9\.]]*\).*/\1/'`
 else
-  dsymutil_kind=UNKNOWN
+  dsymutil_kind=DET_UNKNOWN
   dsymutil_vers="0.0"
 fi
 dsymutil_major=`expr "$dsymutil_vers" : '\([[0-9]]*\)'`
-- 
2.39.2 (Apple Git-143)



Re: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-27 Thread Philipp Tomsich
Applied to master (with fixups). Thanks!
Philipp.

On Wed, 27 Sept 2023 at 10:40, Kyrylo Tkachov  wrote:
>
> Hi Manos,
>
> > -Original Message-
> > From: Manos Anagnostakis 
> > Sent: Tuesday, September 26, 2023 2:52 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Kyrylo Tkachov ; Tamar Christina
> > ; Philipp Tomsich ;
> > Manos Anagnostakis 
> > Subject: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp
> > formation.
> >
> > This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> > to provide the requested behaviour for handling ldp and stp:
> >
> >   /* Allow the tuning structure to disable LDP instruction formation
> >  from combining instructions (e.g., in peephole2).
> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >1. control policies for load and store separately;
> >2. support the following policies:
> >   - default (use what is in the tuning structure)
> >   - always
> >   - never
> >   - aligned (only if the compiler can prove that the
> > load will be aligned to 2 * element_size)  */
> >
> > It provides two new and concrete target-specific command-line parameters
> > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > to give the ability to control load and store policies seperately as
> > stated in part 1 of the TODO.
> >
> > The accepted values for both parameters are:
> > - default: Use the policy of the tuning structure (default).
> > - always: Emit ldp/stp regardless of alignment.
> > - never: Do not emit ldp/stp.
> > - aligned: In order to emit ldp/stp, first check if the load/store will
> >   be aligned to 2 * element_size.
> >
> > Bootstrapped and regtested aarch64-linux.
> >
> > gcc/ChangeLog:
> > * config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
> >   enum type.
> > (enum aarch64_stp_policy): New enum type.
> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> >   appropriate enums for the policies.
> >   (aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
> > * config/aarch64/aarch64-tuning-flags.def
> >   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> >   options.
> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> >   function to parse ldp-policy parameter.
> > (aarch64_parse_stp_policy): New function to parse stp-policy 
> > parameter.
> > (aarch64_override_options_internal): Call parsing functions.
> >   (aarch64_mem_ok_with_ldpstp_policy_model): New function.
> > (aarch64_operands_ok_for_ldpstp): Add call to
> >   aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
> >   check and alignment check and remove superseded ones.
> > (aarch64_operands_adjust_ok_for_ldpstp): Add call to
> > aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
> >   check and alignment check and remove superseded ones.
> > * config/aarch64/aarch64.opt: Add parameters.
> >   * doc/invoke.texi: Document the parameters accordingly.
>
> The ChangeLog entry should name the new parameters. For example:
> * config/aarch64/aarch64.opt (aarch64-ldp-policy): New param.
>
> Ok with the fixed ChangeLog.
> Thank you for the work!
> Kyrill
>
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
> > * gcc.target/aarch64/ldp_aligned.c: New test.
> > * gcc.target/aarch64/ldp_always.c: New test.
> > * gcc.target/aarch64/ldp_never.c: New test.
> > * gcc.target/aarch64/stp_aligned.c: New test.
> > * gcc.target/aarch64/stp_always.c: New test.
> > * gcc.target/aarch64/stp_never.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > ---
> > Changes in v4:
> > - Changed the parameters to accept enum instead of an
> >   integer and updated documentation in doc/invoke.texi.
> > - Packed all the new checks in aarch64_operands_ok_for_ldpstp/
> >   aarch64_operands_adjust_ok_for_ldpstp in a new function
> >   called aarch64_mem_ok_with_ldpstp_policy_model.
> >
> >  gcc/config/aarch64/aarch64-opts.h |  16 ++
> >  gcc/config/aarch64/aarch64-protos.h   |  25 +++
> >  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
> >  gcc/config/aarch64/aarch64.cc | 212 +-
> >  gcc/config/aarch64/aarch64.opt|  38 
> >  gcc/doc/invoke.texi   |  20 ++
> >  .../aarch64/ampere1-no_ldp_combine.c  |  11 -
> >  .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
> >  gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
> >  gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
> >  .../gcc.target/aarch64/stp_aligned.c  |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
> >  

Re: [PATCH] vec.h, v2: Make some ops work with non-trivially copy constructible and/or destructible types

2023-09-27 Thread Mikael Morin

Hello,

Le 27/09/2023 à 12:46, Jakub Jelinek a écrit :

--- gcc/vec.h.jj2023-09-27 10:38:50.635845540 +0200
+++ gcc/vec.h   2023-09-27 12:11:56.665586490 +0200
@@ -1028,13 +1050,17 @@ template
  inline void
  vec::truncate (unsigned size)
  {
-  gcc_checking_assert (length () >= size);
+  unsigned l = length ();
+  gcc_checking_assert (l >= size);
+  if (!std::is_trivially_destructible ::value)
+vec_destruct (address () + l, l - size);


Shouldn't this line be:

vec_destruct (address () + *size*, l - size);

instead?


m_vecpfx.m_num = size;
  }
  
  


[PATCH v2 1/2] RISC-V: Add support for XCVmac extension in CV32E40P

2023-09-27 Thread Mary Bennett
Spec: 
github.com/openhwgroup/core-v-sw/blob/master/specifications/corev-builtin-spec.md

Contributors:
  Mary Bennett 
  Nandni Jamnadas 
  Pietra Ferreira 
  Charlie Keaney
  Jessica Mills
  Craig Blackmore 
  Simon Cook 
  Jeremy Bennett 
  Helene Chelin 

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: Added XCVmac.
* config/riscv/riscv-ftypes.def: Added XCVmac builtins.
* config/riscv/riscv-opts.h: Likewise.
* config/riscv/riscv.md: Likewise.
* config/riscv/riscv.opt: Likewise.
* doc/extend.texi: Added XCVmac builtin documentation.
* config/riscv/corev.def: New file.
* config/riscv/corev.md: New file.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Added new effective target check.
* gcc.target/riscv/cv-mac-compile.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mac.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-machhsn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-machhsrn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-machhun.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-machhurn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-macsn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-macsrn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-macun.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-macurn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-msu.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulhhsn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulhhsrn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulhhun.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulhhurn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulsn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulsrn.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulun.c: New test.
* gcc.target/riscv/cv-mac-fail-compile-mulurn.c: New test.
* gcc.target/riscv/cv-mac-test-autogeneration.c: New test.
---
 gcc/common/config/riscv/riscv-common.cc   |   4 +
 gcc/config/riscv/corev.def|  19 +
 gcc/config/riscv/corev.md | 390 ++
 gcc/config/riscv/riscv-builtins.cc|  10 +
 gcc/config/riscv/riscv-ftypes.def |   5 +
 gcc/config/riscv/riscv-opts.h |   5 +
 gcc/config/riscv/riscv.md |   1 +
 gcc/config/riscv/riscv.opt|   3 +
 gcc/doc/extend.texi   |  80 
 .../gcc.target/riscv/cv-mac-compile.c | 198 +
 .../riscv/cv-mac-fail-compile-mac.c   |  25 ++
 .../riscv/cv-mac-fail-compile-machhsn.c   |  24 ++
 .../riscv/cv-mac-fail-compile-machhsrn.c  |  24 ++
 .../riscv/cv-mac-fail-compile-machhun.c   |  24 ++
 .../riscv/cv-mac-fail-compile-machhurn.c  |  24 ++
 .../riscv/cv-mac-fail-compile-macsn.c |  24 ++
 .../riscv/cv-mac-fail-compile-macsrn.c|  24 ++
 .../riscv/cv-mac-fail-compile-macun.c |  24 ++
 .../riscv/cv-mac-fail-compile-macurn.c|  24 ++
 .../riscv/cv-mac-fail-compile-msu.c   |  25 ++
 .../riscv/cv-mac-fail-compile-mulhhsn.c   |  24 ++
 .../riscv/cv-mac-fail-compile-mulhhsrn.c  |  24 ++
 .../riscv/cv-mac-fail-compile-mulhhun.c   |  24 ++
 .../riscv/cv-mac-fail-compile-mulhhurn.c  |  24 ++
 .../riscv/cv-mac-fail-compile-mulsn.c |  24 ++
 .../riscv/cv-mac-fail-compile-mulsrn.c|  24 ++
 .../riscv/cv-mac-fail-compile-mulun.c |  24 ++
 .../riscv/cv-mac-fail-compile-mulurn.c|  24 ++
 .../riscv/cv-mac-test-autogeneration.c|  18 +
 gcc/testsuite/lib/target-supports.exp |  13 +
 30 files changed, 1180 insertions(+)
 create mode 100644 gcc/config/riscv/corev.def
 create mode 100644 gcc/config/riscv/corev.md
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-compile.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-mac.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhsn.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhsrn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhun.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhurn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-macsn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-macsrn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-macun.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-macurn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-msu.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-mulhhsn.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-mulhhsrn.c
 create mode 100644 

[PATCH v2 2/2] RISC-V: Add support for XCValu extension in CV32E40P

2023-09-27 Thread Mary Bennett
Spec: 
github.com/openhwgroup/core-v-sw/blob/master/specifications/corev-builtin-spec.md

Contributors:
  Mary Bennett 
  Nandni Jamnadas 
  Pietra Ferreira 
  Charlie Keaney
  Jessica Mills
  Craig Blackmore 
  Simon Cook 
  Jeremy Bennett 
  Helene Chelin 

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: Added the XCValu
extension.
* config/riscv/constraints.md: Added builtins for the XCValu
extension.
* config/riscv/predicates.md (immediate_register_operand):
Likewise.
* config/riscv/corev.def: Likewise.
* config/riscv/corev.md: Likewise.
* config/riscv/riscv-builtins.cc (AVAIL): Likewise.
  (RISCV_ATYPE_UHI): Likewise.
* config/riscv/riscv-ftypes.def: Likewise.
* config/riscv/riscv-opts.h: Likewise.
* config/riscv/riscv.opt: Likewise.
* config/riscv/riscv.cc (riscv_print_operand): Likewise.
* doc/extend.texi: Added XCValu documentation.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Added proc for the XCValu extension.
* gcc.target/riscv/cv-alu-compile.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-addn.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-addrn.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-addun.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-addurn.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-clip.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-clipu.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-subn.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-subrn.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-subun.c: New test.
* gcc.target/riscv/cv-alu-fail-compile-suburn.c: New test.
* gcc.target/riscv/cv-alu-fail-compile.c: New test.
---
 gcc/common/config/riscv/riscv-common.cc   |   2 +
 gcc/config/riscv/constraints.md   |   7 +
 gcc/config/riscv/corev.def|  24 ++
 gcc/config/riscv/corev.md | 303 ++
 gcc/config/riscv/predicates.md|   5 +
 gcc/config/riscv/riscv-builtins.cc|   3 +
 gcc/config/riscv/riscv-ftypes.def |   6 +
 gcc/config/riscv/riscv-opts.h |   2 +
 gcc/config/riscv/riscv.cc |   7 +
 gcc/doc/extend.texi   |  94 ++
 .../gcc.target/riscv/cv-alu-compile.c | 252 +++
 .../riscv/cv-alu-fail-compile-addn.c  |  11 +
 .../riscv/cv-alu-fail-compile-addrn.c |  11 +
 .../riscv/cv-alu-fail-compile-addun.c |  11 +
 .../riscv/cv-alu-fail-compile-addurn.c|  11 +
 .../riscv/cv-alu-fail-compile-clip.c  |  11 +
 .../riscv/cv-alu-fail-compile-clipu.c |  11 +
 .../riscv/cv-alu-fail-compile-subn.c  |  11 +
 .../riscv/cv-alu-fail-compile-subrn.c |  11 +
 .../riscv/cv-alu-fail-compile-subun.c |  11 +
 .../riscv/cv-alu-fail-compile-suburn.c|  11 +
 .../gcc.target/riscv/cv-alu-fail-compile.c|  32 ++
 gcc/testsuite/lib/target-supports.exp |  13 +
 23 files changed, 860 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-compile.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addrn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addun.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addurn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-clip.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-clipu.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-subn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-subrn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-subun.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-suburn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile.c

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index 53e21fa4bce..e7c1a99fbd2 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -311,6 +311,7 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =
   {"svpbmt",  ISA_SPEC_CLASS_NONE, 1, 0},
 
   {"xcvmac", ISA_SPEC_CLASS_NONE, 1, 0},
+  {"xcvalu", ISA_SPEC_CLASS_NONE, 1, 0},
 
   {"xtheadba", ISA_SPEC_CLASS_NONE, 1, 0},
   {"xtheadbb", ISA_SPEC_CLASS_NONE, 1, 0},
@@ -1483,6 +1484,7 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
  {"ztso", &gcc_options::x_riscv_ztso_subext, MASK_ZTSO},
 
  {"xcvmac", &gcc_options::x_riscv_xcv_flags, MASK_XCVMAC},
+  {"xcvalu", &gcc_options::x_riscv_xcv_flags, MASK_XCVALU},
 
  {"xtheadba", &gcc_options::x_riscv_xthead_subext, 

[PATCH v2 0/2] RISC-V: Support CORE-V XCVMAC and XCVALU extensions

2023-09-27 Thread Mary Bennett
This patch series presents the comprehensive implementation of the MAC and ALU
extension for CORE-V.

Tested with riscv-gnu-toolchain on binutils, ld, gas and gcc testsuites to
ensure its correctness and compatibility with the existing codebase.
However, your input, reviews, and suggestions are invaluable in making this
extension even more robust.

The CORE-V builtins are described in the specification [1] and work can be
found in the OpenHW group's Github repository [2].

[1] 
github.com/openhwgroup/core-v-sw/blob/master/specifications/corev-builtin-spec.md

[2] github.com/openhwgroup/corev-gcc

Contributors:
Mary Bennett 
Nandni Jamnadas 
Pietra Ferreira 
Charlie Keaney
Jessica Mills
Craig Blackmore 
Simon Cook 
Jeremy Bennett 
Helene Chelin 

  RISC-V: Add support for XCValu extension in CV32E40P
  RISC-V: Add support for XCVmac extension in CV32E40P

 gcc/common/config/riscv/riscv-common.cc   |   6 +
 gcc/config/riscv/constraints.md   |   7 +
 gcc/config/riscv/corev.def|  43 ++
 gcc/config/riscv/corev.md | 693 ++
 gcc/config/riscv/predicates.md|   5 +
 gcc/config/riscv/riscv-builtins.cc|  13 +
 gcc/config/riscv/riscv-ftypes.def |  11 +
 gcc/config/riscv/riscv-opts.h |   7 +
 gcc/config/riscv/riscv.cc |   7 +
 gcc/config/riscv/riscv.md |   1 +
 gcc/config/riscv/riscv.opt|   3 +
 gcc/doc/extend.texi   | 174 +
 .../gcc.target/riscv/cv-alu-compile.c | 252 +++
 .../riscv/cv-alu-fail-compile-addn.c  |  11 +
 .../riscv/cv-alu-fail-compile-addrn.c |  11 +
 .../riscv/cv-alu-fail-compile-addun.c |  11 +
 .../riscv/cv-alu-fail-compile-addurn.c|  11 +
 .../riscv/cv-alu-fail-compile-clip.c  |  11 +
 .../riscv/cv-alu-fail-compile-clipu.c |  11 +
 .../riscv/cv-alu-fail-compile-subn.c  |  11 +
 .../riscv/cv-alu-fail-compile-subrn.c |  11 +
 .../riscv/cv-alu-fail-compile-subun.c |  11 +
 .../riscv/cv-alu-fail-compile-suburn.c|  11 +
 .../gcc.target/riscv/cv-alu-fail-compile.c|  32 +
 .../gcc.target/riscv/cv-mac-compile.c | 198 +
 .../riscv/cv-mac-fail-compile-mac.c   |  25 +
 .../riscv/cv-mac-fail-compile-machhsn.c   |  24 +
 .../riscv/cv-mac-fail-compile-machhsrn.c  |  24 +
 .../riscv/cv-mac-fail-compile-machhun.c   |  24 +
 .../riscv/cv-mac-fail-compile-machhurn.c  |  24 +
 .../riscv/cv-mac-fail-compile-macsn.c |  24 +
 .../riscv/cv-mac-fail-compile-macsrn.c|  24 +
 .../riscv/cv-mac-fail-compile-macun.c |  24 +
 .../riscv/cv-mac-fail-compile-macurn.c|  24 +
 .../riscv/cv-mac-fail-compile-msu.c   |  25 +
 .../riscv/cv-mac-fail-compile-mulhhsn.c   |  24 +
 .../riscv/cv-mac-fail-compile-mulhhsrn.c  |  24 +
 .../riscv/cv-mac-fail-compile-mulhhun.c   |  24 +
 .../riscv/cv-mac-fail-compile-mulhhurn.c  |  24 +
 .../riscv/cv-mac-fail-compile-mulsn.c |  24 +
 .../riscv/cv-mac-fail-compile-mulsrn.c|  24 +
 .../riscv/cv-mac-fail-compile-mulun.c |  24 +
 .../riscv/cv-mac-fail-compile-mulurn.c|  24 +
 .../riscv/cv-mac-test-autogeneration.c|  18 +
 gcc/testsuite/lib/target-supports.exp |  26 +
 45 files changed, 2040 insertions(+)
 create mode 100644 gcc/config/riscv/corev.def
 create mode 100644 gcc/config/riscv/corev.md
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-compile.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addrn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addun.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-addurn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-clip.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-clipu.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-subn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-subrn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-subun.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile-suburn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-alu-fail-compile.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-compile.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-mac.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhsn.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhsrn.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhun.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/cv-mac-fail-compile-machhurn.c
 create mode 100644 

Re: [PATCH 1/2] c++: remove NON_DEPENDENT_EXPR, part 1

2023-09-27 Thread Patrick Palka
On Tue, 26 Sep 2023, Jason Merrill wrote:

> On 9/25/23 16:43, Patrick Palka wrote:
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK
> > for trunk?
> > 
> > -- >8 --
> > 
> > This tree code dates all the way back to r69130[1] which implemented
> > typing of non-dependent expressions.  Its motivation was never clear (to
> > me at least) since the documentation for it in e.g. cp-tree.def doesn't
> > seem accurate anymore.  build_non_dependent_expr has since gained
> > a bunch of edge cases about whether (or how) to wrap certain templated
> > trees, making it hard to reason about in general.
> > 
> > So this patch removes this tree code, and temporarily turns
> > build_non_dependent_expr into the identity function.  The subsequent
> > patch will remove build_non_dependent_expr and adjust its callers
> > appropriately.
> > 
> > We now need to gracefully handle templated (sub)trees in a couple of
> > places, places which previously didn't see templated trees since they
> > didn't look through NON_DEPENDENT_EXPR.
> > 
> > [1]: https://gcc.gnu.org/pipermail/gcc-patches/2003-July/109355.html
> > 
> > gcc/c-family/ChangeLog:
> > 
> > * c-warn.cc (check_address_or_pointer_of_packed_member): Handle
> > templated CALL_EXPR naming a local extern function.
> > 
> > gcc/cp/ChangeLog:
> > 
> > * class.cc (instantiate_type): Remove NON_DEPENDENT_EXPR
> > handling.
> > * constexpr.cc (cxx_eval_constant_expression): Likewise.
> > (potential_constant_expression_1): Likewise.
> > * coroutines.cc (coro_validate_builtin_call): Don't
> > expect ALIGNOF_EXPR to be wrapped in NON_DEPENDENT_EXPR.
> > * cp-objcp-common.cc (cp_common_init_ts): Remove
> > NON_DEPENDENT_EXPR handling.
> > * cp-tree.def (NON_DEPENDENT_EXPR): Remove.
> > * cp-tree.h (build_non_dependent_expr): Temporarily redefine as
> > the identity function.
> > * cvt.cc (maybe_warn_nodiscard): Handle templated CALL_EXPR
> > naming a local extern function.
> > * cxx-pretty-print.cc (cxx_pretty_printer::expression): Remove
> > NON_DEPENDENT_EXPR handling.
> > * error.cc (dump_decl): Likewise.
> > (dump_expr): Likewise.
> > * expr.cc (mark_use): Likewise.
> > (mark_exp_read): Likewise.
> > * pt.cc (build_non_dependent_expr): Remove.
> > * tree.cc (lvalue_kind): Remove NON_DEPENDENT_EXPR handling.
> > (cp_stabilize_reference): Likewise.
> > * typeck.cc (warn_for_null_address): Likewise.
> > (cp_build_binary_op): Handle type-dependent SIZEOF_EXPR operands.
> > (cp_build_unary_op) : Don't fold inside a
> > template.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > * g++.dg/concepts/var-concept3.C: Adjust expected diagnostic
> > for attempting to call a variable concept.
> > ---
> >   gcc/c-family/c-warn.cc   |  2 +-
> >   gcc/cp/class.cc  |  9 --
> >   gcc/cp/constexpr.cc  |  9 --
> >   gcc/cp/coroutines.cc |  3 +-
> >   gcc/cp/cp-objcp-common.cc|  1 -
> >   gcc/cp/cp-tree.def   | 11 ---
> >   gcc/cp/cp-tree.h |  2 +-
> >   gcc/cp/cvt.cc|  4 +-
> >   gcc/cp/cxx-pretty-print.cc   |  1 -
> >   gcc/cp/error.cc  |  8 --
> >   gcc/cp/expr.cc   |  2 -
> >   gcc/cp/pt.cc | 92 
> >   gcc/cp/tree.cc   |  5 --
> >   gcc/cp/typeck.cc | 13 +--
> >   gcc/testsuite/g++.dg/concepts/var-concept3.C |  2 +-
> >   15 files changed, 15 insertions(+), 149 deletions(-)
> > 
> > diff --git a/gcc/c-family/c-warn.cc b/gcc/c-family/c-warn.cc
> > index e67dd87a773..c07770394bf 100644
> > --- a/gcc/c-family/c-warn.cc
> > +++ b/gcc/c-family/c-warn.cc
> > @@ -3029,7 +3029,7 @@ check_address_or_pointer_of_packed_member (tree type,
> > tree rhs)
> > if (TREE_CODE (rhs) == CALL_EXPR)
> > {
> >   rhs = CALL_EXPR_FN (rhs); /* Pointer expression.  */
> > - if (rhs == NULL_TREE)
> > + if (rhs == NULL_TREE || TREE_CODE (rhs) == IDENTIFIER_NODE)
> > return NULL_TREE;
> >   rhs = TREE_TYPE (rhs);/* Pointer type.  */
> >   /* We could be called while processing a template and RHS could be
> >  a functor.  In that case it's a class, not a pointer.  */
> >   if (!POINTER_TYPE_P (rhs))
> 
> How about adding !rhs to this condition instead of checking specifically for
> IDENTIFIER_NODE above?

Done.

> 
> > return NULL_TREE;
> 
> > @@ -1048,7 +1048,7 @@ maybe_warn_nodiscard (tree expr, impl_conv_void
> > implicit)
> >   call = TARGET_EXPR_INITIAL (expr);
> > location_t loc = cp_expr_loc_or_input_loc (call);
> > tree callee = cp_get_callee (call);
> > -  if (!callee)
> > +  if (!callee || identifier_p (callee))
> >

Re: [PATCH]middle-end Fold vec_cond into conditional ternary or binary operation when sharing operand [PR109154]

2023-09-27 Thread Richard Biener
On Wed, 27 Sep 2023, Tamar Christina wrote:

> Hi All,
> 
> When we have a vector conditional on a masked target which is doing a 
> selection
> on the result of a conditional operation where one of the operands of the
> conditional operation is the other operand of the select, then we can fold the
> vector conditional into the operation.
> 
> Concretely this transforms
> 
>   c = mask1 ? (masked_op mask2 a b) : b
> 
> into
> 
>   c = masked_op (mask1 & mask2) a b
> 
> The mask is then propagated upwards by the compiler.  In the SVE case we don't
> end up needing a mask AND here since `mask2` will end up in the instruction
> creating `mask` which gives us a natural &.
> 
> Such transformations are more common now in GCC 13+ as PRE has not started
> unsharing of common code in case it can make one branch fully independent.
> 
> e.g. in this case `b` becomes a loop invariant value after PRE.
> 
> This transformation removes the extra select for masked architectures but
> doesn't fix the general case.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   PR tree-optimization/109154
>   * match.pd: Add new cond_op rule.
> 
> gcc/testsuite/ChangeLog:
> 
>   PR tree-optimization/109154
>   * gcc.target/aarch64/sve/pre_cond_share_1.c: New test.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 
> 8ebde06dcd4b26d694826cffad0fb17e1136600a..20b9ea211385d9cc3876a5002f771267533e8868
>  100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -8827,6 +8827,30 @@ and,
>(IFN_COND_ADD @0 @1 (vec_cond @2 @3 integer_zerop) @1)
> (IFN_COND_ADD (bit_and @0 @2) @1 @3 @1))
>  
> +/* Detect simplification for vector condition folding where
> +
> +  c = mask1 ? (masked_op mask2 a b) : b
> +
> +  into
> +
> +  c = masked_op (mask1 & mask2) a b
> +
> +  where the operation can be partially applied to one operand. */
> +
> +(for cond_op (COND_BINARY)
> + (simplify
> +  (vec_cond @0
> +   (cond_op:s @1 @2 @3 @4) @3)
> +  (cond_op (BIT_AND_EXPR @1 @0) @2 @3 @4)))

(bit_and ..., not BIT_AND_EXPR please

> +
> +/* And same for ternary expressions.  */
> +
> +(for cond_op (COND_TERNARY)
> + (simplify
> +  (vec_cond @0
> +   (cond_op:s @1 @2 @3 @4 @5) @4)
> +  (cond_op (BIT_AND_EXPR @1 @0) @2 @3 @4 @5)))

likewise

OK with that change.

Thanks,
Richard.

> +
>  /* For pointers @0 and @2 and nonnegative constant offset @1, look for
> expressions like:
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c
> new file mode 100644
> index 
> ..b51d0f298ea1fcf556365fe4afc875ebcd67584b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pre_cond_share_1.c
> @@ -0,0 +1,132 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -fdump-tree-optimized" } */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +typedef struct __attribute__((__packed__)) _Atom {
> +float x, y, z;
> +int32_t type;
> +} Atom;
> +
> +typedef struct __attribute__((__packed__)) _FFParams {
> +int32_t hbtype;
> +float radius;
> +float hphb;
> +float elsc;
> +} FFParams;
> +
> +#ifndef PPWI
> +#define PPWI (64)
> +#endif
> +
> +#ifndef ITERS
> +#define ITERS 8
> +#endif
> +
> +#define DIFF_TOLERANCE_PCT 0.025f
> +
> +#define POSES_SIZE 393216
> +#define PROTEIN_SIZE 938
> +#define LIGAND_SIZE 26
> +#define FORCEFIELD_SIZE 34
> +
> +#define ZERO 0.0f
> +#define QUARTER 0.25f
> +#define HALF 0.5f
> +#define ONE 1.0f
> +#define TWO 2.0f
> +#define FOUR 4.0f
> +#define CNSTNT 45.0f
> +
> +// Energy evaluation parameters
> +#define HBTYPE_F 70
> +#define HBTYPE_E 69
> +#define HARDNESS 38.0f
> +#define NPNPDIST 5.5f
> +#define NPPDIST 1.0f
> +
> +void
> +fasten_main(size_t group, size_t ntypes, size_t nposes, size_t natlig, 
> size_t natpro,//
> +const Atom *protein, const Atom *ligand, 
> //
> +const float *transforms_0, const float *transforms_1, const 
> float *transforms_2, //
> +const float *transforms_3, const float *transforms_4, const 
> float *transforms_5, //
> +const FFParams *forcefield, float *energies  
> //
> +) {
> +
> +float etot[PPWI];
> +float lpos_x[PPWI];
> +
> +for (int l = 0; l < PPWI; l++) {
> +etot[l] = 0.f;
> +lpos_x[l] = 0.f;
> +}
> +
> +// Loop over ligand atoms
> +for (int il = 0; il < natlig; il++) {
> +// Load ligand atom data
> +const Atom l_atom = ligand[il];
> +const FFParams l_params = forcefield[l_atom.type];
> +const int lhphb_ltz = l_params.hphb < 0.f;
> +const int lhphb_gtz = l_params.hphb > 0.f;
> +
> +// Transform ligand atom
> +
> +// Loop over protein atoms
> +for (int ip = 0; ip < natpro; ip++) {
> +  

Re: [PATCH] ifcvt/vect: Emit COND_ADD for conditional scalar reduction.

2023-09-27 Thread Richard Biener
On Wed, 20 Sep 2023, Robin Dapp wrote:

> Hi,
> 
> as described in PR111401 we currently emit a COND and a PLUS expression
> for conditional reductions.  This makes it difficult to combine both
> into a masked reduction statement later.
> This patch improves that by directly emitting a COND_ADD during ifcvt and
> adjusting some vectorizer code to handle it.
> 
> It also makes neutral_op_for_reduction return -0 if HONOR_SIGNED_ZEROS
> is true.
> 
> Related question/change: We only allow PLUS_EXPR in fold_left_reduction_fn
> but have code to handle MINUS_EXPR in vectorize_fold_left_reduction.  I
> suppose that's intentional but it "just works" on riscv and the testsuite
> doesn't change when allowing MINUS_EXPR so I went ahead and did that.
> 
> Bootstrapped and regtested on x86 and aarch64.

I think overall the patch is fine - please address Tamar's comments
though, those look valid.

Thanks,
Richard.

> Regards
>  Robin
> 
> gcc/ChangeLog:
> 
>   PR middle-end/111401
>   * internal-fn.cc (cond_fn_p): New function.
>   * internal-fn.h (cond_fn_p): Define.
>   * tree-if-conv.cc (convert_scalar_cond_reduction): Emit COND_ADD
>   if supported.
>   (predicate_scalar_phi): Add whitespace.
>   * tree-vect-loop.cc (fold_left_reduction_fn): Add IFN_COND_ADD.
>   (neutral_op_for_reduction): Return -0 for PLUS.
>   (vect_is_simple_reduction): Don't count else operand in
>   COND_ADD.
>   (vectorize_fold_left_reduction): Add COND_ADD handling.
>   (vectorizable_reduction): Don't count else operand in COND_ADD.
>   (vect_transform_reduction): Add COND_ADD handling.
>   * tree-vectorizer.h (neutral_op_for_reduction): Add default
>   parameter.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c: New test.
>   * gcc.target/riscv/rvv/autovec/cond/pr111401.c: New test.
> ---
>  gcc/internal-fn.cc|  38 +
>  gcc/internal-fn.h |   1 +
>  .../vect-cond-reduc-in-order-2-signed-zero.c  | 141 ++
>  .../riscv/rvv/autovec/cond/pr111401.c |  61 
>  gcc/tree-if-conv.cc   |  63 ++--
>  gcc/tree-vect-loop.cc | 130 
>  gcc/tree-vectorizer.h |   2 +-
>  7 files changed, 394 insertions(+), 42 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cond/pr111401.c
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 0fd34359247..77939890f5a 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4241,6 +4241,44 @@ first_commutative_argument (internal_fn fn)
>  }
>  }
>  
> +/* Return true if this CODE describes a conditional (masked) internal_fn.  */
> +
> +bool
> +cond_fn_p (code_helper code)
> +{
> +  if (!code.is_fn_code ())
> +return false;
> +
> +  if (!internal_fn_p ((combined_fn) code))
> +return false;
> +
> +  internal_fn fn = as_internal_fn ((combined_fn) code);
> +  switch (fn)
> +{
> +#undef DEF_INTERNAL_COND_FN
> +#define DEF_INTERNAL_COND_FN(NAME, F, O, T)\
> +case IFN_COND_##NAME:  \
> +case IFN_COND_LEN_##NAME:  \
> +  return true;
> +#include "internal-fn.def"
> +#undef DEF_INTERNAL_COND_FN
> +
> +#undef DEF_INTERNAL_SIGNED_COND_FN
> +#define DEF_INTERNAL_SIGNED_COND_FN(NAME, F, S, SO, UO, T) \
> +case IFN_COND_##NAME:  \
> +case IFN_COND_LEN_##NAME:  \
> +  return true;
> +#include "internal-fn.def"
> +#undef DEF_INTERNAL_SIGNED_COND_FN
> +
> +default:
> +  return false;
> +}
> +
> +  return false;
> +}
> +
> +
>  /* Return true if this CODE describes an internal_fn that returns a vector 
> with
> elements twice as wide as the element size of the input vectors.  */
>  
> diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
> index 99de13a0199..f1cc9db29c0 100644
> --- a/gcc/internal-fn.h
> +++ b/gcc/internal-fn.h
> @@ -219,6 +219,7 @@ extern bool commutative_ternary_fn_p (internal_fn);
>  extern int first_commutative_argument (internal_fn);
>  extern bool associative_binary_fn_p (internal_fn);
>  extern bool widening_fn_p (code_helper);
> +extern bool cond_fn_p (code_helper code);
>  
>  extern bool set_edom_supported_p (void);
>  
> diff --git 
> a/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c 
> b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> new file mode 100644
> index 000..57c600838ee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-cond-reduc-in-order-2-signed-zero.c
> @@ -0,0 +1,141 @@
> +/* Make sure a -0 stays -0 when we perform a conditional reduction.  */
> +/* { 

Re: [PATCH] Fixes for profile count/probability maintenance

2023-09-27 Thread Jan Hubicka
> > gcc/ChangeLog:
> >
> > * auto-profile.cc (afdo_calculate_branch_prob): Fix count 
> > comparisons
> > * ipa-utils.cc (ipa_merge_profiles): Guard against zero count when
> > computing probabilities
> > * tree-vect-loop-manip.cc (vect_do_peeling): Guard against zero 
> > count
> > when scaling loop profile
> >
> > Tested on x86_64-pc-linux-gnu.
> >
> > ---
> >  gcc/auto-profile.cc |  4 ++--
> >  gcc/ipa-utils.cc| 16 +---
> >  gcc/tree-vect-loop-manip.cc |  2 +-
> >  3 files changed, 12 insertions(+), 10 deletions(-)
> >
> > diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
> > index ff3b763945c..3e61f36c29b 100644
> > --- a/gcc/auto-profile.cc
> > +++ b/gcc/auto-profile.cc
> > @@ -1434,7 +1434,7 @@ afdo_calculate_branch_prob (bb_set *annotated_bb)
> >else
> >  total_count += AFDO_EINFO (e)->get_count ();
> >  }
> > -if (num_unknown_succ == 0 && total_count > profile_count::zero ())
> > +if (num_unknown_succ == 0 && total_count > profile_count::zero ().afdo 
> > ())

I think you want nonzero_p() here (it is usual guard to use when
computing probability_in)
> > @@ -651,13 +651,15 @@ ipa_merge_profiles (struct cgraph_node *dst,
> > {
> >   edge srce = EDGE_SUCC (srcbb, i);
> >   edge dste = EDGE_SUCC (dstbb, i);
> > - dste->probability =
> > -   dste->probability * dstbb->count.ipa ().probability_in
> > -(dstbb->count.ipa ()
> > - + srccount.ipa ())
> > -   + srce->probability * srcbb->count.ipa ().probability_in
> > -(dstbb->count.ipa ()
> > - + srccount.ipa ());
> > + profile_count total = dstbb->count.ipa () + srccount.ipa 
> > ();
> > + if (total.nonzero_p ())
> > +   {
> > + dste->probability =
> > +   dste->probability * dstbb->count.ipa 
> > ().probability_in
> > +   
> > (total)
> > +   + srce->probability * srcbb->count.ipa 
> > ().probability_in
> > +   
> > (total);
> > +   }
> > }
> >   dstbb->count = dstbb->count.ipa () + srccount.ipa ();
> > }
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index 09641901ff1..2608c286e5d 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -3335,7 +3335,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree 
> > niters, tree nitersm1,
> >   free (bbs);
> >   free (original_bbs);
> > }
> > - else
> > + else if (old_count.nonzero_p ())
> > scale_loop_profile (epilog, guard_to->count.probability_in 
> > (old_count), -1);

This is fine.  Sorry for late reaction.  I had some travelling in last
two weeks.
Honza
> >
> >   /* Only need to handle basic block before epilog loop if it's not
> > --
> > 2.25.1


Re: [PATCH] Remove .PHONY targets when building .fda files during autoprofiledbootstrap

2023-09-27 Thread Richard Biener
On Tue, Sep 19, 2023 at 8:17 PM Eugene Rozenfeld
 wrote:
>
> These .PHONY targets are always executed and were breaking `make install`
> for autoprofiledbootstrap build.

OK.

> gcc/ChangeLog:
>
> * c/Make-lang.in: Make create_fdas_for_cc1 target not .PHONY
> * cp/Make-lang.in: Make create_fdas_for_cc1plus target not .PHONY
> * lto/Make-lang.in: Make create_fdas_for_lto1 target not .PHONY
>
> Tested on x86_64-pc-linux-gnu.
>
> ---
>  gcc/c/Make-lang.in   | 4 ++--
>  gcc/cp/Make-lang.in  | 4 ++--
>  gcc/lto/Make-lang.in | 4 ++--
>  3 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/c/Make-lang.in b/gcc/c/Make-lang.in
> index 79bc0dfd1cf..3ef8a674971 100644
> --- a/gcc/c/Make-lang.in
> +++ b/gcc/c/Make-lang.in
> @@ -91,8 +91,6 @@ cc1$(exeext): $(C_OBJS) cc1-checksum.o $(BACKEND) $(LIBDEPS)
>  components_in_prev = "bfd opcodes binutils fixincludes gas gcc gmp mpfr mpc 
> isl gold intl ld libbacktrace libcpp libcody libdecnumber libiberty 
> libiberty-linker-plugin libiconv zlib lto-plugin libctf libsframe"
>  components_in_prev_target = "libstdc++-v3 libsanitizer libvtv libgcc 
> libbacktrace libphobos zlib libgomp libatomic"
>
> -.PHONY: create_fdas_for_cc1
> -
>  cc1.fda: create_fdas_for_cc1
> $(PROFILE_MERGER) $(shell ls -ha cc1_*.fda) --output_file cc1.fda 
> -gcov_version 2
>
> @@ -116,6 +114,8 @@ create_fdas_for_cc1: ../stage1-gcc/cc1$(exeext) 
> ../prev-gcc/$(PERF_DATA)
> $(CREATE_GCOV) -binary ../prev-gcc/cc1$(exeext) -gcov 
> $$profile_name -profile $$perf_path -gcov_version 2; \  fi; \
> done;
> +
> +   $(STAMP) $@
>  #
>  # Build hooks:
>
> diff --git a/gcc/cp/Make-lang.in b/gcc/cp/Make-lang.in
> index ba5e8766e99..2727fb7f8cc 100644
> --- a/gcc/cp/Make-lang.in
> +++ b/gcc/cp/Make-lang.in
> @@ -189,8 +189,6 @@ cp/name-lookup.o: $(srcdir)/cp/std-name-hint.h
>  components_in_prev = "bfd opcodes binutils fixincludes gas gcc gmp mpfr mpc 
> isl gold intl ld libbacktrace libcpp libcody libdecnumber libiberty 
> libiberty-linker-plugin libiconv zlib lto-plugin libctf libsframe"
>  components_in_prev_target = "libstdc++-v3 libsanitizer libvtv libgcc 
> libbacktrace libphobos zlib libgomp libatomic"
>
> -.PHONY: create_fdas_for_cc1plus
> -
>  cc1plus.fda: create_fdas_for_cc1plus
> $(PROFILE_MERGER) $(shell ls -ha cc1plus_*.fda) --output_file 
> cc1plus.fda -gcov_version 2
>
> @@ -214,6 +212,8 @@ create_fdas_for_cc1plus: ../stage1-gcc/cc1plus$(exeext) 
> ../prev-gcc/$(PERF_DATA)
> $(CREATE_GCOV) -binary ../prev-gcc/cc1plus$(exeext) -gcov 
> $$profile_name -profile $$perf_path -gcov_version 2; \
>   fi; \
> done;
> +
> +   $(STAMP) $@
>  #
>  # Build hooks:
>
> diff --git a/gcc/lto/Make-lang.in b/gcc/lto/Make-lang.in
> index 98aa9f4cc39..7dc0a9fef42 100644
> --- a/gcc/lto/Make-lang.in
> +++ b/gcc/lto/Make-lang.in
> @@ -108,8 +108,6 @@ lto/lto-dump.o: $(LTO_OBJS)
>  components_in_prev = "bfd opcodes binutils fixincludes gas gcc gmp mpfr mpc 
> isl gold intl ld libbacktrace libcpp libcody libdecnumber libiberty 
> libiberty-linker-plugin libiconv zlib lto-plugin libctf libsframe"
>  components_in_prev_target = "libstdc++-v3 libsanitizer libvtv libgcc 
> libbacktrace libphobos zlib libgomp libatomic"
>
> -.PHONY: create_fdas_for_lto1
> -
>  lto1.fda: create_fdas_for_lto1
> $(PROFILE_MERGER) $(shell ls -ha lto1_*.fda) --output_file lto1.fda 
> -gcov_version 2
>
> @@ -134,6 +132,8 @@ create_fdas_for_lto1: ../stage1-gcc/lto1$(exeext) 
> ../prev-gcc/$(PERF_DATA)
>   fi; \
> done;
>
> +   $(STAMP) $@
> +
>  # LTO testing is done as part of C/C++/Fortran etc. testing.
>  check-lto:
>
> --
> 2.25.1


Re: [PATCH] Fixes for profile count/probability maintenance

2023-09-27 Thread Richard Biener
On Tue, Sep 19, 2023 at 8:17 PM Eugene Rozenfeld
 wrote:
>
> Verifier checks have recently been strengthened to check that
> all counts and probabilities are initialized. The checks fired
> during autoprofiledbootstrap build and this patch fixes it.

OK if Honza doesn't have any comments this week.

Richard.

> gcc/ChangeLog:
>
> * auto-profile.cc (afdo_calculate_branch_prob): Fix count comparisons
> * ipa-utils.cc (ipa_merge_profiles): Guard against zero count when
> computing probabilities
> * tree-vect-loop-manip.cc (vect_do_peeling): Guard against zero count
> when scaling loop profile
>
> Tested on x86_64-pc-linux-gnu.
>
> ---
>  gcc/auto-profile.cc |  4 ++--
>  gcc/ipa-utils.cc| 16 +---
>  gcc/tree-vect-loop-manip.cc |  2 +-
>  3 files changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
> index ff3b763945c..3e61f36c29b 100644
> --- a/gcc/auto-profile.cc
> +++ b/gcc/auto-profile.cc
> @@ -1434,7 +1434,7 @@ afdo_calculate_branch_prob (bb_set *annotated_bb)
>else
>  total_count += AFDO_EINFO (e)->get_count ();
>  }
> -if (num_unknown_succ == 0 && total_count > profile_count::zero ())
> +if (num_unknown_succ == 0 && total_count > profile_count::zero ().afdo 
> ())
>{
> FOR_EACH_EDGE (e, ei, bb->succs)
>   e->probability
> @@ -1571,7 +1571,7 @@ afdo_annotate_cfg (const stmt_set _stmts)
>DECL_SOURCE_LOCATION (current_function_decl));
>afdo_source_profile->mark_annotated (cfun->function_start_locus);
>afdo_source_profile->mark_annotated (cfun->function_end_locus);
> -  if (max_count > profile_count::zero ())
> +  if (max_count > profile_count::zero ().afdo ())
>  {
>/* Calculate, propagate count and probability information on CFG.  */
>afdo_calculate_branch_prob (_bb);
> diff --git a/gcc/ipa-utils.cc b/gcc/ipa-utils.cc
> index 956c6294fd7..3aaf7e595df 100644
> --- a/gcc/ipa-utils.cc
> +++ b/gcc/ipa-utils.cc
> @@ -651,13 +651,15 @@ ipa_merge_profiles (struct cgraph_node *dst,
> {
>   edge srce = EDGE_SUCC (srcbb, i);
>   edge dste = EDGE_SUCC (dstbb, i);
> - dste->probability =
> -   dste->probability * dstbb->count.ipa ().probability_in
> -(dstbb->count.ipa ()
> - + srccount.ipa ())
> -   + srce->probability * srcbb->count.ipa ().probability_in
> -(dstbb->count.ipa ()
> - + srccount.ipa ());
> + profile_count total = dstbb->count.ipa () + srccount.ipa ();
> + if (total.nonzero_p ())
> +   {
> + dste->probability =
> +   dste->probability * dstbb->count.ipa ().probability_in
> +   
> (total)
> +   + srce->probability * srcbb->count.ipa 
> ().probability_in
> +   
> (total);
> +   }
> }
>   dstbb->count = dstbb->count.ipa () + srccount.ipa ();
> }
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 09641901ff1..2608c286e5d 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -3335,7 +3335,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
> tree nitersm1,
>   free (bbs);
>   free (original_bbs);
> }
> - else
> + else if (old_count.nonzero_p ())
> scale_loop_profile (epilog, guard_to->count.probability_in 
> (old_count), -1);
>
>   /* Only need to handle basic block before epilog loop if it's not
> --
> 2.25.1


Re: [PATCH 10/10] vect: Consider vec_perm costing for VMAT_CONTIGUOUS_REVERSE

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> For VMAT_CONTIGUOUS_REVERSE, the transform code in function
> vectorizable_store generates a VEC_PERM_EXPR stmt before
> storing, but it's never considered in costing.
>
> This patch is to make it consider vec_perm in costing, it
> adjusts the order of transform code a bit to make it easy
> to early return for costing_p.

OK.

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_store): Consider generated
> VEC_PERM_EXPR stmt for VMAT_CONTIGUOUS_REVERSE in costing as
> vec_perm.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/costmodel/ppc/costmodel-vect-store-2.c: New test.
> ---
>  .../costmodel/ppc/costmodel-vect-store-2.c| 29 +
>  gcc/tree-vect-stmts.cc| 63 +++
>  2 files changed, 65 insertions(+), 27 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-2.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-2.c 
> b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-2.c
> new file mode 100644
> index 000..72b67cf9040
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-2.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +/* { dg-additional-options "-mvsx" } */
> +
> +/* Verify we do cost the required vec_perm.  */
> +
> +int
> +foo (int *a, int *b, int len)
> +{
> +  int i;
> +  int *a1 = a;
> +  int *a0 = a1 - 4;
> +  for (i = 0; i < len; i++)
> +{
> +  *b = *a0 + *a1;
> +  b--;
> +  a0++;
> +  a1++;
> +}
> +  return 0;
> +}
> +
> +/* The reason why it doesn't check the exact count is that
> +   we can get more than 1 vec_perm when it's compiled with
> +   partial vector capability like Power10 (retrying for
> +   the epilogue) or it's complied without unaligned vector
> +   memory access support (realign).  */
> +/* { dg-final { scan-tree-dump {\mvec_perm\M} "vect" } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3d451c80bca..ce925cc1d53 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -9279,6 +9279,40 @@ vectorizable_store (vec_info *vinfo,
>stmt_vec_info next_stmt_info = first_stmt_info;
>for (i = 0; i < vec_num; i++)
> {
> + if (!costing_p)
> +   {
> + if (slp)
> +   vec_oprnd = vec_oprnds[i];
> + else if (grouped_store)
> +   /* For grouped stores vectorized defs are interleaved in
> +  vect_permute_store_chain().  */
> +   vec_oprnd = result_chain[i];
> +   }
> +
> + if (memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> +   {
> + if (costing_p)
> +   inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
> +stmt_info, 0, vect_body);
> + else
> +   {
> + tree perm_mask = perm_mask_for_reverse (vectype);
> + tree perm_dest = vect_create_destination_var (
> +   vect_get_store_rhs (stmt_info), vectype);
> + tree new_temp = make_ssa_name (perm_dest);
> +
> + /* Generate the permute statement.  */
> + gimple *perm_stmt
> +   = gimple_build_assign (new_temp, VEC_PERM_EXPR, vec_oprnd,
> +  vec_oprnd, perm_mask);
> + vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
> +  gsi);
> +
> + perm_stmt = SSA_NAME_DEF_STMT (new_temp);
> + vec_oprnd = new_temp;
> +   }
> +   }
> +
>   if (costing_p)
> {
>   vect_get_store_cost (vinfo, stmt_info, 1,
> @@ -9294,8 +9328,6 @@ vectorizable_store (vec_info *vinfo,
>
>   continue;
> }
> - unsigned misalign;
> - unsigned HOST_WIDE_INT align;
>
>   tree final_mask = NULL_TREE;
>   tree final_len = NULL_TREE;
> @@ -9315,13 +9347,8 @@ vectorizable_store (vec_info *vinfo,
> dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
>stmt_info, bump);
>
> - if (slp)
> -   vec_oprnd = vec_oprnds[i];
> - else if (grouped_store)
> -   /* For grouped stores vectorized defs are interleaved in
> -  vect_permute_store_chain().  */
> -   vec_oprnd = result_chain[i];
> -
> + unsigned misalign;
> + unsigned HOST_WIDE_INT align;
>   align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
>   if (alignment_support_scheme == dr_aligned)
> misalign = 0;
> @@ -9338,24 +9365,6 @@ vectorizable_store 

Re: [PATCH 09/10] vect: Get rid of vect_model_store_cost

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> This patch is to eventually get rid of vect_model_store_cost,
> it adjusts the costing for the remaining memory access types
> VMAT_CONTIGUOUS{, _DOWN, _REVERSE} by moving costing close
> to the transform code.  Note that in vect_model_store_cost,
> there is one special handling for vectorizing a store into
> the function result, since it's extra penalty and the
> transform part doesn't have it, this patch keep it alone.

OK.

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vect_model_store_cost): Remove.
> (vectorizable_store): Adjust the costing for the remaining memory
> access types VMAT_CONTIGUOUS{, _DOWN, _REVERSE}.
> ---
>  gcc/tree-vect-stmts.cc | 137 +
>  1 file changed, 44 insertions(+), 93 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index e3ba8077091..3d451c80bca 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -951,81 +951,6 @@ cfun_returns (tree decl)
>return false;
>  }
>
> -/* Function vect_model_store_cost
> -
> -   Models cost for stores.  In the case of grouped accesses, one access
> -   has the overhead of the grouped access attributed to it.  */
> -
> -static void
> -vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
> -  vect_memory_access_type memory_access_type,
> -  dr_alignment_support alignment_support_scheme,
> -  int misalignment,
> -  vec_load_store_type vls_type, slp_tree slp_node,
> -  stmt_vector_for_cost *cost_vec)
> -{
> -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> - && memory_access_type != VMAT_ELEMENTWISE
> - && memory_access_type != VMAT_STRIDED_SLP
> - && memory_access_type != VMAT_LOAD_STORE_LANES
> - && memory_access_type != VMAT_CONTIGUOUS_PERMUTE);
> -
> -  unsigned int inside_cost = 0, prologue_cost = 0;
> -
> -  /* ???  Somehow we need to fix this at the callers.  */
> -  if (slp_node)
> -ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> -
> -  if (vls_type == VLS_STORE_INVARIANT)
> -{
> -  if (!slp_node)
> -   prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
> -  stmt_info, 0, vect_prologue);
> -}
> -
> -
> -  /* Costs of the stores.  */
> -  vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> -  misalignment, &inside_cost, cost_vec);
> -
> -  /* When vectorizing a store into the function result assign
> - a penalty if the function returns in a multi-register location.
> - In this case we assume we'll end up with having to spill the
> - vector result and do piecewise loads as a conservative estimate.  */
> -  tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
> -  if (base
> -  && (TREE_CODE (base) == RESULT_DECL
> - || (DECL_P (base) && cfun_returns (base)))
> -  && !aggregate_value_p (base, cfun->decl))
> -{
> -  rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
> -  /* ???  Handle PARALLEL in some way.  */
> -  if (REG_P (reg))
> -   {
> - int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
> - /* Assume that a single reg-reg move is possible and cheap,
> -do not account for vector to gp register move cost.  */
> - if (nregs > 1)
> -   {
> - /* Spill.  */
> - prologue_cost += record_stmt_cost (cost_vec, ncopies,
> -vector_store,
> -stmt_info, 0, vect_epilogue);
> - /* Loads.  */
> - prologue_cost += record_stmt_cost (cost_vec, ncopies * nregs,
> -scalar_load,
> -stmt_info, 0, vect_epilogue);
> -   }
> -   }
> -}
> -
> -  if (dump_enabled_p ())
> -dump_printf_loc (MSG_NOTE, vect_location,
> - "vect_model_store_cost: inside_cost = %d, "
> - "prologue_cost = %d .\n", inside_cost, prologue_cost);
> -}
> -
> -
>  /* Calculate cost of DR's memory access.  */
>  void
>  vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
> @@ -9223,6 +9148,11 @@ vectorizable_store (vec_info *vinfo,
>return true;
>  }
>
> +  gcc_assert (memory_access_type == VMAT_CONTIGUOUS
> + || memory_access_type == VMAT_CONTIGUOUS_DOWN
> + || memory_access_type == VMAT_CONTIGUOUS_PERMUTE
> + || memory_access_type == VMAT_CONTIGUOUS_REVERSE);
> +
>unsigned inside_cost = 0, prologue_cost = 0;
>auto_vec<tree> result_chain (group_size);
>auto_vec<tree> vec_oprnds;
> @@ -9257,10 +9187,9 @@ vectorizable_store (vec_info *vinfo,
>

Re: [PATCH 07/10] vect: Adjust vectorizable_store costing on VMAT_CONTIGUOUS_PERMUTE

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> This patch adjusts the cost handling on VMAT_CONTIGUOUS_PERMUTE
> in function vectorizable_store.  We don't call function
> vect_model_store_cost for it any more.  It's the case of
> interleaving stores, so it skips all stmts excepting for
> first_stmt_info, consider the whole group when costing
> first_stmt_info.  This patch shouldn't have any functional
> changes.

OK.

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vect_model_store_cost): Assert it will never
> get VMAT_CONTIGUOUS_PERMUTE and remove VMAT_CONTIGUOUS_PERMUTE related
> handlings.
> (vectorizable_store): Adjust the cost handling on
> VMAT_CONTIGUOUS_PERMUTE without calling vect_model_store_cost.
> ---
>  gcc/tree-vect-stmts.cc | 128 -
>  1 file changed, 74 insertions(+), 54 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index fbd16b8a487..e3ba8077091 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -967,10 +967,10 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info 
> stmt_info, int ncopies,
>gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
>   && memory_access_type != VMAT_ELEMENTWISE
>   && memory_access_type != VMAT_STRIDED_SLP
> - && memory_access_type != VMAT_LOAD_STORE_LANES);
> + && memory_access_type != VMAT_LOAD_STORE_LANES
> + && memory_access_type != VMAT_CONTIGUOUS_PERMUTE);
> +
>unsigned int inside_cost = 0, prologue_cost = 0;
> -  stmt_vec_info first_stmt_info = stmt_info;
> -  bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
>
>/* ???  Somehow we need to fix this at the callers.  */
>if (slp_node)
> @@ -983,35 +983,6 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info 
> stmt_info, int ncopies,
>stmt_info, 0, vect_prologue);
>  }
>
> -  /* Grouped stores update all elements in the group at once,
> - so we want the DR for the first statement.  */
> -  if (!slp_node && grouped_access_p)
> -first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
> -
> -  /* True if we should include any once-per-group costs as well as
> - the cost of the statement itself.  For SLP we only get called
> - once per group anyhow.  */
> -  bool first_stmt_p = (first_stmt_info == stmt_info);
> -
> -  /* We assume that the cost of a single store-lanes instruction is
> - equivalent to the cost of DR_GROUP_SIZE separate stores.  If a grouped
> - access is instead being provided by a permute-and-store operation,
> - include the cost of the permutes.  */
> -  if (first_stmt_p
> -  && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
> -{
> -  /* Uses a high and low interleave or shuffle operations for each
> -needed permute.  */
> -  int group_size = DR_GROUP_SIZE (first_stmt_info);
> -  int nstmts = ncopies * ceil_log2 (group_size) * group_size;
> -  inside_cost = record_stmt_cost (cost_vec, nstmts, vec_perm,
> - stmt_info, 0, vect_body);
> -
> -  if (dump_enabled_p ())
> -dump_printf_loc (MSG_NOTE, vect_location,
> - "vect_model_store_cost: strided group_size = %d 
> .\n",
> - group_size);
> -}
>
>/* Costs of the stores.  */
>vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> @@ -8408,9 +8379,7 @@ vectorizable_store (vec_info *vinfo,
>  costing, use the first one instead.  */
>if (grouped_store
>   && !slp
> - && first_stmt_info != stmt_info
> - && (memory_access_type == VMAT_ELEMENTWISE
> - || memory_access_type == VMAT_LOAD_STORE_LANES))
> + && first_stmt_info != stmt_info)
> return true;
>  }
>gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE 
> (stmt_info));
> @@ -9254,14 +9223,15 @@ vectorizable_store (vec_info *vinfo,
>return true;
>  }
>
> +  unsigned inside_cost = 0, prologue_cost = 0;
>auto_vec<tree> result_chain (group_size);
>auto_vec<tree> vec_oprnds;
>for (j = 0; j < ncopies; j++)
>  {
>gimple *new_stmt;
> -  if (j == 0 && !costing_p)
> +  if (j == 0)
> {
> - if (slp)
> + if (slp && !costing_p)
> {
>   /* Get vectorized arguments for SLP_NODE.  */
>   vect_get_vec_defs (vinfo, stmt_info, slp_node, 1, op,
> @@ -9287,13 +9257,20 @@ vectorizable_store (vec_info *vinfo,
>  that there is no interleaving, DR_GROUP_SIZE is 1,
>  and only one iteration of the loop will be executed.  */
>   op = vect_get_store_rhs (next_stmt_info);
> - vect_get_vec_defs_for_operand (vinfo, next_stmt_info, 
> ncopies,
> -op, gvec_oprnds[i]);
> -

Re: [PATCH 06/10] vect: Adjust vectorizable_store costing on VMAT_LOAD_STORE_LANES

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> This patch adjusts the cost handling on VMAT_LOAD_STORE_LANES
> in function vectorizable_store.  We don't call function
> vect_model_store_cost for it any more.  It's the case of
> interleaving stores, so it skips all stmts excepting for
> first_stmt_info, consider the whole group when costing
> first_stmt_info.  This patch shouldn't have any functional
> changes.

OK.

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vect_model_store_cost): Assert it will never
> get VMAT_LOAD_STORE_LANES.
> (vectorizable_store): Adjust the cost handling on 
> VMAT_LOAD_STORE_LANES
> without calling vect_model_store_cost.  Factor out new lambda function
> update_prologue_cost.
> ---
>  gcc/tree-vect-stmts.cc | 110 -
>  1 file changed, 75 insertions(+), 35 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3d01168080a..fbd16b8a487 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -966,7 +966,8 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info 
> stmt_info, int ncopies,
>  {
>gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
>   && memory_access_type != VMAT_ELEMENTWISE
> - && memory_access_type != VMAT_STRIDED_SLP);
> + && memory_access_type != VMAT_STRIDED_SLP
> + && memory_access_type != VMAT_LOAD_STORE_LANES);
>unsigned int inside_cost = 0, prologue_cost = 0;
>stmt_vec_info first_stmt_info = stmt_info;
>bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
> @@ -8408,7 +8409,8 @@ vectorizable_store (vec_info *vinfo,
>if (grouped_store
>   && !slp
>   && first_stmt_info != stmt_info
> - && memory_access_type == VMAT_ELEMENTWISE)
> + && (memory_access_type == VMAT_ELEMENTWISE
> + || memory_access_type == VMAT_LOAD_STORE_LANES))
> return true;
>  }
>gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE 
> (stmt_info));
> @@ -8479,6 +8481,31 @@ vectorizable_store (vec_info *vinfo,
>  dump_printf_loc (MSG_NOTE, vect_location, "transform store. ncopies = 
> %d\n",
>  ncopies);
>
> +  /* Check if we need to update prologue cost for invariant,
> + and update it accordingly if so.  If it's not for
> + interleaving store, we can just check vls_type; but if
> + it's for interleaving store, need to check the def_type
> + of the stored value since the current vls_type is just
> + for first_stmt_info.  */
> +  auto update_prologue_cost = [&](unsigned *prologue_cost, tree store_rhs)
> +  {
> +gcc_assert (costing_p);
> +if (slp)
> +  return;
> +if (grouped_store)
> +  {
> +   gcc_assert (store_rhs);
> +   enum vect_def_type cdt;
> +   gcc_assert (vect_is_simple_use (store_rhs, vinfo, &cdt));
> +   if (cdt != vect_constant_def && cdt != vect_external_def)
> + return;
> +  }
> +else if (vls_type != VLS_STORE_INVARIANT)
> +  return;
> +*prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, 
> stmt_info,
> +   0, vect_prologue);
> +  };
> +
>if (memory_access_type == VMAT_ELEMENTWISE
>|| memory_access_type == VMAT_STRIDED_SLP)
>  {
> @@ -8646,14 +8673,8 @@ vectorizable_store (vec_info *vinfo,
>   if (!costing_p)
> vect_get_vec_defs (vinfo, next_stmt_info, slp_node, ncopies, op,
>&vec_oprnds);
> - else if (!slp)
> -   {
> - enum vect_def_type cdt;
> - gcc_assert (vect_is_simple_use (op, vinfo, &cdt));
> - if (cdt == vect_constant_def || cdt == vect_external_def)
> -   prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
> -  stmt_info, 0, 
> vect_prologue);
> -   }
> + else
> +   update_prologue_cost (&prologue_cost, op);
>   unsigned int group_el = 0;
>   unsigned HOST_WIDE_INT
> elsz = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
> @@ -8857,13 +8878,7 @@ vectorizable_store (vec_info *vinfo,
>if (memory_access_type == VMAT_LOAD_STORE_LANES)
>  {
>gcc_assert (!slp && grouped_store);
> -  if (costing_p)
> -   {
> - vect_model_store_cost (vinfo, stmt_info, ncopies, 
> memory_access_type,
> -alignment_support_scheme, misalignment,
> -vls_type, slp_node, cost_vec);
> - return true;
> -   }
> +  unsigned inside_cost = 0, prologue_cost = 0;
>for (j = 0; j < ncopies; j++)
> {
>   gimple *new_stmt;
> @@ -8879,29 +8894,39 @@ vectorizable_store (vec_info *vinfo,
>  DR_GROUP_SIZE is the exact number of stmts in the
>  chain. Therefore, NEXT_STMT_INFO can't be 

Re: [PATCH 05/10] vect: Adjust vectorizable_store costing on VMAT_ELEMENTWISE and VMAT_STRIDED_SLP

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> This patch adjusts the cost handling on VMAT_ELEMENTWISE
> and VMAT_STRIDED_SLP in function vectorizable_store.  We
> don't call function vect_model_store_cost for them any more.
>
> Like what we improved for PR82255 on load side, this change
> helps us to get rid of unnecessary vec_to_scalar costing
> for some case with VMAT_STRIDED_SLP.  One typical test case
> gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c has been
> associated.  And it helps some cases with some inconsistent
> costing too.
>
> Besides, this also special-cases the interleaving stores
> for these two affected memory access types, since for the
> interleaving stores the whole chain is vectorized when the
> last store in the chain is reached, the other stores in the
> group would be skipped.  To keep consistent with this and
> follows the transforming handlings like iterating the whole
> group, it only costs for the first store in the group.
> Ideally we can only cost for the last one but it's not
> trivial and using the first one is actually equivalent.

OK

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vect_model_store_cost): Assert it won't get
> VMAT_ELEMENTWISE and VMAT_STRIDED_SLP any more, and remove their
> related handlings.
> (vectorizable_store): Adjust the cost handling on VMAT_ELEMENTWISE
> and VMAT_STRIDED_SLP without calling vect_model_store_cost.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c: New test.
> ---
>  .../costmodel/ppc/costmodel-vect-store-1.c|  23 +++
>  gcc/tree-vect-stmts.cc| 160 +++---
>  2 files changed, 120 insertions(+), 63 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c 
> b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
> new file mode 100644
> index 000..ab5f3301492
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/ppc/costmodel-vect-store-1.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" }
> +
> +/* This test case is partially extracted from case
> +   gcc.dg/vect/vect-avg-16.c, it's to verify we don't
> +   cost a store with vec_to_scalar when we shouldn't.  */
> +
> +void
> +test (signed char *restrict a, signed char *restrict b, signed char 
> *restrict c,
> +  int n)
> +{
> +  for (int j = 0; j < n; ++j)
> +{
> +  for (int i = 0; i < 16; ++i)
> +   a[i] = (b[i] + c[i]) >> 1;
> +  a += 20;
> +  b += 20;
> +  c += 20;
> +}
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vec_to_scalar" 0 "vect" } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 048c14d291c..3d01168080a 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -964,7 +964,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info 
> stmt_info, int ncopies,
>vec_load_store_type vls_type, slp_tree slp_node,
>stmt_vector_for_cost *cost_vec)
>  {
> -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> + && memory_access_type != VMAT_ELEMENTWISE
> + && memory_access_type != VMAT_STRIDED_SLP);
>unsigned int inside_cost = 0, prologue_cost = 0;
>stmt_vec_info first_stmt_info = stmt_info;
>bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
> @@ -1010,29 +1012,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info 
> stmt_info, int ncopies,
>   group_size);
>  }
>
> -  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>/* Costs of the stores.  */
> -  if (memory_access_type == VMAT_ELEMENTWISE)
> -{
> -  unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> -  /* N scalar stores plus extracting the elements.  */
> -  inside_cost += record_stmt_cost (cost_vec,
> -  ncopies * assumed_nunits,
> -  scalar_store, stmt_info, 0, vect_body);
> -}
> -  else
> -vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> -misalignment, &inside_cost, cost_vec);
> -
> -  if (memory_access_type == VMAT_ELEMENTWISE
> -  || memory_access_type == VMAT_STRIDED_SLP)
> -{
> -  /* N scalar stores plus extracting the elements.  */
> -  unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> -  inside_cost += record_stmt_cost (cost_vec,
> -  ncopies * assumed_nunits,
> -  vec_to_scalar, stmt_info, 0, 
> vect_body);
> -}
> +  vect_get_store_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
> +  

Re: [PATCH 04/10] vect: Simplify costing on vectorizable_scan_store

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> This patch is to simplify the costing on the case
> vectorizable_scan_store without calling function
> vect_model_store_cost any more.
>
> I considered if moving the costing into function
> vectorizable_scan_store is a good idea, for doing
> that, we have to pass several variables down which
> are only used for costing, and for now we just
> want to keep the costing as the previous, haven't
> tried to make this costing consistent with what the
> transforming does, so I think we can leave it for now.

OK

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_store): Adjust costing on
> vectorizable_scan_store without calling vect_model_store_cost
> any more.
> ---
>  gcc/tree-vect-stmts.cc | 18 +++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 3f908242fee..048c14d291c 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8432,11 +8432,23 @@ vectorizable_store (vec_info *vinfo,
>else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
>  {
>gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
> +  gcc_assert (!slp);
>if (costing_p)
> {
> - vect_model_store_cost (vinfo, stmt_info, ncopies, 
> memory_access_type,
> -alignment_support_scheme, misalignment,
> -vls_type, slp_node, cost_vec);
> + unsigned int inside_cost = 0, prologue_cost = 0;
> + if (vls_type == VLS_STORE_INVARIANT)
> +   prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
> +  stmt_info, 0, vect_prologue);
> + vect_get_store_cost (vinfo, stmt_info, ncopies,
> +  alignment_support_scheme, misalignment,
> +  &inside_cost, cost_vec);
> +
> + if (dump_enabled_p ())
> +   dump_printf_loc (MSG_NOTE, vect_location,
> +"vect_model_store_cost: inside_cost = %d, "
> +"prologue_cost = %d .\n",
> +inside_cost, prologue_cost);
> +
>   return true;
> }
>return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, 
> ncopies);
> --
> 2.31.1
>


Re: [PATCH 03/10] vect: Adjust vectorizable_store costing on VMAT_GATHER_SCATTER

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> This patch adjusts the cost handling on VMAT_GATHER_SCATTER
> in function vectorizable_store (all three cases), then we
> won't depend on vect_model_load_store for its costing any
> more.  This patch shouldn't have any functional changes.

OK.

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vect_model_store_cost): Assert it won't get
> VMAT_GATHER_SCATTER any more, remove VMAT_GATHER_SCATTER related
> handlings and the related parameter gs_info.
> (vect_build_scatter_store_calls): Add the handlings on costing with
> one more argument cost_vec.
> (vectorizable_store): Adjust the cost handling on VMAT_GATHER_SCATTER
> without calling vect_model_store_cost any more.
> ---
>  gcc/tree-vect-stmts.cc | 188 ++---
>  1 file changed, 118 insertions(+), 70 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 36f7c5b9f4b..3f908242fee 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -959,12 +959,12 @@ cfun_returns (tree decl)
>  static void
>  vect_model_store_cost (vec_info *vinfo, stmt_vec_info stmt_info, int ncopies,
>vect_memory_access_type memory_access_type,
> -  gather_scatter_info *gs_info,
>dr_alignment_support alignment_support_scheme,
>int misalignment,
>vec_load_store_type vls_type, slp_tree slp_node,
>stmt_vector_for_cost *cost_vec)
>  {
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
>unsigned int inside_cost = 0, prologue_cost = 0;
>stmt_vec_info first_stmt_info = stmt_info;
>bool grouped_access_p = STMT_VINFO_GROUPED_ACCESS (stmt_info);
> @@ -1012,18 +1012,9 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info 
> stmt_info, int ncopies,
>
>tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>/* Costs of the stores.  */
> -  if (memory_access_type == VMAT_ELEMENTWISE
> -  || memory_access_type == VMAT_GATHER_SCATTER)
> +  if (memory_access_type == VMAT_ELEMENTWISE)
>  {
>unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> -  if (memory_access_type == VMAT_GATHER_SCATTER
> - && gs_info->ifn == IFN_LAST && !gs_info->decl)
> -   /* For emulated scatter N offset vector element extracts
> -  (we assume the scalar scaling and ptr + offset add is consumed by
> -  the load).  */
> -   inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits,
> -vec_to_scalar, stmt_info, 0,
> -vect_body);
>/* N scalar stores plus extracting the elements.  */
>inside_cost += record_stmt_cost (cost_vec,
>ncopies * assumed_nunits,
> @@ -1034,9 +1025,7 @@ vect_model_store_cost (vec_info *vinfo, stmt_vec_info 
> stmt_info, int ncopies,
>  misalignment, &inside_cost, cost_vec);
>
>if (memory_access_type == VMAT_ELEMENTWISE
> -  || memory_access_type == VMAT_STRIDED_SLP
> -  || (memory_access_type == VMAT_GATHER_SCATTER
> - && gs_info->ifn == IFN_LAST && !gs_info->decl))
> +  || memory_access_type == VMAT_STRIDED_SLP)
>  {
>/* N scalar stores plus extracting the elements.  */
>unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
> @@ -2999,7 +2988,8 @@ vect_build_gather_load_calls (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>  static void
>  vect_build_scatter_store_calls (vec_info *vinfo, stmt_vec_info stmt_info,
> gimple_stmt_iterator *gsi, gimple **vec_stmt,
> -   gather_scatter_info *gs_info, tree mask)
> +   gather_scatter_info *gs_info, tree mask,
> +   stmt_vector_for_cost *cost_vec)
>  {
>loop_vec_info loop_vinfo = dyn_cast (vinfo);
>tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> @@ -3009,6 +2999,30 @@ vect_build_scatter_store_calls (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>poly_uint64 scatter_off_nunits
>  = TYPE_VECTOR_SUBPARTS (gs_info->offset_vectype);
>
> +  /* FIXME: Keep the previous costing way in vect_model_store_cost by
> + costing N scalar stores, but it should be tweaked to use target
> + specific costs on related scatter store calls.  */
> +  if (cost_vec)
> +{
> +  tree op = vect_get_store_rhs (stmt_info);
> +  enum vect_def_type dt;
> +  gcc_assert (vect_is_simple_use (op, vinfo, &dt));
> +  unsigned int inside_cost, prologue_cost = 0;
> +  if (dt == vect_constant_def || dt == vect_external_def)
> +   prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
> +  stmt_info, 0, vect_prologue);
> +  unsigned int assumed_nunits = vect_nunits_for_cost 

Re: [PATCH 02/10] vect: Move vect_model_store_cost next to the transform in vectorizable_store

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> This patch is an initial patch to move costing next to the
> transform, it still adopts vect_model_store_cost for costing
> but moves and duplicates it down according to the handlings
> of different vect_memory_access_types or some special
> handling need, hope it can make the subsequent patches easy
> to review.  This patch should not have any functional
> changes.

OK

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_store): Move and duplicate the call
> to vect_model_store_cost down to some different transform paths
> according to the handlings of different vect_memory_access_types
> or some special handling need.
> ---
>  gcc/tree-vect-stmts.cc | 79 --
>  1 file changed, 60 insertions(+), 19 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index a5caaf0bca2..36f7c5b9f4b 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8372,7 +8372,8 @@ vectorizable_store (vec_info *vinfo,
> return false;
>  }
>
> -  if (!vec_stmt) /* transformation not required.  */
> +  bool costing_p = !vec_stmt;
> +  if (costing_p) /* transformation not required.  */
>  {
>STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
>
> @@ -8401,11 +8402,6 @@ vectorizable_store (vec_info *vinfo,
>  "Vectorizing an unaligned access.\n");
>
>STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
> -  vect_model_store_cost (vinfo, stmt_info, ncopies,
> -memory_access_type, &gs_info,
> -alignment_support_scheme,
> -misalignment, vls_type, slp_node, cost_vec);
> -  return true;
>  }
>gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE 
> (stmt_info));
>
> @@ -8415,12 +8411,27 @@ vectorizable_store (vec_info *vinfo,
>
>if (memory_access_type == VMAT_GATHER_SCATTER && gs_info.decl)
>  {
> -  vect_build_scatter_store_calls (vinfo, stmt_info, gsi, vec_stmt,
> - &gs_info, mask);
> +  if (costing_p)
> +   vect_model_store_cost (vinfo, stmt_info, ncopies, memory_access_type,
> +  &gs_info, alignment_support_scheme, 
> misalignment,
> +  vls_type, slp_node, cost_vec);
> +  else
> +   vect_build_scatter_store_calls (vinfo, stmt_info, gsi, vec_stmt,
> +   &gs_info, mask);
>return true;
>  }
>else if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) >= 3)
> -return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, 
> ncopies);
> +{
> +  gcc_assert (memory_access_type == VMAT_CONTIGUOUS);
> +  if (costing_p)
> +   {
> + vect_model_store_cost (vinfo, stmt_info, ncopies, 
> memory_access_type,
> +&gs_info, alignment_support_scheme,
> +misalignment, vls_type, slp_node, cost_vec);
> + return true;
> +   }
> +  return vectorizable_scan_store (vinfo, stmt_info, gsi, vec_stmt, 
> ncopies);
> +}
>
>if (grouped_store)
>  {
> @@ -8449,13 +8460,21 @@ vectorizable_store (vec_info *vinfo,
>else
>  ref_type = reference_alias_ptr_type (DR_REF (first_dr_info->dr));
>
> -  if (dump_enabled_p ())
> -dump_printf_loc (MSG_NOTE, vect_location,
> - "transform store. ncopies = %d\n", ncopies);
> +  if (!costing_p && dump_enabled_p ())
> +dump_printf_loc (MSG_NOTE, vect_location, "transform store. ncopies = 
> %d\n",
> +ncopies);
>
>if (memory_access_type == VMAT_ELEMENTWISE
>|| memory_access_type == VMAT_STRIDED_SLP)
>  {
> +  if (costing_p)
> +   {
> + vect_model_store_cost (vinfo, stmt_info, ncopies, 
> memory_access_type,
> +&gs_info, alignment_support_scheme,
> +misalignment, vls_type, slp_node, cost_vec);
> + return true;
> +   }
> +
>gimple_stmt_iterator incr_gsi;
>bool insert_after;
>gimple *incr;
> @@ -8718,8 +8737,9 @@ vectorizable_store (vec_info *vinfo,
>else if (memory_access_type == VMAT_GATHER_SCATTER)
>  {
>aggr_type = elem_type;
> -  vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> -  &bump, &vec_offset, loop_lens);
> +  if (!costing_p)
> +   vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info,
> +&bump, &vec_offset, loop_lens);
>  }
>else
>  {
> @@ -8731,7 +8751,7 @@ vectorizable_store (vec_info *vinfo,
>   memory_access_type, loop_lens);
>  }
>
> -  if (mask)
> +  if (mask && !costing_p)
>  LOOP_VINFO_HAS_MASK_STORE (loop_vinfo) = true;
>
>/* In case the 

Re: [PATCH 01/10] vect: Ensure vect store is supported for some VMAT_ELEMENTWISE case

2023-09-27 Thread Richard Biener
On Thu, Sep 14, 2023 at 5:12 AM Kewen Lin  wrote:
>
> When making/testing patches to move costing next to the
> transform code for vectorizable_store, some ICEs got
> exposed when I further refined the costing handlings on
> VMAT_ELEMENTWISE.  The apparent cause is triggering the
> assertion in rs6000 specific function for costing
> rs6000_builtin_vectorization_cost:
>
>   if (TARGET_ALTIVEC)
>  /* Misaligned stores are not supported.  */
>  gcc_unreachable ();
>
> I used vect_get_store_cost instead of the original way by
> record_stmt_cost with scalar_store for costing, that is to
> use one unaligned_store instead, it matches what we use in
> transforming, it's a vector store as below:
>
>   else if (group_size >= const_nunits
>&& group_size % const_nunits == 0)
> {
>nstores = 1;
>lnel = const_nunits;
>ltype = vectype;
>lvectype = vectype;
> }
>
> So IMHO it's more consistent with vector store instead of
> scalar store, with the given compilation option
> -mno-allow-movmisalign, the misaligned vector store is
> unexpected to be used in vectorizer, but why it's still
> adopted?  In the current implementation of function
> get_group_load_store_type, we always set alignment support
> scheme as dr_unaligned_supported for VMAT_ELEMENTWISE, it
> is true if we always adopt scalar stores, but as the above
> code shows, we could use vector stores for some cases, so
> we should use the correct alignment support scheme for it.
>
> This patch is to ensure the vector store is supported by
> further checking with vect_supportable_dr_alignment.  The
> ICEs got exposed with patches moving costing next to the
> transform but they haven't been landed, the test coverage
> would be there once they get landed.  The affected test
> cases are:
>   - gcc.dg/vect/slp-45.c
>   - gcc.dg/vect/vect-alias-check-{10,11,12}.c
>
> btw, I tried to make some correctness test case, but I
> realized that -mno-allow-movmisalign is mainly for noting
> movmisalign optab and it doesn't guard for the actual hw
> vector memory access insns, so I failed to make it unless
> I also altered some conditions for them as it.

OK.

> gcc/ChangeLog:
>
> * tree-vect-stmts.cc (vectorizable_store): Ensure the generated
> vector store for some case of VMAT_ELEMENTWISE is supported.
> ---
>  gcc/tree-vect-stmts.cc | 16 
>  1 file changed, 12 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index cd7c1090d88..a5caaf0bca2 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -8558,10 +8558,18 @@ vectorizable_store (vec_info *vinfo,
>   else if (group_size >= const_nunits
>&& group_size % const_nunits == 0)
> {
> - nstores = 1;
> - lnel = const_nunits;
> - ltype = vectype;
> - lvectype = vectype;
> + int mis_align = dr_misalignment (first_dr_info, vectype);
> + dr_alignment_support dr_align
> +   = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
> +mis_align);
> + if (dr_align == dr_aligned
> + || dr_align == dr_unaligned_supported)
> +   {
> + nstores = 1;
> + lnel = const_nunits;
> + ltype = vectype;
> + lvectype = vectype;
> +   }
> }
>   ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
>   ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
> --
> 2.31.1
>


Re: [PATCH] vec.h, v2: Make some ops work with non-trivially copy constructible and/or destructible types

2023-09-27 Thread Richard Biener
On Wed, 27 Sep 2023, Jakub Jelinek wrote:

> On Wed, Sep 27, 2023 at 07:17:22AM +, Richard Biener wrote:
> > OK I guess.  Can you summarize the limitations for non-POD types
> > in the big comment at the start of vec.h?
> 
> Still haven't done that, but will do after we flesh out the details
> below.
> 
> > (can we put in static_asserts
> > in the places that obviously do not work?)
> 
> I've tried to do this though, I think the static_asserts will allow
> making sure we only use what is supportable and will serve better than
> any kind of comment.
> 
> But, I've run into quite a few triggered assertion failures with that, and
> the question is what we want to do with them.  Applying
> --- gcc/vec.h.jj  2023-09-27 12:11:56.0 +0200
> +++ gcc/vec.h 2023-09-27 12:39:50.971613964 +0200
> @@ -1160,7 +1160,7 @@ template
>  inline void
>  vec::qsort (int (*cmp) (const void *, const void *))
>  {
> -  static_assert (std::is_trivially_copyable ::value, "");
> +//  static_assert (std::is_trivially_copyable ::value, "");
>if (length () > 1)
>  gcc_qsort (address (), length (), sizeof (T), cmp);
>  }
> @@ -1359,7 +1359,7 @@ inline void
>  vec::quick_grow (unsigned len)
>  {
>gcc_checking_assert (length () <= len && len <= m_vecpfx.m_alloc);
> -  static_assert (std::is_trivially_default_constructible ::value, "");
> +//  static_assert (std::is_trivially_default_constructible ::value, "");
>m_vecpfx.m_num = len;
>  }
>  
> incremental patch makes stuff work (at least make in gcc subdir for
> x86_64-linux with --enable-languages=c,c++,fortran,lto succeeds), so it is 
> just
> those 2 asserts.
> Following is full list of failures and discusses details.
> 
> dbgcnt.cc:132  limits[index].qsort (cmp_tuples);
> T = std::pair
> where std::pair is not trivially copyable.  Our qsort implementation uses
> memcpys/memmoves to reshuffle the array elements (as it isn't inlined and
> so can't use std::swap and the like), so I think we need the types trivially
> copyable (or at least trivially copy assignable from the same type or
> something similar if we close all eyes).  Is there some std::pair
> alternative which is trivially copyable, or do we need to define structures
> to achieve that?  Or just close all eyes and allow qsort/sort/stablesort
> either on trivially copyable types, or on std::pair where both template
> parameters are trivially copyable?
> 
> genrecog.cc:3466  candidates.qsort (subroutine_candidate_cmp);
> T = std::pair
> ditto
> 
> dwarf2asm.cc:1061  temp.qsort (compare_strings);
> T = std::pair
> ditto
> 
> tree-ssa-dce.cc:1776  args.qsort (sort_phi_args);
> T = std::pair
> ditto
> 
> tree-ssa-loop-manip.cc:376  names.qsort (loop_name_cmp);
> T = std::pair
> ditto
> 
> omp-oacc-neuter-broadcast.cc:1730  priority.qsort (sort_size_descending);
> T = std::pair
> ditto
> 
> ipa-icf.cc:3087  to_split.qsort (sort_congruence_split);
> T = std::pair
> ditto
> 
> ipa-icf.cc:3360  classes.qsort (sort_congruence_class_groups_by_decl_uid);
> T = std::pair
> ditto
> 
> tree-vect-slp.cc:6991  li_scalar_costs.qsort (li_cost_vec_cmp);
> T = std::pair
> ditto
> 
> tree-vect-slp.cc:7249  lane_defs.qsort (vld_cmp);
> T = std::pair
> ditto

make an exception as discussed on IRC

> cfganal.cc:471  control_dependence_map.quick_grow (last_basic_block_for_fn 
> (cfun));
> T = bitmap_head
> This is a different case, bitmap_head has a non-trivial default constructor
> that essentially fills it with zeros, and for quick_grow we have
> quick_grow_cleared which default constructs elements, but quick_grow leaves
> them unconstructed/uninitialized, which is why I wanted to add an assert
> there, e.g. quick_grow on the wide_int/widest_int WIP would break stuff
> terribly.  In the cfganal.cc case, we call bitmap_initialize on it after
> growing it, which overwrites all elements and doesn't depend on values of
> any, so grow_cleared actually is invalid from strict C++ POV, but not
> in reality.  Do we still want the assert in and slow cfganal.cc slightly
> by using quick_grow_cleared?  Or another option would be to make
> quick_grow work transparently the same as quick_grow_cleared for
> non-trivially default constructible types.  Though, I think it might be
> better if people explicitly see that the operation is more expensive.

I've added the CTOR for checking purposes, I suppose we can add
a bitmap_head_pod omitting it?

> tree-ssa-loop-im.cc:2592  first_edge_seq.safe_grow 
> (fes_length
>   + 
> extra_refs.length ());
> T = seq_entry
> Here, seq_entry is:
> struct seq_entry
> {
>   seq_entry () {}
>   seq_entry (unsigned f, sm_kind k, tree fr = NULL)
> : first (f), second (k), from (fr) {}
>   unsigned first;
>   sm_kind second;
>   tree from;
> };
> Wonder if making seq_entry () = default;
> wouldn't cure this.

Possibly?  It's also a pattern that might go into vec:: directly,
it's a splice_at (aka 

Re: [PATCH, rs6000] Enable vector compare for 16-byte memory equality compare [PR111449]

2023-09-27 Thread Richard Sandiford
"Kewen.Lin"  writes:
> Hi,
>
> on 2023/9/20 16:49, HAO CHEN GUI wrote:
>> Hi,
>>   This patch enables vector compare for 16-byte memory equality compare.
>> The 16-byte memory equality compare can be efficiently implemented by
>> instruction "vcmpequb." It reduces one branch and one compare compared
>> with a two 8-byte compare sequence.
>
> It looks nice to exploit vcmpequb. for this comparison.
>
>> 
>>   16-byte vector compare is not enabled on 32bit sub-targets as TImode
>> hasn't been supported well on 32bit sub-targets.
>
> But it sounds weird to say it is with TImode but the underlying instruction
> is V16QImode.  This does NOT necessarily depend on TImode, so if it's coded
> with V16QImode it would not suffer this unsupported issue.
>
> The reason why you hacked with TImode seems that the generic part of code
> only considers the scalar mode?  I wonder if we can extend the generic code
> to consider the vector mode as well.  It also makes thing better if we will
> have wider vector mode one day.
>
> I guess there is no blocking/limitation for not considering vector modes?

Yeah, I agree there doesn't seem to be a good reason to exclude vectors.
Sorry to dive straight into details, but maybe we should have something
called bitwise_mode_for_size that tries to use integer modes where possible,
but falls back to vector modes otherwise.  That mode could then be used
for copying, storing, bitwise ops, and equality comparisons (if there
is appropriate optabs support).

Thanks,
Richard

> CC some experts.
>
> BR,
> Kewen
>
>> 
>>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
>> 
>> Thanks
>> Gui Haochen
>> 
>> ChangeLog
>> rs6000: Enable vector compare for 16-byte memory equality compare
>> 
>> gcc/
>>  PR target/111449
>>  * config/rs6000/altivec.md (cbranchti4): New expand pattern.
>>  * config/rs6000/rs6000.cc (rs6000_generate_compare): Generate insn
>>  sequence for TImode vector equality compare.
>>  * config/rs6000/rs6000.h (MOVE_MAX_PIECES): Define.
>>  (COMPARE_MAX_PIECES): Define.
>> 
>> gcc/testsuite/
>>  PR target/111449
>>  * gcc.target/powerpc/pr111449.c: New.
>> 
>> patch.diff
>> diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
>> index e8a596fb7e9..99264235cbe 100644
>> --- a/gcc/config/rs6000/altivec.md
>> +++ b/gcc/config/rs6000/altivec.md
>> @@ -2605,6 +2605,24 @@ (define_insn "altivec_vupklpx"
>>  }
>>[(set_attr "type" "vecperm")])
>> 
>> +(define_expand "cbranchti4"
>> +  [(use (match_operator 0 "equality_operator"
>> +[(match_operand:TI 1 "memory_operand")
>> + (match_operand:TI 2 "memory_operand")]))
>> +   (use (match_operand 3))]
>> +  "VECTOR_UNIT_ALTIVEC_P (V16QImode)"
>> +{
>> +  rtx op1 = simplify_subreg (V16QImode, operands[1], TImode, 0);
>> +  rtx op2 = simplify_subreg (V16QImode, operands[2], TImode, 0);
>> +  operands[1] = force_reg (V16QImode, op1);
>> +  operands[2] = force_reg (V16QImode, op2);
>> +  rtx_code code = GET_CODE (operands[0]);
>> +  operands[0] = gen_rtx_fmt_ee (code, V16QImode, operands[1],
>> +operands[2]);
>> +  rs6000_emit_cbranch (TImode, operands);
>> +  DONE;
>> +})
>> +
>>  ;; Compare vectors producing a vector result and a predicate, setting CR6 to
>>  ;; indicate a combined status
>>  (define_insn "altivec_vcmpequ_p"
>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
>> index efe9adce1f8..c6b935a64e7 100644
>> --- a/gcc/config/rs6000/rs6000.cc
>> +++ b/gcc/config/rs6000/rs6000.cc
>> @@ -15264,6 +15264,15 @@ rs6000_generate_compare (rtx cmp, machine_mode mode)
>>else
>>  emit_insn (gen_stack_protect_testsi (compare_result, op0, op1b));
>>  }
>> +  else if (mode == TImode)
>> +{
>> +  gcc_assert (code == EQ || code == NE);
>> +
>> +  rtx result_vector = gen_reg_rtx (V16QImode);
>> +  compare_result = gen_rtx_REG (CCmode, CR6_REGNO);
>> +  emit_insn (gen_altivec_vcmpequb_p (result_vector, op0, op1));
>> +  code = (code == NE) ? GE : LT;
>> +}
>>else
>>  emit_insn (gen_rtx_SET (compare_result,
>>  gen_rtx_COMPARE (comp_mode, op0, op1)));
>> diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
>> index 3503614efbd..dc33bca0802 100644
>> --- a/gcc/config/rs6000/rs6000.h
>> +++ b/gcc/config/rs6000/rs6000.h
>> @@ -1730,6 +1730,8 @@ typedef struct rs6000_args
>> in one reasonably fast instruction.  */
>>  #define MOVE_MAX (! TARGET_POWERPC64 ? 4 : 8)
>>  #define MAX_MOVE_MAX 8
>> +#define MOVE_MAX_PIECES (!TARGET_POWERPC64 ? 4 : 16)
>> +#define COMPARE_MAX_PIECES (!TARGET_POWERPC64 ? 4 : 16)
>> 
>>  /* Nonzero if access to memory by bytes is no faster than for words.
>> Also nonzero if doing byte operations (specifically shifts) in registers
>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr111449.c 
>> b/gcc/testsuite/gcc.target/powerpc/pr111449.c
>> new file mode 100644
>> index 

Re: [PATCH 2/3] recog: Support space in "[ cons"

2023-09-27 Thread Richard Sandiford
Andrea Corallo  writes:
> Hi all,
>
> this is to allow for spaces before "cons:" in the definitions of
> patterns using the new compact syntax, ex:
>
> (define_insn "aarch64_simd_dup"
>   [(set (match_operand:VDQ_I 0 "register_operand")
> (vec_duplicate:VDQ_I
>   (match_operand: 1 "register_operand")))]
>   "TARGET_SIMD"
>   {@ [ cons: =0 , 1  ; attrs: type  ]
>  [ w, w  ; neon_dup  ] dup\t%0., %1.[0]
>  [ w, ?r ; neon_from_gp  ] dup\t%0., %1
>   }
> )
>
> gcc/Changelog
>
> 2023-09-20  Andrea Corallo  
>
>   * gensupport.cc (convert_syntax): Skip spaces before "cons:"
>   in new compact pattern syntax.

OK, thanks.

Richard

> ---
>  gcc/gensupport.cc | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/gcc/gensupport.cc b/gcc/gensupport.cc
> index 7e125e3d8db..dd920d673b4 100644
> --- a/gcc/gensupport.cc
> +++ b/gcc/gensupport.cc
> @@ -894,6 +894,8 @@ convert_syntax (rtx x, file_location loc)
>if (!expect_char (, '['))
>  fatal_at (loc, "expecing `[' to begin section list");
>  
> +  skip_spaces ();
> +
>parse_section_layout (loc, , "cons:", tconvec, true);
>  
>if (*templ != ']')


Re: [PATCH 1/3] recog: Improve parser for pattern new compact syntax

2023-09-27 Thread Richard Sandiford
Andrea Corallo  writes:
> From: Richard Sandiford 
>
> Hi all,
>
> this is to add support to the new compact pattern syntax for the case
> where the constraints do appear unsorted like:
>
> (define_insn "*si3_insn_uxtw"
>   [(set (match_operand:DI 0 "register_operand")
> (zero_extend:DI (SHIFT_no_rotate:SI
>  (match_operand:SI 1 "register_operand")
>  (match_operand:QI 2 "aarch64_reg_or_shift_imm_si"]
>   ""
>   {@ [cons: =0, 2,   1]
>  [  r,  Uss, r] \\t%w0, %w1, %2
>  [  r,  r,   r] \\t%w0, %w1, %w2
>   }
>   [(set_attr "type" "bfx,shift_reg")]
> )
>
> Best Regards
>
>   Andrea
>
> gcc/Changelog
>
> 2023-09-20  Richard Sandiford  
>
>   * gensupport.cc (convert_syntax): Updated to support unordered
>   constraints in compact syntax.

Thanks for picking this up.  OK for trunk.

Richard

> ---
>  gcc/gensupport.cc | 32 
>  1 file changed, 16 insertions(+), 16 deletions(-)
>
> diff --git a/gcc/gensupport.cc b/gcc/gensupport.cc
> index f7164b3214d..7e125e3d8db 100644
> --- a/gcc/gensupport.cc
> +++ b/gcc/gensupport.cc
> @@ -896,19 +896,6 @@ convert_syntax (rtx x, file_location loc)
>  
>parse_section_layout (loc, , "cons:", tconvec, true);
>  
> -  /* Check for any duplicate cons entries and sort based on i.  */
> -  for (auto e : tconvec)
> -{
> -  unsigned idx = e.idx;
> -  if (idx >= convec.size ())
> - convec.resize (idx + 1);
> -
> -  if (convec[idx].idx >= 0)
> - fatal_at (loc, "duplicate cons number found: %d", idx);
> -  convec[idx] = e;
> -}
> -  tconvec.clear ();
> -
>if (*templ != ']')
>  {
>if (*templ == ';')
> @@ -951,13 +938,13 @@ convert_syntax (rtx x, file_location loc)
> new_templ += '\n';
> new_templ.append (buffer);
> /* Parse the constraint list, then the attribute list.  */
> -   if (convec.size () > 0)
> - parse_section (, convec.size (), alt_no, convec, loc,
> +   if (tconvec.size () > 0)
> + parse_section (, tconvec.size (), alt_no, tconvec, loc,
>  "constraint");
>  
> if (attrvec.size () > 0)
>   {
> -   if (convec.size () > 0 && !expect_char (, ';'))
> +   if (tconvec.size () > 0 && !expect_char (, ';'))
>   fatal_at (loc, "expected `;' to separate constraints "
>  "and attributes in alternative %d", alt_no);
>  
> @@ -1027,6 +1014,19 @@ convert_syntax (rtx x, file_location loc)
>++alt_no;
>  }
>  
> +  /* Check for any duplicate cons entries and sort based on i.  */
> +  for (auto e : tconvec)
> +{
> +  unsigned idx = e.idx;
> +  if (idx >= convec.size ())
> + convec.resize (idx + 1);
> +
> +  if (convec[idx].idx >= 0)
> + fatal_at (loc, "duplicate cons number found: %d", idx);
> +  convec[idx] = e;
> +}
> +  tconvec.clear ();
> +
>/* Write the constraints and attributes into their proper places.  */
>if (convec.size () > 0)
>  add_constraints (x, loc, convec);


[PATCH] Simplify abs (copysign (x, y))

2023-09-27 Thread Richard Biener
The following adds simplification of abs (copysign (x, y)) to abs (x).

Bootstrap & regtest in progress on x86_64-unknown-linux-gnu.

Richard.

* match.pd (abs (copysign (x, y)) -> abs (x)): New pattern.

* gcc.dg/fold-abs-6.c: New testcase.
---
 gcc/match.pd  | 5 +
 gcc/testsuite/gcc.dg/fold-abs-6.c | 9 +
 2 files changed, 14 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/fold-abs-6.c

diff --git a/gcc/match.pd b/gcc/match.pd
index a17778fbaa6..069d832decc 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7429,6 +7429,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (COPYSIGN_ALL @0 tree_expr_nonnegative_p@1)
  (abs @0))
 
+(simplify
+ /* fabs (copysign(x, y)) -> fabs (x).  */
+ (abs (COPYSIGN_ALL @0 @1))
+ (abs @0))
+
 (for scale (LDEXP SCALBN SCALBLN)
  /* ldexp(0, x) -> 0.  */
  (simplify
diff --git a/gcc/testsuite/gcc.dg/fold-abs-6.c 
b/gcc/testsuite/gcc.dg/fold-abs-6.c
new file mode 100644
index 000..42ef9230dfd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-abs-6.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-original" } */
+
+float foo (float x, float y)
+{
+  return __builtin_fabsf (__builtin_copysignf (x, y));
+}
+
+/* { dg-final { scan-tree-dump "return ABS_EXPR ;" "original" } } */
-- 
2.35.3


[PATCH] vec.h, v2: Make some ops work with non-trivially copy constructible and/or destructible types

2023-09-27 Thread Jakub Jelinek
On Wed, Sep 27, 2023 at 07:17:22AM +, Richard Biener wrote:
> OK I guess.  Can you summarize the limitations for non-POD types
> in the big comment at the start of vec.h?

Still haven't done that, but will do after we flesh out the details
below.

> (can we put in static_asserts
> in the places that obviously do not work?)

I've tried to do this though, I think the static_asserts will allow
making sure we only use what is supportable and will serve better than
any kind of comment.

But, I've run into quite a few triggered assertion failures with that, and
the question is what we want to do with them.  Applying
--- gcc/vec.h.jj2023-09-27 12:11:56.0 +0200
+++ gcc/vec.h   2023-09-27 12:39:50.971613964 +0200
@@ -1160,7 +1160,7 @@ template
 inline void
 vec::qsort (int (*cmp) (const void *, const void *))
 {
-  static_assert (std::is_trivially_copyable ::value, "");
+//  static_assert (std::is_trivially_copyable ::value, "");
   if (length () > 1)
 gcc_qsort (address (), length (), sizeof (T), cmp);
 }
@@ -1359,7 +1359,7 @@ inline void
 vec::quick_grow (unsigned len)
 {
   gcc_checking_assert (length () <= len && len <= m_vecpfx.m_alloc);
-  static_assert (std::is_trivially_default_constructible ::value, "");
+//  static_assert (std::is_trivially_default_constructible ::value, "");
   m_vecpfx.m_num = len;
 }
 
incremental patch makes stuff work (at least make in gcc subdir for
x86_64-linux with --enable-languages=c,c++,fortran,lto succeeds), so it is just
those 2 asserts.
Following is full list of failures and discusses details.

dbgcnt.cc:132  limits[index].qsort (cmp_tuples);
T = std::pair
where std::pair is not trivially copyable.  Our qsort implementation uses
memcpys/memmoves to reshuffle the array elements (as it isn't inlined and
so can't use std::swap and the like), so I think we need the types trivially
copyable (or at least trivially copy assignable from the same type or
something similar if we close all eyes).  Is there some std::pair
alternative which is trivially copyable, or do we need to define structures
to achieve that?  Or just close all eyes and allow qsort/sort/stablesort
either on trivially copyable types, or on std::pair where both template
parameters are trivially copyable?

genrecog.cc:3466  candidates.qsort (subroutine_candidate_cmp);
T = std::pair
ditto

dwarf2asm.cc:1061  temp.qsort (compare_strings);
T = std::pair
ditto

tree-ssa-dce.cc:1776  args.qsort (sort_phi_args);
T = std::pair
ditto

tree-ssa-loop-manip.cc:376  names.qsort (loop_name_cmp);
T = std::pair
ditto

omp-oacc-neuter-broadcast.cc:1730  priority.qsort (sort_size_descending);
T = std::pair
ditto

ipa-icf.cc:3087  to_split.qsort (sort_congruence_split);
T = std::pair
ditto

ipa-icf.cc:3360  classes.qsort (sort_congruence_class_groups_by_decl_uid);
T = std::pair
ditto

tree-vect-slp.cc:6991  li_scalar_costs.qsort (li_cost_vec_cmp);
T = std::pair
ditto

tree-vect-slp.cc:7249  lane_defs.qsort (vld_cmp);
T = std::pair
ditto

cfganal.cc:471  control_dependence_map.quick_grow (last_basic_block_for_fn 
(cfun));
T = bitmap_head
This is a different case, bitmap_head has a non-trivial default constructor
that essentially fills it with zeros, and for quick_grow we have
quick_grow_cleared which default constructs elements, but quick_grow leaves
them unconstructed/uninitialized, which is why I wanted to add an assert
there, e.g. quick_grow on the wide_int/widest_int WIP would break stuff
terribly.  In the cfganal.cc case, we call bitmap_initialize on it after
growing it, which overwrites all elements and doesn't depend on values of
any, so grow_cleared actually is invalid from strict C++ POV, but not
in reality.  Do we still want the assert in and slow cfganal.cc slightly
by using quick_grow_cleared?  Or another option would be to make
quick_grow work transparently the same as quick_grow_cleared for
non-trivially default constructible types.  Though, I think it might be
better if people explicitly see that the operation is more expensive.

tree-ssa-loop-im.cc:2592  first_edge_seq.safe_grow 
(fes_length
+ 
extra_refs.length ());
T = seq_entry
Here, seq_entry is:
struct seq_entry
{
  seq_entry () {}
  seq_entry (unsigned f, sm_kind k, tree fr = NULL)
: first (f), second (k), from (fr) {}
  unsigned first;
  sm_kind second;
  tree from;
};
Wonder if making seq_entry () = default;
wouldn't cure this.

tree-ssa-loop-im.cc:3499  memory_accesses.refs_loaded_in_loop.quick_grow 
(number_of_loops (cfun));
T = bitmap_head
like the cfganal.cc case.

tree-ssa-live.cc:1364  active.quick_grow (last_basic_block_for_fn (fn));
T = bitmap_head
ditto

rtl-ssa/blocks.cc:60  bb_phis.quick_grow (num_bb_indices);
T = rtl_ssa::function_info::bb_phi_info
This structure contains bitmap_head, so again isn't default constructible.

rtl-ssa/blocks.cc:617  frontiers.safe_grow (num_bb_indices);
T = bitmap_head
see above


Re: [PATCH]AArch64: Use SVE unpredicated LOGICAL expressions when Advanced SIMD inefficient [PR109154]

2023-09-27 Thread Richard Sandiford
Tamar Christina  writes:
>> -Original Message-
>> From: Richard Sandiford 
>> Sent: Wednesday, September 27, 2023 9:50 AM
>> To: Tamar Christina 
>> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
>> ; Marcus Shawcroft
>> ; Kyrylo Tkachov 
>> Subject: Re: [PATCH]AArch64: Use SVE unpredicated LOGICAL expressions
>> when Advanced SIMD inefficient [PR109154]
>> 
>> Tamar Christina  writes:
>> > Hi All,
>> >
>> > SVE has much bigger immediate encoding range for bitmasks than
>> > Advanced SIMD has and so on a system that is SVE capable if we need an
>> > Advanced SIMD Inclusive-OR by immediate and would require a reload then
>> an unpredicated SVE ORR instead.
>> >
>> > This has both speed and size improvements.
>> >
>> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>> >
>> > Ok for master?
>> >
>> > Thanks,
>> > Tamar
>> >
>> > gcc/ChangeLog:
>> >
>> >PR tree-optimization/109154
>> >* config/aarch64/aarch64.md (3): Convert to new
>> syntax and
>> >SVE split case.
>> >* config/aarch64/iterators.md (VCONV, vconv): New.
>> >
>> > gcc/testsuite/ChangeLog:
>> >
>> >PR tree-optimization/109154
>> >* gcc.target/aarch64/sve/fneg-abs_2.c: Updated.
>> >* gcc.target/aarch64/sve/fneg-abs_4.c: Updated.
>> >
>> > --- inline copy of patch --
>> > diff --git a/gcc/config/aarch64/aarch64.md
>> > b/gcc/config/aarch64/aarch64.md index
>> >
>> 60c92213c75a2a4c18a6b59ae52fe45d1e872718..377c5cafedd43d8d13204
>> 89a3626
>> > 7cc6e5f15239 100644
>> > --- a/gcc/config/aarch64/aarch64.md
>> > +++ b/gcc/config/aarch64/aarch64.md
>> > @@ -4551,17 +4551,27 @@ (define_insn_and_split
>> "*aarch64_and_imm2"
>> >}
>> >  )
>> >
>> > -(define_insn "3"
>> > -  [(set (match_operand:GPI 0 "register_operand" "=r,rk,w")
>> > -  (LOGICAL:GPI (match_operand:GPI 1 "register_operand" "%r,r,w")
>> > -   (match_operand:GPI 2 "aarch64_logical_operand"
>> "r,,w")))]
>> > -  ""
>> > -  "@
>> > -  \\t%0, %1, %2
>> > -  \\t%0, %1, %2
>> > -  \\t%0., %1., %2."
>> > -  [(set_attr "type" "logic_reg,logic_imm,neon_logic")
>> > -   (set_attr "arch" "*,*,simd")]
>> > +(define_insn_and_split "3"
>> > +  [(set (match_operand:GPI 0 "register_operand")
>> > +  (LOGICAL:GPI (match_operand:GPI 1 "register_operand")
>> > +   (match_operand:GPI 2 "aarch64_logical_operand")))]
>> > +  ""
>> > +  {@ [cons: =0, 1, 2; attrs: type, arch]
>> > + [r , %r, r   ; logic_reg , *   ] \t%0, %1, %2
>> > + [rk, r , ; logic_imm , *   ] \t%0, %1, %2
>> > + [w , 0 , ; * , sve ] #
>> > + [w , w , w   ; neon_logic, simd] \t%0., 
>> > %1.,
>> %2.
>> > +  }
>> > +  "&& TARGET_SVE && rtx_equal_p (operands[0], operands[1])
>> > +   && satisfies_constraint_ (operands[2])
>> > +   && FP_REGNUM_P (REGNO (operands[0]))"
>> > +  [(const_int 0)]
>> > +  {
>> > +rtx op1 = lowpart_subreg (mode, operands[1], mode);
>> > +rtx op2 = gen_const_vec_duplicate (mode, operands[2]);
>> > +emit_insn (gen_3 (op1, op1, op2));
>> > +DONE;
>> > +  }
>> >  )
>> 
>> The WIP SME patches add a %Z modifier for 'z' register prefixes, similarly to
>> b/h/s/d for scalar FP.  With that I think the alternative can be:
>> 
>>  [w , 0 , ; * , sve ] \t%Z0., %Z0., #%2
>> 
>> although it would be nice to keep the hex constant.
>
> My original patch added a %u for (undecorated) which just prints the register
> number and changed %C to also accept a single constant instead of only a 
> uniform vector.

Not saying no to %u in future, but %Z seems more consistent with the
current approach.  And yeah, I'd also wondered about extending %C.
The problem is guessing whether to print a 32-bit, 64-bit or 128-bit
constant for negative immediates.

Thanks,
Richard

> But I figured you wouldn't like that? 
>
> Cheers,
> Tamar
>
>> 
>> Will try to post the patches up to that part soon.
>> 
>> Thanks,
>> Richard
>> 
>> >
>> >  ;; zero_extend version of above
>> > diff --git a/gcc/config/aarch64/iterators.md
>> > b/gcc/config/aarch64/iterators.md index
>> >
>> d17becc37e230684beaee3c69e2a0f0ce612eda5..568cd5d1a3a9e00475376
>> 177ad13
>> > de72609df3d8 100644
>> > --- a/gcc/config/aarch64/iterators.md
>> > +++ b/gcc/config/aarch64/iterators.md
>> > @@ -1432,6 +1432,11 @@ (define_mode_attr VCONQ [(V8QI "V16QI")
>> (V16QI "V16QI")
>> > (HI   "V8HI") (QI   "V16QI")
>> > (SF   "V4SF") (DF   "V2DF")])
>> >
>> > +;; 128-bit container modes for the lower part of an SVE vector to the
>> > +inner or ;; scalar source mode.
>> > +(define_mode_attr VCONV [(SI "VNx4SI") (DI "VNx2DI")])
>> > +(define_mode_attr vconv [(SI "vnx4si") (DI "vnx2di")])
>> > +
>> >  ;; Half modes of all vector modes.
>> >  (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
>> > (V4HI "V2HI")  (V8HI  "V4HI")
>> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
>> > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
>> > index
>> >
>> 

Re: [PATCH]AArch64 Rewrite simd move immediate patterns to new syntax

2023-09-27 Thread Richard Sandiford
Tamar Christina  writes:
> Hi All,
>
> This rewrites the simd MOV patterns to use the new compact syntax.
> No change in semantics is expected.  This will be needed in follow on patches.
>
> This also merges the splits into the define_insn which will also be needed 
> soon.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   PR tree-optimization/109154
>   * config/aarch64/aarch64-simd.md (*aarch64_simd_mov):
>   Rewrite to new syntax.
>   (*aarch64_simd_mov   splits.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> e955691f1be8830efacc237465119764ce2a4942..7b4d5a37a9795fefda785aaacc246918826ed0a2
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -143,54 +143,57 @@ (define_insn "aarch64_dup_lane_"
>  )
>  
>  (define_insn "*aarch64_simd_mov"
> -  [(set (match_operand:VDMOV 0 "nonimmediate_operand"
> - "=w, r, m,  m, m,  w, ?r, ?w, ?r,  w,  w")
> - (match_operand:VDMOV 1 "general_operand"
> - "m,  m, Dz, w, r,  w,  w,  r,  r, Dn, Dz"))]
> +  [(set (match_operand:VDMOV 0 "nonimmediate_operand")
> + (match_operand:VDMOV 1 "general_operand"))]
>"TARGET_FLOAT
> && (register_operand (operands[0], mode)
> || aarch64_simd_reg_or_zero (operands[1], mode))"
> -  "@
> -   ldr\t%d0, %1
> -   ldr\t%x0, %1
> -   str\txzr, %0
> -   str\t%d1, %0
> -   str\t%x1, %0
> -   * return TARGET_SIMD ? \"mov\t%0., %1.\" : \"fmov\t%d0, 
> %d1\";
> -   * return TARGET_SIMD ? \"umov\t%0, %1.d[0]\" : \"fmov\t%x0, %d1\";
> -   fmov\t%d0, %1
> -   mov\t%0, %1
> -   * return aarch64_output_simd_mov_immediate (operands[1], 64);
> -   fmov\t%d0, xzr"
> -  [(set_attr "type" "neon_load1_1reg, load_8, store_8, 
> neon_store1_1reg,\
> -  store_8, neon_logic, neon_to_gp, f_mcr,\
> -  mov_reg, neon_move, f_mcr")
> -   (set_attr "arch" "*,*,*,*,*,*,*,*,*,simd,*")]
> -)
> -
> -(define_insn "*aarch64_simd_mov"
> -  [(set (match_operand:VQMOV 0 "nonimmediate_operand"
> - "=w, Umn,  m,  w, ?r, ?w, ?r, w,  w")
> - (match_operand:VQMOV 1 "general_operand"
> - "m,  Dz, w,  w,  w,  r,  r, Dn, Dz"))]
> +  {@ [cons: =0, 1; attrs: type, arch]
> + [w , m ; neon_load1_1reg , *   ] ldr\t%d0, %1
> + [r , m ; load_8 , *   ] ldr\t%x0, %1
> + [m , Dz; store_8, *   ] str\txzr, %0
> + [m , w ; neon_store1_1reg, *   ] str\t%d1, %0
> + [m , r ; store_8, *   ] str\t%x1, %0
> + [w , w ; neon_logic  , simd] mov\t%0., %1.
> + [w , w ; neon_logic  , *   ] fmov\t%d0, %d1
> + [?r, w ; neon_to_gp  , simd] umov\t%0, %1.d[0]
> + [?r, w ; neon_to_gp  , *   ] fmov\t%x0, %d1
> + [?w, r ; f_mcr  , *   ] fmov\t%d0, %1
> + [?r, r ; mov_reg, *   ] mov\t%0, %1
> + [w , Dn; neon_move   , simd] << 
> aarch64_output_simd_mov_immediate (operands[1], 64);
> + [w , Dz; f_mcr  , *   ] fmov\t%d0, xzr
> +  }
> +)
> +
> +(define_insn_and_split "*aarch64_simd_mov"
> +  [(set (match_operand:VQMOV 0 "nonimmediate_operand")
> + (match_operand:VQMOV 1 "general_operand"))]
>"TARGET_FLOAT
> && (register_operand (operands[0], mode)
> || aarch64_simd_reg_or_zero (operands[1], mode))"
> -  "@
> -   ldr\t%q0, %1
> -   stp\txzr, xzr, %0
> -   str\t%q1, %0
> -   mov\t%0., %1.
> -   #
> -   #
> -   #
> -   * return aarch64_output_simd_mov_immediate (operands[1], 128);
> -   fmov\t%d0, xzr"
> -  [(set_attr "type" "neon_load1_1reg, store_16, neon_store1_1reg,\
> -  neon_logic, multiple, multiple,\
> -  multiple, neon_move, fmov")
> -   (set_attr "length" "4,4,4,4,8,8,8,4,4")
> -   (set_attr "arch" "*,*,*,simd,*,*,*,simd,*")]
> +  {@ [cons: =0, 1; attrs: type, arch, length]
> + [w  , m ; neon_load1_1reg , *   , 4] ldr\t%q0, %1
> + [Umn, Dz; store_16   , *   , 4] stp\txzr, xzr, %0
> + [m  , w ; neon_store1_1reg, *   , 4] str\t%q1, %0
> + [w  , w ; neon_logic  , simd, 4] mov\t%0., %1.
> + [?r , w ; multiple   , *   , 8] #
> + [?w , r ; multiple   , *   , 8] #
> + [?r , r ; multiple   , *   , 8] #
> + [w  , Dn; neon_move   , simd, 4] << 
> aarch64_output_simd_mov_immediate (operands[1], 128);
> + [w  , Dz; fmov   , *   , 4] fmov\t%d0, xzr
> +  }
> +  "&& reload_completed
> +   && !(FP_REGNUM_P (REGNO (operands[0]))
> + && FP_REGNUM_P (REGNO (operands[1])))"

Won't this also trigger for the load, store, and Dn alternatives?

Looks OK otherwise.

Thanks,
Richard

> +  [(const_int 0)]
> +  {
> +if (GP_REGNUM_P (REGNO (operands[0]))
> + && GP_REGNUM_P (REGNO (operands[1])))
> +  aarch64_simd_emit_reg_reg_move (operands, DImode, 2);
> +else
> +  aarch64_split_simd_move 

Re: [PATCH]AArch64 Add special patterns for creating DI scalar and vector constant 1 << 63 [PR109154]

2023-09-27 Thread Richard Sandiford
Tamar Christina  writes:
> Hi All,
>
> This adds a way to generate special sequences for creation of constants for
> which we don't have single instructions sequences which would have normally
> lead to a GP -> FP transfer or a literal load.
>
> The patch starts out by adding support for creating 1 << 63 using fneg (mov 
> 0).
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   PR tree-optimization/109154
>   * config/aarch64/aarch64-protos.h (aarch64_simd_special_constant_p):
>   New.
>   * config/aarch64/aarch64-simd.md (*aarch64_simd_mov): Add
>   new coden for special constants.
>   * config/aarch64/aarch64.cc (aarch64_extract_vec_duplicate_wide_int):
>   Take optional mode.
>   (aarch64_simd_special_constant_p): New.
>   * config/aarch64/aarch64.md (*movdi_aarch64): Add new codegen for
>   special constants.
>   * config/aarch64/constraints.md (Dx): new.
>
> gcc/testsuite/ChangeLog:
>
>   PR tree-optimization/109154
>   * gcc.target/aarch64/fneg-abs_1.c: Updated.
>   * gcc.target/aarch64/fneg-abs_2.c: Updated.
>   * gcc.target/aarch64/fneg-abs_4.c: Updated.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 
> 70303d6fd953e0c397b9138ede8858c2db2e53db..2af9f6a774c20268bf90756c17064bbff8f8ff87
>  100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -827,6 +827,7 @@ bool aarch64_sve_ptrue_svpattern_p (rtx, struct 
> simd_immediate_info *);
>  bool aarch64_simd_valid_immediate (rtx, struct simd_immediate_info *,
>   enum simd_immediate_check w = AARCH64_CHECK_MOV);
>  rtx aarch64_check_zero_based_sve_index_immediate (rtx);
> +bool aarch64_simd_special_constant_p (rtx, rtx, machine_mode);
>  bool aarch64_sve_index_immediate_p (rtx);
>  bool aarch64_sve_arith_immediate_p (machine_mode, rtx, bool);
>  bool aarch64_sve_sqadd_sqsub_immediate_p (machine_mode, rtx, bool);
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> 7b4d5a37a9795fefda785aaacc246918826ed0a2..63c802d942a186b5a94c66d2e83828a82a88ffa8
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -181,17 +181,28 @@ (define_insn_and_split "*aarch64_simd_mov"
>   [?r , r ; multiple   , *   , 8] #
>   [w  , Dn; neon_move   , simd, 4] << 
> aarch64_output_simd_mov_immediate (operands[1], 128);
>   [w  , Dz; fmov   , *   , 4] fmov\t%d0, xzr
> + [w  , Dx; neon_move  , simd, 8] #
>}
>"&& reload_completed
> -   && !(FP_REGNUM_P (REGNO (operands[0]))
> - && FP_REGNUM_P (REGNO (operands[1])))"
> +   && (!(FP_REGNUM_P (REGNO (operands[0]))
> +  && FP_REGNUM_P (REGNO (operands[1])))
> +   || (aarch64_simd_special_constant_p (operands[1], NULL_RTX, 
> mode)
> +&& FP_REGNUM_P (REGNO (operands[0]"

Unless I'm missing something, the new test is already covered by the:

  !(FP_REGNUM_P (REGNO (operands[0]))
&& FP_REGNUM_P (REGNO (operands[1]))

>[(const_int 0)]
>{
>  if (GP_REGNUM_P (REGNO (operands[0]))
>   && GP_REGNUM_P (REGNO (operands[1])))
>aarch64_simd_emit_reg_reg_move (operands, DImode, 2);
>  else
> -  aarch64_split_simd_move (operands[0], operands[1]);
> +  {
> + if (FP_REGNUM_P (REGNO (operands[0]))
> + && mode == V2DImode
> + && aarch64_simd_special_constant_p (operands[1], operands[0],
> + mode))
> +   ;

This looked odd at first, since _p functions don't normally have
side effects.  So it looked like this case was expanding to nothing.

How about renaming aarch64_simd_special_constant_p to
aarch64_maybe_generate_simd_constant, and then making
aarch64_simd_special_constant_p a wrapper that passes the NULL_RTX?
Minor nit, but most other functions put the destination first.

> + else
> +   aarch64_split_simd_move (operands[0], operands[1]);
> +  }
>  DONE;
>}
>  )
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 
> 3739a44bfd909b69a76529cc6b0ae2f01d6fb36e..6e7ee446f1b31ee8bcf121c97c1c6fa87725bf42
>  100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -11799,16 +11799,18 @@ aarch64_get_condition_code_1 (machine_mode mode, 
> enum rtx_code comp_code)
>  /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
> duplicate of such constants.  If so, store in RET_WI the wide_int
> representation of the constant paired with the inner mode of the vector 
> mode
> -   or TImode for scalar X constants.  */
> +   or SMODE for scalar X constants.  If SMODE is not provided then TImode is
> +   used.  */

s/SMODE/MODE/, based on the code.

>  
>  static bool
> 

Re: Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread juzhe.zh...@rivai.ai
This failure appears after removing the mem-to-mem pattern.

program main
  integer, dimension(:,:), allocatable :: a, b
  integer, dimension(:), allocatable :: sh
  allocate (a(2,2))
  allocate (b(2,2))
  allocate (sh(3))
  a = 1
  b = cshift(a,sh)
end program main

This case will fail if we don't change the mov pattern.



juzhe.zh...@rivai.ai
 
From: Kito Cheng
Date: 2023-09-27 18:07
To: juzhe.zh...@rivai.ai
CC: kito.cheng; gcc-patches; jeffreyalaw; Robin Dapp
Subject: Re: Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]
I can understand why remove mem to mem pattern, but why the normal mov
pattern for VLS_AVL_IMM need to change too?
 
On Wed, Sep 27, 2023 at 10:39 AM juzhe.zh...@rivai.ai
 wrote:
>
> >> Why add `can_create_pseudo_p ()` here? this will split after reload,
> >> but we forbid that pattern between reload and split2?
>
> I have no idea. Some Fortran tests just need recognition of the mem-to-mem
> pattern before RA.
> I don't know the reason.
>
> 
> juzhe.zh...@rivai.ai
>
>
> From: Kito Cheng
> Date: 2023-09-27 17:33
> To: Juzhe-Zhong
> CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc
> Subject: Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]
> >  (define_insn_and_split "*mov"
> >[(set (match_operand:VLS_AVL_IMM 0 "reg_or_mem_operand" "=vr, m, vr")
> > (match_operand:VLS_AVL_IMM 1 "reg_or_mem_operand" "  m,vr, vr"))]
> >"TARGET_VECTOR
> > -   && (register_operand (operands[0], mode)
> > +   && (can_create_pseudo_p ()
>
> Why add `can_create_pseudo_p ()` here? this will split after reload,
> but we forbid that pattern between reload and split2?
>
> > +   || register_operand (operands[0], mode)
> > || register_operand (operands[1], mode))"
> >"@
> > #
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> > index aedf98819bb..24bb7240db8 100644
> > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> > @@ -4,54 +4,6 @@
> >
> >  #include "def.h"
> >
> > -/*
> > -** mov0:
> > -** lbu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sb\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov0 (int8_t *in, int8_t *out)
> > -{
> > -  v1qi v = *(v1qi*)in;
> > -  *(v1qi*)out = v;
> > -}
> > -
> > -/*
> > -** mov1:
> > -** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov1 (int8_t *in, int8_t *out)
> > -{
> > -  v2qi v = *(v2qi*)in;
> > -  *(v2qi*)out = v;
> > -}
> > -
> > -/*
> > -** mov2:
> > -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov2 (int8_t *in, int8_t *out)
> > -{
> > -  v4qi v = *(v4qi*)in;
> > -  *(v4qi*)out = v;
> > -}
> > -
> > -/*
> > -** mov3:
> > -** ld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov3 (int8_t *in, int8_t *out)
> > -{
> > -  v8qi v = *(v8qi*)in;
> > -  *(v8qi*)out = v;
> > -}
> > -
> >  /*
> >  ** mov4:
> >  ** vsetivli\s+zero,\s*16,\s*e8,\s*mf8,\s*t[au],\s*m[au]
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> > index 5e9615412b7..cae96b3be3f 100644
> > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> > @@ -4,18 +4,6 @@
> >
> >  #include "def.h"
> >
> > -/*
> > -** mov0:
> > -** fld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** fsd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov0 (double *in, double *out)
> > -{
> > -  v1df v = *(v1df*)in;
> > -  *(v1df*)out = v;
> > -}
> > -
> >  /*
> >  ** mov1:
> >  ** vsetivli\s+zero,\s*2,\s*e64,\s*m1,\s*t[au],\s*m[au]
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> > deleted file mode 100644
> > index 10ae1972db7..000
> > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> > +++ /dev/null
> > @@ -1,19 +0,0 @@
> > -/* { dg-do compile } */
> > -/* { dg-options "-march=rv32gcv_zvfh_zvl4096b -mabi=ilp32d -O3 
> > -fno-schedule-insns -fno-schedule-insns2" } */
> > -/* { dg-final { check-function-bodies "**" "" } } */
> > -
> > -#include "def.h"
> > -
> > -/*
> > -** mov:
> > -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** lw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> > -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov (int8_t *in, int8_t *out)
> > -{
> > -  v8qi v = *(v8qi*)in;
> > -  *(v8qi*)out = v;
> > -}
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> > index f2880ae5e77..86ce22896c5 100644
> > --- 

Re: Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread Kito Cheng
I can understand why remove mem to mem pattern, but why the normal mov
pattern for VLS_AVL_IMM need to change too?

On Wed, Sep 27, 2023 at 10:39 AM juzhe.zh...@rivai.ai
 wrote:
>
> >> Why add `can_create_pseudo_p ()` here? this will split after reload,
> >> but we forbid that pattern between reload and split2?
>
> I have no idea. Some Fortran tests just need recognition of the mem-to-mem
> pattern before RA.
> I don't know the reason.
>
> 
> juzhe.zh...@rivai.ai
>
>
> From: Kito Cheng
> Date: 2023-09-27 17:33
> To: Juzhe-Zhong
> CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc
> Subject: Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]
> >  (define_insn_and_split "*mov"
> >[(set (match_operand:VLS_AVL_IMM 0 "reg_or_mem_operand" "=vr, m, vr")
> > (match_operand:VLS_AVL_IMM 1 "reg_or_mem_operand" "  m,vr, vr"))]
> >"TARGET_VECTOR
> > -   && (register_operand (operands[0], mode)
> > +   && (can_create_pseudo_p ()
>
> Why add `can_create_pseudo_p ()` here? this will split after reload,
> but we forbid that pattern between reload and split2?
>
> > +   || register_operand (operands[0], mode)
> > || register_operand (operands[1], mode))"
> >"@
> > #
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> > index aedf98819bb..24bb7240db8 100644
> > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> > @@ -4,54 +4,6 @@
> >
> >  #include "def.h"
> >
> > -/*
> > -** mov0:
> > -** lbu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sb\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov0 (int8_t *in, int8_t *out)
> > -{
> > -  v1qi v = *(v1qi*)in;
> > -  *(v1qi*)out = v;
> > -}
> > -
> > -/*
> > -** mov1:
> > -** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov1 (int8_t *in, int8_t *out)
> > -{
> > -  v2qi v = *(v2qi*)in;
> > -  *(v2qi*)out = v;
> > -}
> > -
> > -/*
> > -** mov2:
> > -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov2 (int8_t *in, int8_t *out)
> > -{
> > -  v4qi v = *(v4qi*)in;
> > -  *(v4qi*)out = v;
> > -}
> > -
> > -/*
> > -** mov3:
> > -** ld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov3 (int8_t *in, int8_t *out)
> > -{
> > -  v8qi v = *(v8qi*)in;
> > -  *(v8qi*)out = v;
> > -}
> > -
> >  /*
> >  ** mov4:
> >  ** vsetivli\s+zero,\s*16,\s*e8,\s*mf8,\s*t[au],\s*m[au]
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> > index 5e9615412b7..cae96b3be3f 100644
> > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> > @@ -4,18 +4,6 @@
> >
> >  #include "def.h"
> >
> > -/*
> > -** mov0:
> > -** fld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** fsd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov0 (double *in, double *out)
> > -{
> > -  v1df v = *(v1df*)in;
> > -  *(v1df*)out = v;
> > -}
> > -
> >  /*
> >  ** mov1:
> >  ** vsetivli\s+zero,\s*2,\s*e64,\s*m1,\s*t[au],\s*m[au]
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> > deleted file mode 100644
> > index 10ae1972db7..000
> > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> > +++ /dev/null
> > @@ -1,19 +0,0 @@
> > -/* { dg-do compile } */
> > -/* { dg-options "-march=rv32gcv_zvfh_zvl4096b -mabi=ilp32d -O3 
> > -fno-schedule-insns -fno-schedule-insns2" } */
> > -/* { dg-final { check-function-bodies "**" "" } } */
> > -
> > -#include "def.h"
> > -
> > -/*
> > -** mov:
> > -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** lw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> > -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov (int8_t *in, int8_t *out)
> > -{
> > -  v8qi v = *(v8qi*)in;
> > -  *(v8qi*)out = v;
> > -}
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c 
> > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> > index f2880ae5e77..86ce22896c5 100644
> > --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> > @@ -4,42 +4,6 @@
> >
> >  #include "def.h"
> >
> > -/*
> > -** mov0:
> > -** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void mov0 (int16_t *in, int16_t *out)
> > -{
> > -  v1hi v = *(v1hi*)in;
> > -  *(v1hi*)out = v;
> > -}
> > -
> > -/*
> > -** mov1:
> > -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> > -**  ret
> > -*/
> > -void 

RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]

2023-09-27 Thread Richard Biener
On Wed, 27 Sep 2023, Tamar Christina wrote:

> > -Original Message-
> > From: Tamar Christina 
> > Sent: Wednesday, September 27, 2023 8:57 AM
> > To: Richard Biener 
> > Cc: Andrew Pinski ; gcc-patches@gcc.gnu.org; nd
> > ; j...@ventanamicro.com
> > Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 
> > <<
> > signbit(x)) [PR109154]
> > 
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Wednesday, September 27, 2023 8:12 AM
> > > To: Tamar Christina 
> > > Cc: Andrew Pinski ; gcc-patches@gcc.gnu.org; nd
> > > ; j...@ventanamicro.com
> > > Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x
> > > | (1 <<
> > > signbit(x)) [PR109154]
> > >
> > > On Wed, 27 Sep 2023, Tamar Christina wrote:
> > >
> > > > > -Original Message-
> > > > > From: Andrew Pinski 
> > > > > Sent: Wednesday, September 27, 2023 2:17 AM
> > > > > To: Tamar Christina 
> > > > > Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de;
> > > > > j...@ventanamicro.com
> > > > > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x))
> > > > > to x | (1 <<
> > > > > signbit(x)) [PR109154]
> > > > >
> > > > > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina
> > > > > 
> > > > > wrote:
> > > > > >
> > > > > > Hi All,
> > > > > >
> > > > > > For targets that allow conversion between int and float modes
> > > > > > this adds a new optimization transforming fneg (fabs (x)) into x
> > > > > > | (1 << signbit(x)).  Such sequences are common in scientific
> > > > > > code working with
> > > > > gradients.
> > > > > >
> > > > > > The transformed instruction if the target has an inclusive-OR
> > > > > > that takes an immediate is both shorter and faster.  For those
> > > > > > that don't, the immediate has to be separately constructed, but this
> > > > > > still ends up being faster as the immediate construction is not
> > > > > > on the critical
> > > path.
> > > > > >
> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > > > >
> > > > > > Ok for master?
> > > > >
> > > > > I think this should be part of isel instead of match.
> > > > > Maybe we could use genmatch to generate the code that does the
> > > > > transformations but this does not belong as part of match really.
> > > >
> > > > I disagree.. I don't think this belongs in isel. Isel is for
> > > > structural
> > > transformations.
> > > > If there is a case for something else I'd imagine backwardprop is a
> > > > better
> > > choice.
> > > >
> > > > But I don't see why it doesn't belong here considering it *is* a
> > > > mathematical optimization and the file has plenty of transformations
> > > > such as mask optimizations and vector conditional rewriting.
> > >
> > > But the mathematical transform would more generally be fneg (fabs (x))
> > > -> copysign (x, -1.) and that can be optimally expanded at RTL expansion
> > time?
> > 
> > Ah sure, atm I did copysign (x, -1) -> x | 1 << signbits.  I can do it the 
> > other way
> > around.  And I guess since copysign (-x, y), copysign(|x|, y) -> copysign 
> > (x, y)
> > that should solve the trigonometry problem too.
> > 
> > Cool will do that instead, thanks!
> 
> Hmm this seems to conflict with the pattern
> 
> /* copysign(x, CST) -> [-]abs (x).  */
> (for copysigns (COPYSIGN_ALL)
>  (simplify
>   (copysigns @0 REAL_CST@1)
>   (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
>(negate (abs @0))
>(abs @0
> 
> Which does the opposite transformation.

I suppose the idea is that -abs(x) might be easier to optimize with
other patterns (consider a - copysign(x,...), optimizing to a + abs(x)).

For abs vs copysign it's a canonicalization, but (negate (abs @0))
is less canonical than copysign.

> Should I try removing this?

I'd say yes (and put the reverse canonicalization next to this pattern).

Richard.

> Thanks,
> Tamar
> 
> > 
> > Tamar
> > 
> > >
> > > Richard.
> > >
> > > > Regards,
> > > > Tamar
> > > >
> > > > >
> > > > > Thanks,
> > > > > Andrew
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR tree-optimization/109154
> > > > > > * match.pd: Add new neg+abs rule.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > PR tree-optimization/109154
> > > > > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > > > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > > > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > > > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > > > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > > > > >
> > > > > > --- inline copy of patch --
> > > > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > > > >
> > > > >
> > >

Re: Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread juzhe.zh...@rivai.ai
>> Why add `can_create_pseudo_p ()` here? this will split after reload,
>> but we forbid that pattern between reload and split2?

I have no idea. Some Fortran tests just need recognition of the mem-to-mem
pattern before RA.
I don't know the reason.



juzhe.zh...@rivai.ai
 
From: Kito Cheng
Date: 2023-09-27 17:33
To: Juzhe-Zhong
CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc
Subject: Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]
>  (define_insn_and_split "*mov"
>[(set (match_operand:VLS_AVL_IMM 0 "reg_or_mem_operand" "=vr, m, vr")
> (match_operand:VLS_AVL_IMM 1 "reg_or_mem_operand" "  m,vr, vr"))]
>"TARGET_VECTOR
> -   && (register_operand (operands[0], mode)
> +   && (can_create_pseudo_p ()
 
Why add `can_create_pseudo_p ()` here? this will split after reload,
but we forbid that pattern between reload and split2?
 
> +   || register_operand (operands[0], mode)
> || register_operand (operands[1], mode))"
>"@
> #
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> index aedf98819bb..24bb7240db8 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> @@ -4,54 +4,6 @@
>
>  #include "def.h"
>
> -/*
> -** mov0:
> -** lbu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sb\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov0 (int8_t *in, int8_t *out)
> -{
> -  v1qi v = *(v1qi*)in;
> -  *(v1qi*)out = v;
> -}
> -
> -/*
> -** mov1:
> -** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov1 (int8_t *in, int8_t *out)
> -{
> -  v2qi v = *(v2qi*)in;
> -  *(v2qi*)out = v;
> -}
> -
> -/*
> -** mov2:
> -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov2 (int8_t *in, int8_t *out)
> -{
> -  v4qi v = *(v4qi*)in;
> -  *(v4qi*)out = v;
> -}
> -
> -/*
> -** mov3:
> -** ld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov3 (int8_t *in, int8_t *out)
> -{
> -  v8qi v = *(v8qi*)in;
> -  *(v8qi*)out = v;
> -}
> -
>  /*
>  ** mov4:
>  ** vsetivli\s+zero,\s*16,\s*e8,\s*mf8,\s*t[au],\s*m[au]
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> index 5e9615412b7..cae96b3be3f 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> @@ -4,18 +4,6 @@
>
>  #include "def.h"
>
> -/*
> -** mov0:
> -** fld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** fsd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov0 (double *in, double *out)
> -{
> -  v1df v = *(v1df*)in;
> -  *(v1df*)out = v;
> -}
> -
>  /*
>  ** mov1:
>  ** vsetivli\s+zero,\s*2,\s*e64,\s*m1,\s*t[au],\s*m[au]
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> deleted file mode 100644
> index 10ae1972db7..000
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> +++ /dev/null
> @@ -1,19 +0,0 @@
> -/* { dg-do compile } */
> -/* { dg-options "-march=rv32gcv_zvfh_zvl4096b -mabi=ilp32d -O3 
> -fno-schedule-insns -fno-schedule-insns2" } */
> -/* { dg-final { check-function-bodies "**" "" } } */
> -
> -#include "def.h"
> -
> -/*
> -** mov:
> -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** lw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov (int8_t *in, int8_t *out)
> -{
> -  v8qi v = *(v8qi*)in;
> -  *(v8qi*)out = v;
> -}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> index f2880ae5e77..86ce22896c5 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> @@ -4,42 +4,6 @@
>
>  #include "def.h"
>
> -/*
> -** mov0:
> -** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov0 (int16_t *in, int16_t *out)
> -{
> -  v1hi v = *(v1hi*)in;
> -  *(v1hi*)out = v;
> -}
> -
> -/*
> -** mov1:
> -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov1 (int16_t *in, int16_t *out)
> -{
> -  v2hi v = *(v2hi*)in;
> -  *(v2hi*)out = v;
> -}
> -
> -/*
> -** mov2:
> -** ld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov2 (int16_t *in, int16_t *out)
> -{
> -  v4hi v = *(v4hi*)in;
> -  *(v4hi*)out = v;
> -}
> -
>  /*
>  ** mov3:
>  ** vsetivli\s+zero,\s*8,\s*e16,\s*mf4,\s*t[au],\s*m[au]
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-4.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-4.c
> deleted 

RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]

2023-09-27 Thread Tamar Christina
> -Original Message-
> From: Tamar Christina 
> Sent: Wednesday, September 27, 2023 8:57 AM
> To: Richard Biener 
> Cc: Andrew Pinski ; gcc-patches@gcc.gnu.org; nd
> ; j...@ventanamicro.com
> Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 <<
> signbit(x)) [PR109154]
> 
> > -Original Message-
> > From: Richard Biener 
> > Sent: Wednesday, September 27, 2023 8:12 AM
> > To: Tamar Christina 
> > Cc: Andrew Pinski ; gcc-patches@gcc.gnu.org; nd
> > ; j...@ventanamicro.com
> > Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x
> > | (1 <<
> > signbit(x)) [PR109154]
> >
> > On Wed, 27 Sep 2023, Tamar Christina wrote:
> >
> > > > -Original Message-
> > > > From: Andrew Pinski 
> > > > Sent: Wednesday, September 27, 2023 2:17 AM
> > > > To: Tamar Christina 
> > > > Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de;
> > > > j...@ventanamicro.com
> > > > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x))
> > > > to x | (1 <<
> > > > signbit(x)) [PR109154]
> > > >
> > > > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina
> > > > 
> > > > wrote:
> > > > >
> > > > > Hi All,
> > > > >
> > > > > For targets that allow conversion between int and float modes
> > > > > this adds a new optimization transforming fneg (fabs (x)) into x
> > > > > | (1 << signbit(x)).  Such sequences are common in scientific
> > > > > code working with
> > > > gradients.
> > > > >
> > > > > The transformed instruction if the target has an inclusive-OR
> > > > > that takes an immediate is both shorter and faster.  For those
> > > > > that don't, the immediate has to be separately constructed, but this
> > > > > still ends up being faster as the immediate construction is not
> > > > > on the critical
> > path.
> > > > >
> > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > > >
> > > > > Ok for master?
> > > >
> > > > I think this should be part of isel instead of match.
> > > > Maybe we could use genmatch to generate the code that does the
> > > > transformations but this does not belong as part of match really.
> > >
> > > I disagree.. I don't think this belongs in isel. Isel is for
> > > structural
> > transformations.
> > > If there is a case for something else I'd imagine backwardprop is a
> > > better
> > choice.
> > >
> > > But I don't see why it doesn't belong here considering it *is* a
> > > mathematical optimization and the file has plenty of transformations
> > > such as mask optimizations and vector conditional rewriting.
> >
> > But the mathematical transform would more generally be fneg (fabs (x))
> > -> copysign (x, -1.) and that can be optimally expanded at RTL expansion
> time?
> 
> Ah sure, atm I did copysign (x, -1) -> x | 1 << signbits.  I can do it the 
> other way
> around.  And I guess since copysign (-x, y), copysign(|x|, y) -> copysign (x, 
> y)
> that should solve the trigonometry problem too.
> 
> Cool will do that instead, thanks!

Hmm this seems to conflict with the pattern

/* copysign(x, CST) -> [-]abs (x).  */
(for copysigns (COPYSIGN_ALL)
 (simplify
  (copysigns @0 REAL_CST@1)
  (if (REAL_VALUE_NEGATIVE (TREE_REAL_CST (@1)))
   (negate (abs @0))
   (abs @0

Which does the opposite transformation.

Should I try removing this?

Thanks,
Tamar

> 
> Tamar
> 
> >
> > Richard.
> >
> > > Regards,
> > > Tamar
> > >
> > > >
> > > > Thanks,
> > > > Andrew
> > > >
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > > PR tree-optimization/109154
> > > > > * match.pd: Add new neg+abs rule.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > PR tree-optimization/109154
> > > > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > > > >
> > > > > --- inline copy of patch --
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > > >
> > > >
> >
> 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> > > > 6cffad0f
> > > > > b17e1136600a 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -9476,3 +9476,57 @@ and,
> > > > >}
> > > > >(if (full_perm_p)
> > > > > (vec_perm (op@3 @0 @1) @3 @2))
> > > > > +
> > > > > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X).  */
> > > > > +
> > > > > +(simplify
> > > > > + (negate (abs @0))
> > > > > + (if (FLOAT_TYPE_P (type)
> > > > > +  /* We have to delay this rewriting till after forward
> > > > > +prop because
> > > > otherwise
> > > > > +it's harder to do 

Re: [PATCH V3] RISC-V: Remove mem-to-mem VLS move pattern[PR111566]

2023-09-27 Thread Kito Cheng
>  (define_insn_and_split "*mov"
>[(set (match_operand:VLS_AVL_IMM 0 "reg_or_mem_operand" "=vr, m, vr")
> (match_operand:VLS_AVL_IMM 1 "reg_or_mem_operand" "  m,vr, vr"))]
>"TARGET_VECTOR
> -   && (register_operand (operands[0], mode)
> +   && (can_create_pseudo_p ()

Why add `can_create_pseudo_p ()` here? this will split after reload,
but we forbid that pattern between reload and split2?

> +   || register_operand (operands[0], mode)
> || register_operand (operands[1], mode))"
>"@
> #
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> index aedf98819bb..24bb7240db8 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-1.c
> @@ -4,54 +4,6 @@
>
>  #include "def.h"
>
> -/*
> -** mov0:
> -** lbu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sb\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov0 (int8_t *in, int8_t *out)
> -{
> -  v1qi v = *(v1qi*)in;
> -  *(v1qi*)out = v;
> -}
> -
> -/*
> -** mov1:
> -** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov1 (int8_t *in, int8_t *out)
> -{
> -  v2qi v = *(v2qi*)in;
> -  *(v2qi*)out = v;
> -}
> -
> -/*
> -** mov2:
> -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov2 (int8_t *in, int8_t *out)
> -{
> -  v4qi v = *(v4qi*)in;
> -  *(v4qi*)out = v;
> -}
> -
> -/*
> -** mov3:
> -** ld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov3 (int8_t *in, int8_t *out)
> -{
> -  v8qi v = *(v8qi*)in;
> -  *(v8qi*)out = v;
> -}
> -
>  /*
>  ** mov4:
>  ** vsetivli\s+zero,\s*16,\s*e8,\s*mf8,\s*t[au],\s*m[au]
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> index 5e9615412b7..cae96b3be3f 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-10.c
> @@ -4,18 +4,6 @@
>
>  #include "def.h"
>
> -/*
> -** mov0:
> -** fld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** fsd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov0 (double *in, double *out)
> -{
> -  v1df v = *(v1df*)in;
> -  *(v1df*)out = v;
> -}
> -
>  /*
>  ** mov1:
>  ** vsetivli\s+zero,\s*2,\s*e64,\s*m1,\s*t[au],\s*m[au]
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> deleted file mode 100644
> index 10ae1972db7..000
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-2.c
> +++ /dev/null
> @@ -1,19 +0,0 @@
> -/* { dg-do compile } */
> -/* { dg-options "-march=rv32gcv_zvfh_zvl4096b -mabi=ilp32d -O3 
> -fno-schedule-insns -fno-schedule-insns2" } */
> -/* { dg-final { check-function-bodies "**" "" } } */
> -
> -#include "def.h"
> -
> -/*
> -** mov:
> -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** lw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,4\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov (int8_t *in, int8_t *out)
> -{
> -  v8qi v = *(v8qi*)in;
> -  *(v8qi*)out = v;
> -}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> index f2880ae5e77..86ce22896c5 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-3.c
> @@ -4,42 +4,6 @@
>
>  #include "def.h"
>
> -/*
> -** mov0:
> -** lhu\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sh\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov0 (int16_t *in, int16_t *out)
> -{
> -  v1hi v = *(v1hi*)in;
> -  *(v1hi*)out = v;
> -}
> -
> -/*
> -** mov1:
> -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov1 (int16_t *in, int16_t *out)
> -{
> -  v2hi v = *(v2hi*)in;
> -  *(v2hi*)out = v;
> -}
> -
> -/*
> -** mov2:
> -** ld\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** sd\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -**  ret
> -*/
> -void mov2 (int16_t *in, int16_t *out)
> -{
> -  v4hi v = *(v4hi*)in;
> -  *(v4hi*)out = v;
> -}
> -
>  /*
>  ** mov3:
>  ** vsetivli\s+zero,\s*8,\s*e16,\s*mf4,\s*t[au],\s*m[au]
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-4.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-4.c
> deleted file mode 100644
> index f81f1697d65..000
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mov-4.c
> +++ /dev/null
> @@ -1,19 +0,0 @@
> -/* { dg-do compile } */
> -/* { dg-options "-march=rv32gcv_zvfh_zvl4096b -mabi=ilp32d -O3 
> -fno-schedule-insns -fno-schedule-insns2" } */
> -/* { dg-final { check-function-bodies "**" "" } } */
> -
> -#include "def.h"
> -
> -/*
> -** mov:
> -** lw\s+[a-x0-9]+,0\s*\([a-x0-9]+\)
> -** 

committed [RISC-V]: Harden test scan patterns

2023-09-27 Thread Joern Rennecke
I got tired of scan tests failing when they have an underspecified
pattern that matches LTO information, so I did a global replace for
the most common form of such scan patterns in the gcc.target/riscv
testsuite.

regression tested for:
riscv-sim

riscv-sim/-march=rv32gcv_zfh/-mabi=ilp32d/-ftree-vectorize/--param=riscv-autovec-preference=scalable
riscv-sim/-march=rv32imac/-mabi=ilp32

riscv-sim/-march=rv64gcv_zfh_zvfh_zba_zbb_zbc_zicond_zicboz_zawrs/-mabi=lp64d/-ftree-vectorize/--param=riscv-autovec-preferenc
e=scalable
riscv-sim/-march=rv64imac/-mabi=lp64

Committed as obvious.
commit d326bb6d7588425d013791299272f913fb23e56d
Author: Joern Rennecke 
Date:   Wed Sep 27 10:05:13 2023 +0100

Harden scan patterns with a bit of scripting:

$ egrep -r 'scan-assembler(|-not|-times) "[[:alnum:].]{1,7}"' riscv
$ egrep -rl 'scan-assembler(|-not|-times) "[[:alnum:].]{1,7}"' riscv > files
$ cat edcmds
g/\(scan-assembler\(\|-not\|-times\) 
\+\)"\([[:alnum:]]\{1,5\}\)\.\([[:alpha:].]\{1,3\}\)"/s//\1{\\m\3\\.\4\\M}/
g/\(scan-assembler\(\|-not\|-times\) 
\+\)"\([[:alnum:]]\{1,7\}\)"/s//\1{\\m\3}/
w
q
$ sed 's/.*/ed & < edcmds/' < files > tmp
$ source tmp

gcc/testsuite/
* gcc.target/riscv/shift-shift-1.c: Avoid spurious pattern matches.
* gcc.target/riscv/shift-shift-3.c: Likewise.
* gcc.target/riscv/zba-shNadd-01.c: Likewise.
* gcc.target/riscv/zba-shNadd-02.c: Likewise.
* gcc.target/riscv/zbb-andn-orn-xnor-01.c: Likewise.
* gcc.target/riscv/zbb-andn-orn-xnor-02.c: Likewise.
* gcc.target/riscv/zbb-min-max.c: Likewise.
* gcc.target/riscv/zero-extend-1.c: Likewise.
* gcc.target/riscv/zero-extend-2.c: Likewise.
* gcc.target/riscv/zero-extend-3.c: Likewise.
* gcc.target/riscv/zero-extend-4.c: Likewise.
* gcc.target/riscv/zero-extend-5.c: Likewise.
* gcc.target/riscv/_Float16-soft-2.c: Likewise.
* gcc.target/riscv/_Float16-soft-3.c: Likewise.
* gcc.target/riscv/_Float16-zfh-1.c: Likewise.
* gcc.target/riscv/_Float16-zfh-2.c: Likewise.
* gcc.target/riscv/_Float16-zfh-3.c: Likewise.
* gcc.target/riscv/and-extend-1.c: Likewise.
* gcc.target/riscv/and-extend-2.c: Likewise.
* gcc.target/riscv/pr108987.c: Likewise.
* gcc.target/riscv/ret-1.c: Likewise.
* gcc.target/riscv/rvv/autovec/align-1.c: Likewise.
* gcc.target/riscv/rvv/autovec/align-2.c: Likewise.
* gcc.target/riscv/zba-shNadd-04.c: Likewise.
* gcc.target/riscv/zba-shNadd-07.c: Likewise.
* gcc.target/riscv/zbb-rol-ror-02.c: Likewise.
* gcc.target/riscv/zbbw.c: Likewise.
* gcc.target/riscv/zbc32.c: Likewise.
* gcc.target/riscv/zbc64.c: Likewise.
* gcc.target/riscv/zbkb32.c: Likewise.
* gcc.target/riscv/zbkb64.c: Likewise.
* gcc.target/riscv/zbkc32.c: Likewise.
* gcc.target/riscv/zbkc64.c: Likewise.
* gcc.target/riscv/zbkx32.c: Likewise.
* gcc.target/riscv/zbkx64.c: Likewise.
* gcc.target/riscv/zfa-fleq-fltq.c: Likewise.
* gcc.target/riscv/zfa-fli-zfh.c: Likewise.
* gcc.target/riscv/zfa-fli.c: Likewise.
* gcc.target/riscv/zknd64.c: Likewise.
* gcc.target/riscv/zksed32.c: Likewise.
* gcc.target/riscv/zksed64.c: Likewise.
* gcc.target/riscv/zksh32.c: Likewise.
* gcc.target/riscv/zksh64.c: Likewise.
* gcc.target/riscv/_Float16-soft-1.c: Likewise.
* gcc.target/riscv/_Float16-zfhmin-1.c: Likewise.
* gcc.target/riscv/_Float16-zfhmin-2.c: Likewise.
* gcc.target/riscv/_Float16-zfhmin-3.c: Likewise.
* gcc.target/riscv/_Float16-zhinxmin-1.c: Likewise.
* gcc.target/riscv/_Float16-zhinxmin-2.c: Likewise.
* gcc.target/riscv/_Float16-zhinxmin-3.c: Likewise.
* gcc.target/riscv/fle-ieee.c: Likewise.
* gcc.target/riscv/fle-snan.c: Likewise.
* gcc.target/riscv/flef-ieee.c: Likewise.
* gcc.target/riscv/flef-snan.c: Likewise.
* gcc.target/riscv/flt-ieee.c: Likewise.
* gcc.target/riscv/flt-snan.c: Likewise.
* gcc.target/riscv/fltf-ieee.c: Likewise.
* gcc.target/riscv/fltf-snan.c: Likewise.
* gcc.target/riscv/interrupt-1.c: Likewise.
* gcc.target/riscv/interrupt-mmode.c: Likewise.
* gcc.target/riscv/interrupt-smode.c: Likewise.
* gcc.target/riscv/interrupt-umode.c: Likewise.
* gcc.target/riscv/pr106888.c: Likewise.
* gcc.target/riscv/pr89835.c: Likewise.
* gcc.target/riscv/shift-and-1.c: Likewise.
* gcc.target/riscv/shift-and-2.c: Likewise.
  

[PATCH v5] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-27 Thread Manos Anagnostakis
This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
to provide the requested behaviour for handling ldp and stp:

  /* Allow the tuning structure to disable LDP instruction formation
 from combining instructions (e.g., in peephole2).
 TODO: Implement fine-grained tuning control for LDP and STP:
   1. control policies for load and store separately;
   2. support the following policies:
  - default (use what is in the tuning structure)
  - always
  - never
  - aligned (only if the compiler can prove that the
load will be aligned to 2 * element_size)  */

It provides two new and concrete target-specific command-line parameters
-param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
to give the ability to control load and store policies separately as
stated in part 1 of the TODO.

The accepted values for both parameters are:
- default: Use the policy of the tuning structure (default).
- always: Emit ldp/stp regardless of alignment.
- never: Do not emit ldp/stp.
- aligned: In order to emit ldp/stp, first check if the load/store will
  be aligned to 2 * element_size.

Bootstrapped and regtested aarch64-linux.

gcc/ChangeLog:
* config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
enum type.
(enum aarch64_stp_policy): New enum type.
* config/aarch64/aarch64-protos.h (struct tune_params): Add
appropriate enums for the policies.
(aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
options.
* config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
function to parse ldp-policy parameter.
(aarch64_parse_stp_policy): New function to parse stp-policy parameter.
(aarch64_override_options_internal): Call parsing functions.
(aarch64_mem_ok_with_ldpstp_policy_model): New function.
(aarch64_operands_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
(aarch64_operands_adjust_ok_for_ldpstp): Add call to
aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
check and alignment check and remove superseded ones.
* config/aarch64/aarch64.opt (aarch64-ldp-policy): New param.
(aarch64-stp-policy): New param.
* doc/invoke.texi: Document the parameters accordingly.

gcc/testsuite/ChangeLog:
* gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
* gcc.target/aarch64/ldp_aligned.c: New test.
* gcc.target/aarch64/ldp_always.c: New test.
* gcc.target/aarch64/ldp_never.c: New test.
* gcc.target/aarch64/stp_aligned.c: New test.
* gcc.target/aarch64/stp_always.c: New test.
* gcc.target/aarch64/stp_never.c: New test.

Signed-off-by: Manos Anagnostakis 
---
Changes in v5:
- Adjust ChangeLog for aarch64.opt.

 gcc/config/aarch64/aarch64-opts.h |  16 ++
 gcc/config/aarch64/aarch64-protos.h   |  25 +++
 gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
 gcc/config/aarch64/aarch64.cc | 212 +-
 gcc/config/aarch64/aarch64.opt|  38 
 gcc/doc/invoke.texi   |  20 ++
 .../aarch64/ampere1-no_ldp_combine.c  |  11 -
 .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
 gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
 .../gcc.target/aarch64/stp_aligned.c  |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
 gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
 13 files changed, 632 insertions(+), 76 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.target/aarch64/ampere1-no_ldp_combine.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_never.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_aligned.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_always.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/stp_never.c

diff --git a/gcc/config/aarch64/aarch64-opts.h 
b/gcc/config/aarch64/aarch64-opts.h
index 7e8f1babed8..db8348507a3 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -108,4 +108,20 @@ enum aarch64_key_type {
   AARCH64_KEY_B
 };
 
+/* Load pair policy type.  */
+enum aarch64_ldp_policy {
+  LDP_POLICY_DEFAULT,
+  LDP_POLICY_ALWAYS,
+  LDP_POLICY_NEVER,
+  LDP_POLICY_ALIGNED
+};
+
+/* Store pair policy type.  */
+enum aarch64_stp_policy {
+  STP_POLICY_DEFAULT,
+  STP_POLICY_ALWAYS,
+  STP_POLICY_NEVER,
+  STP_POLICY_ALIGNED
+};
+
 #endif
diff --git 

Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-27 Thread Richard Biener
On Wed, 27 Sep 2023, Andre Vieira (lists) wrote:

> 
> 
> On 26/09/2023 17:37, Andrew Stubbs wrote:
> > I don't have authority to approve anything, but here's a review anyway.
> > 
> > Thanks for working on this.
> 
> Thank you for reviewing and apologies for the mess of a patch, may have rushed
> it ;)
> > 
> >> diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
> >> b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
> >> new file mode 100644
> >> index
> >> ..09127b8cb6f2e3699b6073591f58be7047330273
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
> >> @@ -0,0 +1,23 @@
> >> +/* { dg-require-effective-target vect_simd_clones } */
> >> +/* { dg-do compile } */
> >> +/* { dg-additional-options "-fopenmp-simd" } */
> >> +
> > 
> > Do you need -fopenmp-simd for this?
> Nope, I keep forgetting that you only need it for pragmas.
> 
> Dealt with the other comments too.

The patch is OK.

> Any thoughts on changing gimple_call_internal_fn  instead? My main argument
> against is that IFN_MASK_CALL should not appear outside of ifconvert and
> vectorizer. On the other hand, we may inspect the flags elsewhere in the
> vectorizer (now or in the future) and changing gimple_call_internal_fn would
> prevent the need to handle the IFN separately elsewhere.

But gimple_call_internal_fn is only half of the work since arguments are
shifted.  So I think handling this in if-conversion and the vectorizer
is the right thing as it's a very short-lived IFN.

Richard.

> Kind Regards,
> Andre
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-27 Thread Manos Anagnostakis
Thanks Kyrill!

Submitting the obvious v5.

Manos.

On Wed, Sep 27, 2023 at 11:40 AM Kyrylo Tkachov 
wrote:

> Hi Manos,
>
> > -Original Message-
> > From: Manos Anagnostakis 
> > Sent: Tuesday, September 26, 2023 2:52 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Kyrylo Tkachov ; Tamar Christina
> > ; Philipp Tomsich ;
> > Manos Anagnostakis 
> > Subject: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp
> > formation.
> >
> > This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> > to provide the requested behaviour for handling ldp and stp:
> >
> >   /* Allow the tuning structure to disable LDP instruction formation
> >  from combining instructions (e.g., in peephole2).
> >  TODO: Implement fine-grained tuning control for LDP and STP:
> >1. control policies for load and store separately;
> >2. support the following policies:
> >   - default (use what is in the tuning structure)
> >   - always
> >   - never
> >   - aligned (only if the compiler can prove that the
> > load will be aligned to 2 * element_size)  */
> >
> > It provides two new and concrete target-specific command-line parameters
> > -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> > to give the ability to control load and store policies separately as
> > stated in part 1 of the TODO.
> >
> > The accepted values for both parameters are:
> > - default: Use the policy of the tuning structure (default).
> > - always: Emit ldp/stp regardless of alignment.
> > - never: Do not emit ldp/stp.
> > - aligned: In order to emit ldp/stp, first check if the load/store will
> >   be aligned to 2 * element_size.
> >
> > Bootstrapped and regtested aarch64-linux.
> >
> > gcc/ChangeLog:
> > * config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
> >   enum type.
> > (enum aarch64_stp_policy): New enum type.
> > * config/aarch64/aarch64-protos.h (struct tune_params): Add
> >   appropriate enums for the policies.
> >   (aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
> > * config/aarch64/aarch64-tuning-flags.def
> >   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
> >   options.
> > * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
> >   function to parse ldp-policy parameter.
> > (aarch64_parse_stp_policy): New function to parse stp-policy
> parameter.
> > (aarch64_override_options_internal): Call parsing functions.
> >   (aarch64_mem_ok_with_ldpstp_policy_model): New function.
> > (aarch64_operands_ok_for_ldpstp): Add call to
> >   aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
> >   check and alignment check and remove superseded ones.
> > (aarch64_operands_adjust_ok_for_ldpstp): Add call to
> > aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
> >   check and alignment check and remove superseded ones.
> > * config/aarch64/aarch64.opt: Add parameters.
> >   * doc/invoke.texi: Document the parameters accordingly.
>
> The ChangeLog entry should name the new parameters. For example:
> * config/aarch64/aarch64.opt (aarch64-ldp-policy): New param.
>
> Ok with the fixed ChangeLog.
> Thank you for the work!
> Kyrill
>
> >
> > gcc/testsuite/ChangeLog:
> >   * gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
> > * gcc.target/aarch64/ldp_aligned.c: New test.
> > * gcc.target/aarch64/ldp_always.c: New test.
> > * gcc.target/aarch64/ldp_never.c: New test.
> > * gcc.target/aarch64/stp_aligned.c: New test.
> > * gcc.target/aarch64/stp_always.c: New test.
> > * gcc.target/aarch64/stp_never.c: New test.
> >
> > Signed-off-by: Manos Anagnostakis 
> > ---
> > Changes in v4:
> > - Changed the parameters to accept enum instead of an
> >   integer and updated documentation in doc/invoke.texi.
> > - Packed all the new checks in aarch64_operands_ok_for_ldpstp/
> >   aarch64_operands_adjust_ok_for_ldpstp in a new function
> >   called aarch64_mem_ok_with_ldpstp_policy_model.
> >
> >  gcc/config/aarch64/aarch64-opts.h |  16 ++
> >  gcc/config/aarch64/aarch64-protos.h   |  25 +++
> >  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
> >  gcc/config/aarch64/aarch64.cc | 212 +-
> >  gcc/config/aarch64/aarch64.opt|  38 
> >  gcc/doc/invoke.texi   |  20 ++
> >  .../aarch64/ampere1-no_ldp_combine.c  |  11 -
> >  .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
> >  gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
> >  gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
> >  .../gcc.target/aarch64/stp_aligned.c  |  60 +
> >  gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
> >  

Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-27 Thread Andrew Stubbs

On 27/09/2023 08:56, Andre Vieira (lists) wrote:



On 26/09/2023 17:37, Andrew Stubbs wrote:

I don't have authority to approve anything, but here's a review anyway.

Thanks for working on this.


Thank you for reviewing and apologies for the mess of a patch, may have 
rushed it ;)


diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c

new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273

--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+


Do you need -fopenmp-simd for this?

Nope, I keep forgetting that you only need it for pragmas.

Dealt with the other comments too.

Any thoughts on changing gimple_call_internal_fn  instead? My main 
argument against is that IFN_MASK_CALL should not appear outside of 
ifconvert and vectorizer. On the other hand, we may inspect the flags 
elsewhere in the vectorizer (now or in the future) and changing 
gimple_call_internal_fn would prevent the need to handle the IFN 
separately elsewhere.


Sorry, I haven't looked closely enough to have an opinion on that, or 
what the side-effects might be.


You have a solution, and like you said, this should be the only case.

I have no further comments on this patch. :)

Andrew


Re: [PATCH V4 2/2] rs6000: use mtvsrws to move sf from si p9

2023-09-27 Thread Kewen.Lin
Hi Jeff,

on 2023/8/30 15:43, Jiufu Guo wrote:
> Hi,
> 
> As mentioned in PR108338, on p9, we could use mtvsrws to implement
> the bitcast from SI to SF (or lowpart DI to SF).
> 
> For code:
>   *(long long*)buff = di;
>   float f = *(float*)(buff);
> 
> "sldi 9,3,32 ; mtvsrd 1,9 ; xscvspdpn 1,1" is generated.
> A better one would be "mtvsrws 1,3 ; xscvspdpn 1,1".
> 
> Compare with previous patch:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623533.html
> "highpart DI-->SF" is put to a seperate patch.
> 
> Pass bootstrap and regression on ppc64{,le}.
> Is this ok for trunk?
> 
> BR,
> Jeff (Jiufu Guo)
> 
Nit: Missing a PR marker line.

> gcc/ChangeLog:
> 
>   * config/rs6000/rs6000.md (movsf_from_si): Update to generate mtvsrws
>   for P9.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/powerpc/pr108338.c: Updated to check mtvsrws for p9.
> 
> ---
>  gcc/config/rs6000/rs6000.md | 25 -
>  gcc/testsuite/gcc.target/powerpc/pr108338.c |  6 +++--
>  2 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index 
> 8c92cbf976de915136ad5dba24e69a363d21438d..c03e677bca79e8fb1acb276d07d0acfae009f6d8
>  100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -8280,13 +8280,26 @@ (define_insn_and_split "movsf_from_si"
>  {
>rtx op0 = operands[0];
>rtx op1 = operands[1];
> -  rtx op2 = operands[2];
> -  rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
> 
> -  /* Move SF value to upper 32-bits for xscvspdpn.  */
> -  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
> -  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
> -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
> +  /* Move lowpart 32-bits from register for SFmode.  */
> +  if (TARGET_P9_VECTOR)
> +{
> +  /* Using mtvsrws;xscvspdpn.  */
> +  rtx op0_v = gen_rtx_REG (V4SImode, REGNO (op0));
> +  emit_insn (gen_vsx_splat_v4si (op0_v, op1));
> +  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
> +}
> +  else
> +{
> +  rtx op2 = operands[2];
> +  rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
> +
> +  /* Using ashl;mtvsrd;xscvspdpn.  */

Nit: Use sldi instead of ashl as the others are actual
mnemonics but ashl isn't.

> +  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
> +  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
> +  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
> +}
> +
>DONE;
>  }
>[(set_attr "length"
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108338.c 
> b/gcc/testsuite/gcc.target/powerpc/pr108338.c
> index 
> 6db65595343c2407fc32f68f5f52a1f7196c371d..0565e5254ed0a8cc579cf505a3f865426dcf62ae
>  100644
> --- a/gcc/testsuite/gcc.target/powerpc/pr108338.c
> +++ b/gcc/testsuite/gcc.target/powerpc/pr108338.c
> @@ -19,9 +19,11 @@ float  __attribute__ ((noipa)) sf_from_di_off4 (long long 
> l)
> 
>  /* Under lp64, parameter 'l' is in one DI reg, then bitcast sub DI to SF. */
>  /* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 { target { lp64 && 
> has_arch_pwr8 } } } } */
> -/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { lp64 && 
> has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { lp64 && { 
> has_arch_pwr8 && { ! has_arch_pwr9 } } } } } } */
> +/* { dg-final { scan-assembler-times {\msldi\M} 1 { target { lp64 && { 
> has_arch_pwr8 && { ! has_arch_pwr9 } } } } } } */
> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 1 { target { lp64 && 
> has_arch_pwr9 } } } } */
> +/* { dg-final { scan-assembler-times {\mmtvsrws\M} 1 { target { lp64 && 
> has_arch_pwr9 } } } } */
>  /* { dg-final { scan-assembler-times {\mrldicr\M} 1 { target { lp64 && 
> has_arch_pwr8 } } } } */
> -/* { dg-final { scan-assembler-times {\msldi\M} 1 { target { lp64 && 
> has_arch_pwr8 } } } } */
> 

This part might need a fresh as the comments to patch 1/2.

The others look good to me, thanks!

BR,
Kewen


RE: [PATCH]AArch64: Use SVE unpredicated LOGICAL expressions when Advanced SIMD inefficient [PR109154]

2023-09-27 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Wednesday, September 27, 2023 9:50 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; Kyrylo Tkachov 
> Subject: Re: [PATCH]AArch64: Use SVE unpredicated LOGICAL expressions
> when Advanced SIMD inefficient [PR109154]
> 
> Tamar Christina  writes:
> > Hi All,
> >
> > SVE has much bigger immediate encoding range for bitmasks than
> > Advanced SIMD has and so on a system that is SVE capable if we need an
> > Advanced SIMD Inclusive-OR by immediate and would require a reload then
> an unpredicated SVE ORR instead.
> >
> > This has both speed and size improvements.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/109154
> > * config/aarch64/aarch64.md (3): Convert to new
> syntax and
> > SVE split case.
> > * config/aarch64/iterators.md (VCONV, vconv): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR tree-optimization/109154
> > * gcc.target/aarch64/sve/fneg-abs_2.c: Updated.
> > * gcc.target/aarch64/sve/fneg-abs_4.c: Updated.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64.md
> > b/gcc/config/aarch64/aarch64.md index
> >
> 60c92213c75a2a4c18a6b59ae52fe45d1e872718..377c5cafedd43d8d13204
> 89a3626
> > 7cc6e5f15239 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -4551,17 +4551,27 @@ (define_insn_and_split
> "*aarch64_and_imm2"
> >}
> >  )
> >
> > -(define_insn "3"
> > -  [(set (match_operand:GPI 0 "register_operand" "=r,rk,w")
> > -   (LOGICAL:GPI (match_operand:GPI 1 "register_operand" "%r,r,w")
> > -(match_operand:GPI 2 "aarch64_logical_operand"
> "r,,w")))]
> > -  ""
> > -  "@
> > -  \\t%0, %1, %2
> > -  \\t%0, %1, %2
> > -  \\t%0., %1., %2."
> > -  [(set_attr "type" "logic_reg,logic_imm,neon_logic")
> > -   (set_attr "arch" "*,*,simd")]
> > +(define_insn_and_split "3"
> > +  [(set (match_operand:GPI 0 "register_operand")
> > +   (LOGICAL:GPI (match_operand:GPI 1 "register_operand")
> > +(match_operand:GPI 2 "aarch64_logical_operand")))]
> > +  ""
> > +  {@ [cons: =0, 1, 2; attrs: type, arch]
> > + [r , %r, r   ; logic_reg , *   ] \t%0, %1, %2
> > + [rk, r , ; logic_imm , *   ] \t%0, %1, %2
> > + [w , 0 , ; * , sve ] #
> > + [w , w , w   ; neon_logic, simd] \t%0., 
> > %1.,
> %2.
> > +  }
> > +  "&& TARGET_SVE && rtx_equal_p (operands[0], operands[1])
> > +   && satisfies_constraint_ (operands[2])
> > +   && FP_REGNUM_P (REGNO (operands[0]))"
> > +  [(const_int 0)]
> > +  {
> > +rtx op1 = lowpart_subreg (mode, operands[1], mode);
> > +rtx op2 = gen_const_vec_duplicate (mode, operands[2]);
> > +emit_insn (gen_3 (op1, op1, op2));
> > +DONE;
> > +  }
> >  )
> 
> The WIP SME patches add a %Z modifier for 'z' register prefixes, similarly to
> b/h/s/d for scalar FP.  With that I think the alternative can be:
> 
>  [w , 0 , ; * , sve ] \t%Z0., %Z0., #%2
> 
> although it would be nice to keep the hex constant.

My original patch added a %u for (undecorated) which just prints the register
number and changed %C to also accept a single constant instead of only a 
uniform vector.

But I figured you wouldn't like that? 

Cheers,
Tamar

> 
> Will try to post the patches up to that part soon.
> 
> Thanks,
> Richard
> 
> >
> >  ;; zero_extend version of above
> > diff --git a/gcc/config/aarch64/iterators.md
> > b/gcc/config/aarch64/iterators.md index
> >
> d17becc37e230684beaee3c69e2a0f0ce612eda5..568cd5d1a3a9e00475376
> 177ad13
> > de72609df3d8 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -1432,6 +1432,11 @@ (define_mode_attr VCONQ [(V8QI "V16QI")
> (V16QI "V16QI")
> >  (HI   "V8HI") (QI   "V16QI")
> >  (SF   "V4SF") (DF   "V2DF")])
> >
> > +;; 128-bit container modes for the lower part of an SVE vector to the
> > +inner or ;; scalar source mode.
> > +(define_mode_attr VCONV [(SI "VNx4SI") (DI "VNx2DI")])
> > +(define_mode_attr vconv [(SI "vnx4si") (DI "vnx2di")])
> > +
> >  ;; Half modes of all vector modes.
> >  (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
> >  (V4HI "V2HI")  (V8HI  "V4HI")
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > index
> >
> a60cd31b9294af2dac69eed1c93f899bd5c78fca..fe9f27bf91b8fb18205a589
> 1a5d5
> > e847a5d88e4b 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> > @@ -7,8 +7,7 @@
> >
> >  /*
> >  ** f1:
> > -** moviv[0-9]+.2s, 0x80, lsl 24
> > -** orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> > +** orr z0.s, z0.s, #0x8000
> >  ** ret
> >  */
> >  float32_t f1 (float32_t a)

Re: [PATCH V4 1/2] rs6000: optimize moving to sf from highpart di

2023-09-27 Thread Kewen.Lin
Hi Jeff,

on 2023/8/30 15:43, Jiufu Guo wrote:
> Hi,
> 
> Currently, we have the pattern "movsf_from_si2" which was trying
> to support moving high part DI to SF.
> 
> The pattern looks like: XX:SF=bitcast:SF(subreg(YY:DI>>32),0)
> It only accepts the "ashiftrt" for ">>", but "lshiftrt" is also ok.
> And the offset of "subreg" is hard code 0, which only works for LE.
> 
> "movsf_from_si2" is updated to cover BE for "subreg", and cover
> the logical shift for ":DI>>32".
> 
> Pass bootstrap and regression on ppc64{,le}.
> Is this ok for trunk?
> 
> BR,
> Jeff (Jiufu Guo)
> 
>   PR target/108338
> 
> gcc/ChangeLog:
> 
>   * config/rs6000/predicates.md (lowpart_subreg_operator): New
>   define_predicate.
>   * config/rs6000/rs6000.md (any_rshift): New code_iterator.
>   (movsf_from_si2): Rename to ...
>   (movsf_from_si2_): ... this.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/powerpc/pr108338.c: New test.
> 
> ---
>  gcc/config/rs6000/predicates.md |  5 +++
>  gcc/config/rs6000/rs6000.md | 11 +++---
>  gcc/testsuite/gcc.target/powerpc/pr108338.c | 40 +
>  3 files changed, 51 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108338.c
> 
> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
> index 
> 3552d908e9d149a30993e3e6568466de537336be..e25b3b4864f681d47e9d5c2eb88bcde0aea6d17b
>  100644
> --- a/gcc/config/rs6000/predicates.md
> +++ b/gcc/config/rs6000/predicates.md
> @@ -2098,3 +2098,8 @@ (define_predicate "macho_pic_address"
>else
>  return false;
>  })
> +
> +(define_predicate "lowpart_subreg_operator"
> +  (and (match_code "subreg")
> +   (match_test "subreg_lowpart_offset (mode, GET_MODE (SUBREG_REG (op)))
> + == SUBREG_BYTE (op)")))
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index 
> 1a9a7b1a47918f39fc91038607f21a8ba9a2e740..8c92cbf976de915136ad5dba24e69a363d21438d
>  100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -8299,18 +8299,19 @@ (define_insn_and_split "movsf_from_si"
>   "*,  *, p9v,   p8v,   *, *,
>p8v,p8v,   p8v,   *")])
> 
> +(define_code_iterator any_rshift [ashiftrt lshiftrt])

Nit: Could we name this as any_shiftrt instead and move this close to the
existing any_* code_iterator?

> +
>  ;; For extracting high part element from DImode register like:
>  ;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>  ;; split it before reload with "and mask" to avoid generating shift right
>  ;; 32 bit then shift left 32 bit.
> -(define_insn_and_split "movsf_from_si2"
> +(define_insn_and_split "movsf_from_si2_"
>[(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>   (unspec:SF
> -  [(subreg:SI
> -(ashiftrt:DI
> +  [(match_operator:SI 3 "lowpart_subreg_operator"
> +[(any_rshift:DI
>   (match_operand:DI 1 "input_operand" "r")
> - (const_int 32))
> -0)]
> + (const_int 32))])]
>UNSPEC_SF_FROM_SI))
>(clobber (match_scratch:DI 2 "=r"))]
>"TARGET_NO_SF_SUBREG"
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108338.c 
> b/gcc/testsuite/gcc.target/powerpc/pr108338.c
> new file mode 100644
> index 
> ..6db65595343c2407fc32f68f5f52a1f7196c371d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr108338.c
> @@ -0,0 +1,40 @@
> +// { dg-do run }
> +// { dg-options "-O2 -save-temps" }

I think we need to check effective target hard_float to ensure
the expected assembly? 

> +
> +float __attribute__ ((noipa)) sf_from_di_off0 (long long l)
> +{
> +  char buff[16];
> +  *(long long*)buff = l;
> +  float f = *(float*)(buff);
> +  return f;
> +}
> +
> +float  __attribute__ ((noipa)) sf_from_di_off4 (long long l)
> +{
> +  char buff[16];
> +  *(long long*)buff = l;
> +  float f = *(float*)(buff + 4);
> +  return f; 
> +}
> +

IIUC, this patch is to deal with high 32-bit, but why you proposed
two functions is due to endianness difference, then could we use
endianness macro like __LITTLE_ENDIAN__ to simplify the corresponding
offset value (0 on BE, 4 on LE)?  so that we have only function and
IMHO it's more focused.

> +/* Under lp64, parameter 'l' is in one DI reg, then bitcast sub DI to SF. */
> +/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 { target { lp64 && 
> has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { lp64 && 
> has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-times {\mrldicr\M} 1 { target { lp64 && 
> has_arch_pwr8 } } } } */
> +/* { dg-final { scan-assembler-times {\msldi\M} 1 { target { lp64 && 
> has_arch_pwr8 } } } } */
> +

Nit: Could you move this to the end of this file or closely
follow the top dg-*?

The others look good to me, thanks!

BR,
Kewen

> +union 

Re: [PATCH]AArch64: Use SVE unpredicated LOGICAL expressions when Advanced SIMD inefficient [PR109154]

2023-09-27 Thread Richard Sandiford
Tamar Christina  writes:
> Hi All,
>
> SVE has much bigger immediate encoding range for bitmasks than Advanced SIMD 
> has
> and so on a system that is SVE capable if we need an Advanced SIMD 
> Inclusive-OR
> by immediate and would require a reload then an unpredicated SVE ORR instead.
>
> This has both speed and size improvements.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   PR tree-optimization/109154
>   * config/aarch64/aarch64.md (3): Convert to new syntax and
>   SVE split case.
>   * config/aarch64/iterators.md (VCONV, vconv): New.
>
> gcc/testsuite/ChangeLog:
>
>   PR tree-optimization/109154
>   * gcc.target/aarch64/sve/fneg-abs_2.c: Updated.
>   * gcc.target/aarch64/sve/fneg-abs_4.c: Updated.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> 60c92213c75a2a4c18a6b59ae52fe45d1e872718..377c5cafedd43d8d1320489a36267cc6e5f15239
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4551,17 +4551,27 @@ (define_insn_and_split "*aarch64_and_imm2"
>}
>  )
>  
> -(define_insn "3"
> -  [(set (match_operand:GPI 0 "register_operand" "=r,rk,w")
> - (LOGICAL:GPI (match_operand:GPI 1 "register_operand" "%r,r,w")
> -  (match_operand:GPI 2 "aarch64_logical_operand" 
> "r,,w")))]
> -  ""
> -  "@
> -  \\t%0, %1, %2
> -  \\t%0, %1, %2
> -  \\t%0., %1., %2."
> -  [(set_attr "type" "logic_reg,logic_imm,neon_logic")
> -   (set_attr "arch" "*,*,simd")]
> +(define_insn_and_split "3"
> +  [(set (match_operand:GPI 0 "register_operand")
> + (LOGICAL:GPI (match_operand:GPI 1 "register_operand")
> +  (match_operand:GPI 2 "aarch64_logical_operand")))]
> +  ""
> +  {@ [cons: =0, 1, 2; attrs: type, arch]
> + [r , %r, r   ; logic_reg , *   ] \t%0, %1, %2
> + [rk, r , ; logic_imm , *   ] \t%0, %1, %2
> + [w , 0 , ; * , sve ] #
> + [w , w , w   ; neon_logic, simd] \t%0., 
> %1., %2.
> +  }
> +  "&& TARGET_SVE && rtx_equal_p (operands[0], operands[1])
> +   && satisfies_constraint_ (operands[2])
> +   && FP_REGNUM_P (REGNO (operands[0]))"
> +  [(const_int 0)]
> +  {
> +rtx op1 = lowpart_subreg (mode, operands[1], mode);
> +rtx op2 = gen_const_vec_duplicate (mode, operands[2]);
> +emit_insn (gen_3 (op1, op1, op2));
> +DONE;
> +  }
>  )

The WIP SME patches add a %Z modifier for 'z' register prefixes,
similarly to b/h/s/d for scalar FP.  With that I think the alternative
can be:

 [w , 0 , ; * , sve ] \t%Z0., %Z0., #%2

although it would be nice to keep the hex constant.

Will try to post the patches up to that part soon.

Thanks,
Richard

>  
>  ;; zero_extend version of above
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 
> d17becc37e230684beaee3c69e2a0f0ce612eda5..568cd5d1a3a9e00475376177ad13de72609df3d8
>  100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1432,6 +1432,11 @@ (define_mode_attr VCONQ [(V8QI "V16QI") (V16QI "V16QI")
>(HI   "V8HI") (QI   "V16QI")
>(SF   "V4SF") (DF   "V2DF")])
>  
> +;; 128-bit container modes for the lower part of an SVE vector to the inner 
> or
> +;; scalar source mode.
> +(define_mode_attr VCONV [(SI "VNx4SI") (DI "VNx2DI")])
> +(define_mode_attr vconv [(SI "vnx4si") (DI "vnx2di")])
> +
>  ;; Half modes of all vector modes.
>  (define_mode_attr VHALF [(V8QI "V4QI")  (V16QI "V8QI")
>(V4HI "V2HI")  (V8HI  "V4HI")
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> index 
> a60cd31b9294af2dac69eed1c93f899bd5c78fca..fe9f27bf91b8fb18205a5891a5d5e847a5d88e4b
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> @@ -7,8 +7,7 @@
>  
>  /*
>  ** f1:
> -**   moviv[0-9]+.2s, 0x80, lsl 24
> -**   orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**   orr z0.s, z0.s, #0x8000
>  **   ret
>  */
>  float32_t f1 (float32_t a)
> @@ -18,9 +17,7 @@ float32_t f1 (float32_t a)
>  
>  /*
>  ** f2:
> -**   mov x0, -9223372036854775808
> -**   fmovd[0-9]+, x0
> -**   orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**   orr z0.d, z0.d, #0x8000
>  **   ret
>  */
>  float64_t f2 (float64_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> index 
> 21f2a8da2a5d44e3d01f6604ca7be87e3744d494..707bcb0b6c53e212b55a255f500e9e548e9ccd80
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_4.c
> @@ -6,9 +6,7 @@
>  
>  /*
>  ** negabs:
> -**   mov x0, -9223372036854775808
> -**   fmovd[0-9]+, x0
> -**   orr v[0-9]+.8b, v[0-9]+.8b, 

RE: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp formation.

2023-09-27 Thread Kyrylo Tkachov
Hi Manos,

> -Original Message-
> From: Manos Anagnostakis 
> Sent: Tuesday, September 26, 2023 2:52 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Kyrylo Tkachov ; Tamar Christina
> ; Philipp Tomsich ;
> Manos Anagnostakis 
> Subject: [PATCH v4] aarch64: Fine-grained policies to control ldp-stp
> formation.
> 
> This patch implements the following TODO in gcc/config/aarch64/aarch64.cc
> to provide the requested behaviour for handling ldp and stp:
> 
>   /* Allow the tuning structure to disable LDP instruction formation
>  from combining instructions (e.g., in peephole2).
>  TODO: Implement fine-grained tuning control for LDP and STP:
>1. control policies for load and store separately;
>2. support the following policies:
>   - default (use what is in the tuning structure)
>   - always
>   - never
>   - aligned (only if the compiler can prove that the
> load will be aligned to 2 * element_size)  */
> 
> It provides two new and concrete target-specific command-line parameters
> -param=aarch64-ldp-policy= and -param=aarch64-stp-policy=
> to give the ability to control load and store policies separately as
> stated in part 1 of the TODO.
> 
> The accepted values for both parameters are:
> - default: Use the policy of the tuning structure (default).
> - always: Emit ldp/stp regardless of alignment.
> - never: Do not emit ldp/stp.
> - aligned: In order to emit ldp/stp, first check if the load/store will
>   be aligned to 2 * element_size.
> 
> Bootstrapped and regtested aarch64-linux.
> 
> gcc/ChangeLog:
> * config/aarch64/aarch64-opts.h (enum aarch64_ldp_policy): New
>   enum type.
> (enum aarch64_stp_policy): New enum type.
> * config/aarch64/aarch64-protos.h (struct tune_params): Add
>   appropriate enums for the policies.
>   (aarch64_mem_ok_with_ldpstp_policy_model): New declaration.
> * config/aarch64/aarch64-tuning-flags.def
>   (AARCH64_EXTRA_TUNING_OPTION): Remove superseded tuning
>   options.
> * config/aarch64/aarch64.cc (aarch64_parse_ldp_policy): New
>   function to parse ldp-policy parameter.
> (aarch64_parse_stp_policy): New function to parse stp-policy 
> parameter.
> (aarch64_override_options_internal): Call parsing functions.
>   (aarch64_mem_ok_with_ldpstp_policy_model): New function.
> (aarch64_operands_ok_for_ldpstp): Add call to
>   aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
>   check and alignment check and remove superseded ones.
> (aarch64_operands_adjust_ok_for_ldpstp): Add call to
> aarch64_mem_ok_with_ldpstp_policy_model for parameter-value
>   check and alignment check and remove superseded ones.
> * config/aarch64/aarch64.opt: Add parameters.
>   * doc/invoke.texi: Document the parameters accordingly.

The ChangeLog entry should name the new parameters. For example:
* config/aarch64/aarch64.opt (aarch64-ldp-policy): New param.

Ok with the fixed ChangeLog.
Thank you for the work!
Kyrill

> 
> gcc/testsuite/ChangeLog:
>   * gcc.target/aarch64/ampere1-no_ldp_combine.c: Removed.
> * gcc.target/aarch64/ldp_aligned.c: New test.
> * gcc.target/aarch64/ldp_always.c: New test.
> * gcc.target/aarch64/ldp_never.c: New test.
> * gcc.target/aarch64/stp_aligned.c: New test.
> * gcc.target/aarch64/stp_always.c: New test.
> * gcc.target/aarch64/stp_never.c: New test.
> 
> Signed-off-by: Manos Anagnostakis 
> ---
> Changes in v4:
> - Changed the parameters to accept enum instead of an
>   integer and updated documentation in doc/invoke.texi.
> - Packed all the new checks in aarch64_operands_ok_for_ldpstp/
>   aarch64_operands_adjust_ok_for_ldpstp in a new function
>   called aarch64_mem_ok_with_ldpstp_policy_model.
> 
>  gcc/config/aarch64/aarch64-opts.h |  16 ++
>  gcc/config/aarch64/aarch64-protos.h   |  25 +++
>  gcc/config/aarch64/aarch64-tuning-flags.def   |   8 -
>  gcc/config/aarch64/aarch64.cc | 212 +-
>  gcc/config/aarch64/aarch64.opt|  38 
>  gcc/doc/invoke.texi   |  20 ++
>  .../aarch64/ampere1-no_ldp_combine.c  |  11 -
>  .../gcc.target/aarch64/ldp_aligned.c  |  66 ++
>  gcc/testsuite/gcc.target/aarch64/ldp_always.c |  66 ++
>  gcc/testsuite/gcc.target/aarch64/ldp_never.c  |  66 ++
>  .../gcc.target/aarch64/stp_aligned.c  |  60 +
>  gcc/testsuite/gcc.target/aarch64/stp_always.c |  60 +
>  gcc/testsuite/gcc.target/aarch64/stp_never.c  |  60 +
>  13 files changed, 632 insertions(+), 76 deletions(-)
>  delete mode 100644 gcc/testsuite/gcc.target/aarch64/ampere1-
> no_ldp_combine.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_aligned.c
>  create mode 100644 

RE: [PATCH v1] RISC-V: Support FP roundeven auto-vectorization

2023-09-27 Thread Li, Pan2
Committed, thanks Juzhe.

Pan

From: juzhe.zh...@rivai.ai 
Sent: Wednesday, September 27, 2023 4:24 PM
To: Li, Pan2 ; gcc-patches 
Cc: Li, Pan2 ; Wang, Yanzhang ; 
kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Support FP roundeven auto-vectorization

LGTM


juzhe.zh...@rivai.ai

From: pan2.li
Date: 2023-09-27 16:20
To: gcc-patches
CC: juzhe.zhong; 
pan2.li; 
yanzhang.wang; 
kito.cheng
Subject: [PATCH v1] RISC-V: Support FP roundeven auto-vectorization
From: Pan Li mailto:pan2...@intel.com>>

This patch would like to support auto-vectorization for the
roundeven API in math.h. It depends on the -ffast-math option.

When we would like to call roundeven like v2 = roundeven (v1), we will
convert it into below insns (reference the implementation of llvm).

* vfcvt.x.f v3, v1, RNE
* vfcvt.f.x v2, v3

However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.

  +---+---+-+
  | raw float | binary layout | after roundeven |
  +---+---+-+
  | 8388607.5 | 0x4aff| 8388608.0   |
  | 8388608.0 | 0x4b00| 8388608.0   |
  | 8388609.0 | 0x4b01| 8388609.0   |
  +---+---+-+

All single-precision floating point values greater than or equal to 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.

Before this patch:
math-roundeven-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addis0,s0,4
  addis1,s1,4
  callroundeven
  fsw fa0,-4(s1)
  bne s0,s2,.L3

After this patch:
  ...
  fsrmi   0   // Rounding to nearest, ties to even
.L4:
  vfabs.v v1,v2
  vmflt.vfv0,v1,fa5
  vfcvt.x.f.v v3,v2,v0.t
  vfcvt.f.x.v v1,v3,v0.t
  vfsgnj.vv   v1,v1,v2
  bne .L4
.L14:
  fsrma6
  ret

Please note VLS mode is also involved in this patch and covered by the
test cases.  We will add more run test with zfa support later.

gcc/ChangeLog:

* config/riscv/autovec.md (roundeven2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_roundeven): New func decl.
* config/riscv/riscv-v.cc (expand_vec_roundeven): New func impl.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/unop/math-roundeven-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-3.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-roundeven-1.c: New test.

Signed-off-by: Pan Li mailto:pan2...@intel.com>>
---
gcc/config/riscv/autovec.md   | 10 
gcc/config/riscv/riscv-protos.h   |  5 ++
gcc/config/riscv/riscv-v.cc   | 24 
.../riscv/rvv/autovec/unop/math-roundeven-0.c | 23 
.../riscv/rvv/autovec/unop/math-roundeven-1.c | 23 
.../riscv/rvv/autovec/unop/math-roundeven-2.c | 23 
.../riscv/rvv/autovec/unop/math-roundeven-3.c | 25 +
.../riscv/rvv/autovec/vls/math-roundeven-1.c  | 56 +++
8 files changed, 189 insertions(+)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-0.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-1.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-2.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-3.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-roundeven-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 680a3374972..cd0cbdd2889 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2271,3 +2271,13 @@ (define_expand "btrunc2"
 DONE;
   }
)
+
+(define_expand "roundeven2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_roundeven (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 536e70bdcd3..368982a447b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -259,6 +259,9 @@ enum insn_flags : unsigned int
   /* Means INSN has FRM operand and the value is FRM_RMM.  */
   FRM_RMM_P = 1 << 18,
+
+  /* Means INSN has FRM operand and the value is FRM_RNE.  */
+  FRM_RNE_P = 1 << 19,
};
enum insn_type : unsigned int
@@ -303,6 +306,7 @@ enum insn_type : unsigned int
   UNARY_OP_TAMU_FRM_RUP = UNARY_OP_TAMU | FRM_RUP_P,
   

Re: [PATCH v1] RISC-V: Support FP roundeven auto-vectorization

2023-09-27 Thread juzhe.zh...@rivai.ai
LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-09-27 16:20
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Support FP roundeven auto-vectorization
From: Pan Li 
 
This patch would like to support auto-vectorization for the
roundeven API in math.h. It depends on the -ffast-math option.
 
When we would like to call roundeven like v2 = roundeven (v1), we will
convert it into below insns (reference the implementation of llvm).
 
* vfcvt.x.f v3, v1, RNE
* vfcvt.f.x v2, v3
 
However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.
 
  +---+---+-+
  | raw float | binary layout | after roundeven |
  +---+---+-+
  | 8388607.5 | 0x4aff| 8388608.0   |
  | 8388608.0 | 0x4b00| 8388608.0   |
  | 8388609.0 | 0x4b01| 8388609.0   |
  +---+---+-+
 
All single-precision floating point values greater than or equal to 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.
 
Before this patch:
math-roundeven-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addis0,s0,4
  addis1,s1,4
  callroundeven
  fsw fa0,-4(s1)
  bne s0,s2,.L3
 
After this patch:
  ...
  fsrmi   0   // Rounding to nearest, ties to even
.L4:
  vfabs.v v1,v2
  vmflt.vfv0,v1,fa5
  vfcvt.x.f.v v3,v2,v0.t
  vfcvt.f.x.v v1,v3,v0.t
  vfsgnj.vv   v1,v1,v2
  bne .L4
.L14:
  fsrma6
  ret
 
Please note VLS mode is also involved in this patch and covered by the
test cases.  We will add more run test with zfa support later.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (roundeven2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_roundeven): New func decl.
* config/riscv/riscv-v.cc (expand_vec_roundeven): New func impl.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-3.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-roundeven-1.c: New test.
 
Signed-off-by: Pan Li 
---
gcc/config/riscv/autovec.md   | 10 
gcc/config/riscv/riscv-protos.h   |  5 ++
gcc/config/riscv/riscv-v.cc   | 24 
.../riscv/rvv/autovec/unop/math-roundeven-0.c | 23 
.../riscv/rvv/autovec/unop/math-roundeven-1.c | 23 
.../riscv/rvv/autovec/unop/math-roundeven-2.c | 23 
.../riscv/rvv/autovec/unop/math-roundeven-3.c | 25 +
.../riscv/rvv/autovec/vls/math-roundeven-1.c  | 56 +++
8 files changed, 189 insertions(+)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-0.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-1.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-2.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-3.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-roundeven-1.c
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 680a3374972..cd0cbdd2889 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2271,3 +2271,13 @@ (define_expand "btrunc2"
 DONE;
   }
)
+
+(define_expand "roundeven2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_roundeven (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 536e70bdcd3..368982a447b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -259,6 +259,9 @@ enum insn_flags : unsigned int
   /* Means INSN has FRM operand and the value is FRM_RMM.  */
   FRM_RMM_P = 1 << 18,
+
+  /* Means INSN has FRM operand and the value is FRM_RNE.  */
+  FRM_RNE_P = 1 << 19,
};
enum insn_type : unsigned int
@@ -303,6 +306,7 @@ enum insn_type : unsigned int
   UNARY_OP_TAMU_FRM_RUP = UNARY_OP_TAMU | FRM_RUP_P,
   UNARY_OP_TAMU_FRM_RDN = UNARY_OP_TAMU | FRM_RDN_P,
   UNARY_OP_TAMU_FRM_RMM = UNARY_OP_TAMU | FRM_RMM_P,
+  UNARY_OP_TAMU_FRM_RNE = UNARY_OP_TAMU | FRM_RNE_P,
   /* Binary operator.  */
   BINARY_OP = __NORMAL_OP | BINARY_OP_P,
@@ -469,6 +473,7 @@ void expand_vec_nearbyint (rtx, rtx, machine_mode, 
machine_mode);
void expand_vec_rint (rtx, rtx, machine_mode, machine_mode);
void expand_vec_round (rtx, rtx, machine_mode, machine_mode);
void expand_vec_trunc (rtx, rtx, machine_mode, machine_mode);
+void 

Re: [PATCH] DSE: Fix ICE when the mode with access_size don't exist on the target[PR111590]

2023-09-27 Thread Richard Sandiford
Juzhe-Zhong  writes:
> When doing fortran test with 'V' extension enabled on RISC-V port.
> I saw multiple ICE: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111590
>
> The root cause is on DSE:
>
> internal compiler error: in smallest_mode_for_size, at stor-layout.cc:356
> 0x1918f70 smallest_mode_for_size(poly_int<2u, unsigned long>, mode_class)
> ../../../../gcc/gcc/stor-layout.cc:356
> 0x11f75bb smallest_int_mode_for_size(poly_int<2u, unsigned long>)
> ../../../../gcc/gcc/machmode.h:916
> 0x3304141 find_shift_sequence
> ../../../../gcc/gcc/dse.cc:1738
> 0x3304f1a get_stored_val
> ../../../../gcc/gcc/dse.cc:1906
> 0x3305377 replace_read
> ../../../../gcc/gcc/dse.cc:2010
> 0x3306226 check_mem_read_rtx
> ../../../../gcc/gcc/dse.cc:2310
> 0x330667b check_mem_read_use
> ../../../../gcc/gcc/dse.cc:2415
>
> After investigations, DSE is trying to do optimization like this following 
> codes:
>
> (insn 86 85 87 9 (set (reg:V4DI 168)
> (mem/u/c:V4DI (reg/f:DI 171) [0  S32 A128])) "bug.f90":6:18 discrim 6 
> 1167 {*movv4di}
>  (expr_list:REG_EQUAL (const_vector:V4DI [
> (const_int 4 [0x4])
> (const_int 1 [0x1]) repeated x2
> (const_int 3 [0x3])
> ])
> (nil)))
>
> (set (mem) (reg:V4DI 168))
>
> Then it ICE on: auto new_mode = smallest_int_mode_for_size (access_size * 
> BITS_PER_UNIT);
>
> The access_size may be 24 or 32. We don't have such integer modes with these 
> size so it ICE.
>
> TODO: The better way maybe make DSE use native_encode_rtx/native_decode_rtx
>   but I don't know how to do that.  So let's quickly fix this issue, we
>   can improve the fix later.
>
> gcc/ChangeLog:
>
>   * dse.cc (find_shift_sequence): Check the mode with access_size exist 
> on the target.

OK, thanks, but...

> Authored-By: Richard Sandiford 

...it was just a review comment.  I didn't write the patch.

Richard

>
> ---
>  gcc/dse.cc | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/dse.cc b/gcc/dse.cc
> index 8b07be17674..1a85dae1f8c 100644
> --- a/gcc/dse.cc
> +++ b/gcc/dse.cc
> @@ -1733,7 +1733,8 @@ find_shift_sequence (poly_int64 access_size,
>/* If a constant was stored into memory, try to simplify it here,
>   otherwise the cost of the shift might preclude this optimization
>   e.g. at -Os, even when no actual shift will be needed.  */
> -  if (store_info->const_rhs)
> +  if (store_info->const_rhs
> +  && known_le (access_size, GET_MODE_SIZE (MAX_MODE_INT)))
>  {
>auto new_mode = smallest_int_mode_for_size (access_size * 
> BITS_PER_UNIT);
>auto byte = subreg_lowpart_offset (new_mode, store_mode);


[PATCH v1] RISC-V: Support FP roundeven auto-vectorization

2023-09-27 Thread pan2 . li
From: Pan Li 

This patch would like to support auto-vectorization for the
roundeven API in math.h. It depends on the -ffast-math option.

When we would like to call roundeven like v2 = roundeven (v1), we will
convert it into below insns (reference the implementation of llvm).

* vfcvt.x.f v3, v1, RNE
* vfcvt.f.x v2, v3

However, the floating point value may not need the cvt as above if
its mantissa is zero. For example single precision floating point below.

  +---+---+-+
  | raw float | binary layout | after roundeven |
  +---+---+-+
  | 8388607.5 | 0x4aff| 8388608.0   |
  | 8388608.0 | 0x4b00| 8388608.0   |
  | 8388609.0 | 0x4b01| 8388609.0   |
  +---+---+-+

All single-precision floating point values greater than or equal to 8388608.0 will have an all-zero mantissa.
We leverage vmflt and mask to filter them out in vector and only do the
cvt on mask.

Before this patch:
math-roundeven-1.c:21:1: missed: couldn't vectorize loop
  ...
.L3:
  flw fa0,0(s0)
  addis0,s0,4
  addis1,s1,4
  callroundeven
  fsw fa0,-4(s1)
  bne s0,s2,.L3

After this patch:
  ...
  fsrmi   0   // Rounding to nearest, ties to even
.L4:
  vfabs.v v1,v2
  vmflt.vfv0,v1,fa5
  vfcvt.x.f.v v3,v2,v0.t
  vfcvt.f.x.v v1,v3,v0.t
  vfsgnj.vv   v1,v1,v2
  bne .L4
.L14:
  fsrma6
  ret

Please note VLS mode is also involved in this patch and covered by the
test cases.  We will add more run test with zfa support later.

gcc/ChangeLog:

* config/riscv/autovec.md (roundeven2): New pattern.
* config/riscv/riscv-protos.h (enum insn_flags): New enum type.
(enum insn_type): Ditto.
(expand_vec_roundeven): New func decl.
* config/riscv/riscv-v.cc (expand_vec_roundeven): New func impl.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/unop/math-roundeven-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-1.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-2.c: New test.
* gcc.target/riscv/rvv/autovec/unop/math-roundeven-3.c: New test.
* gcc.target/riscv/rvv/autovec/vls/math-roundeven-1.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec.md   | 10 
 gcc/config/riscv/riscv-protos.h   |  5 ++
 gcc/config/riscv/riscv-v.cc   | 24 
 .../riscv/rvv/autovec/unop/math-roundeven-0.c | 23 
 .../riscv/rvv/autovec/unop/math-roundeven-1.c | 23 
 .../riscv/rvv/autovec/unop/math-roundeven-2.c | 23 
 .../riscv/rvv/autovec/unop/math-roundeven-3.c | 25 +
 .../riscv/rvv/autovec/vls/math-roundeven-1.c  | 56 +++
 8 files changed, 189 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-0.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-1.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-2.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/math-roundeven-3.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/math-roundeven-1.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 680a3374972..cd0cbdd2889 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -2271,3 +2271,13 @@ (define_expand "btrunc2"
 DONE;
   }
 )
+
+(define_expand "roundeven2"
+  [(match_operand:V_VLSF 0 "register_operand")
+   (match_operand:V_VLSF 1 "register_operand")]
+  "TARGET_VECTOR && !flag_trapping_math && !flag_rounding_math"
+  {
+riscv_vector::expand_vec_roundeven (operands[0], operands[1], mode, 
mode);
+DONE;
+  }
+)
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 536e70bdcd3..368982a447b 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -259,6 +259,9 @@ enum insn_flags : unsigned int
 
   /* Means INSN has FRM operand and the value is FRM_RMM.  */
   FRM_RMM_P = 1 << 18,
+
+  /* Means INSN has FRM operand and the value is FRM_RNE.  */
+  FRM_RNE_P = 1 << 19,
 };
 
 enum insn_type : unsigned int
@@ -303,6 +306,7 @@ enum insn_type : unsigned int
   UNARY_OP_TAMU_FRM_RUP = UNARY_OP_TAMU | FRM_RUP_P,
   UNARY_OP_TAMU_FRM_RDN = UNARY_OP_TAMU | FRM_RDN_P,
   UNARY_OP_TAMU_FRM_RMM = UNARY_OP_TAMU | FRM_RMM_P,
+  UNARY_OP_TAMU_FRM_RNE = UNARY_OP_TAMU | FRM_RNE_P,
 
   /* Binary operator.  */
   BINARY_OP = __NORMAL_OP | BINARY_OP_P,
@@ -469,6 +473,7 @@ void expand_vec_nearbyint (rtx, rtx, machine_mode, 
machine_mode);
 void expand_vec_rint (rtx, rtx, machine_mode, machine_mode);
 void expand_vec_round (rtx, rtx, machine_mode, machine_mode);
 void expand_vec_trunc (rtx, rtx, machine_mode, machine_mode);
+void expand_vec_roundeven (rtx, rtx, machine_mode, machine_mode);
 #endif
 bool sew64_scalar_helper (rtx *, rtx *, rtx, machine_mode,

RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]

2023-09-27 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, September 27, 2023 8:12 AM
> To: Tamar Christina 
> Cc: Andrew Pinski ; gcc-patches@gcc.gnu.org; nd
> ; j...@ventanamicro.com
> Subject: RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 <<
> signbit(x)) [PR109154]
> 
> On Wed, 27 Sep 2023, Tamar Christina wrote:
> 
> > > -Original Message-
> > > From: Andrew Pinski 
> > > Sent: Wednesday, September 27, 2023 2:17 AM
> > > To: Tamar Christina 
> > > Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de;
> > > j...@ventanamicro.com
> > > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to
> > > x | (1 <<
> > > signbit(x)) [PR109154]
> > >
> > > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina
> > > 
> > > wrote:
> > > >
> > > > Hi All,
> > > >
> > > > For targets that allow conversion between int and float modes this
> > > > adds a new optimization transforming fneg (fabs (x)) into x | (1
> > > > << signbit(x)).  Such sequences are common in scientific code
> > > > working with
> > > gradients.
> > > >
> > > > The transformed instruction if the target has an inclusive-OR that
> > > > takes an immediate is both shorter an faster.  For those that
> > > > don't the immediate has to be seperate constructed but this still
> > > > ends up being faster as the immediate construction is not on the 
> > > > critical
> path.
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > >
> > > I think this should be part of isel instead of match.
> > > Maybe we could use genmatch to generate the code that does the
> > > transformations but this does not belong as part of match really.
> >
> > I disagree.. I don't think this belongs in isel. Isel is for structural
> transformations.
> > If there is a case for something else I'd imagine backwardprop is a better
> choice.
> >
> > But I don't see why it doesn't belong here considering it *is* a
> > mathematical optimization and the file has plenty of transformations
> > such as mask optimizations and vector conditional rewriting.
> 
> But the mathematical transform would more generally be fneg (fabs (x)) ->
> copysign (x, -1.) and that can be optimally expanded at RTL expansion time?

Ah sure, atm I did copysign (x, -1) -> x | 1 << signbits.  I can do it the 
other way
around.  And I guess since copysign (-x, y), copysign(|x|, y) -> copysign (x, 
y) that
should solve the trigonometry problem too.

Cool will do that instead, thanks!

Tamar

> 
> Richard.
> 
> > Regards,
> > Tamar
> >
> > >
> > > Thanks,
> > > Andrew
> > >
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > > PR tree-optimization/109154
> > > > * match.pd: Add new neg+abs rule.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR tree-optimization/109154
> > > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > > >
> > > > --- inline copy of patch --
> > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > >
> > >
> 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> > > 6cffad0f
> > > > b17e1136600a 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -9476,3 +9476,57 @@ and,
> > > >}
> > > >(if (full_perm_p)
> > > > (vec_perm (op@3 @0 @1) @3 @2))
> > > > +
> > > > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X).  */
> > > > +
> > > > +(simplify
> > > > + (negate (abs @0))
> > > > + (if (FLOAT_TYPE_P (type)
> > > > +  /* We have to delay this rewriting till after forward prop
> > > > +because
> > > otherwise
> > > > +it's harder to do trigonometry optimizations. e.g. 
> > > > cos(-fabs(x)) is not
> > > > +matched in one go.  Instead cos (-x) is matched first
> > > > + followed by
> > > cos(|x|).
> > > > +The bottom op approach makes this rule match first and it's not
> untill
> > > > +fwdprop that we match top down.  There are manu such
> > > > + simplications
> > > so we
> > > > +delay this optimization till later on.  */
> > > > +  && canonicalize_math_after_vectorization_p ())  (with {
> > > > +tree itype = unsigned_type_for (type);
> > > > +machine_mode mode = TYPE_MODE (type);
> > > > +const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > > > +auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > > > +   (if (float_fmt
> > > > +   && float_fmt->signbit_rw >= 0
> > > > +   && targetm.can_change_mode_class (TYPE_MODE 

Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-27 Thread Andre Vieira (lists)



On 26/09/2023 17:37, Andrew Stubbs wrote:

I don't have authority to approve anything, but here's a review anyway.

Thanks for working on this.


Thank you for reviewing and apologies for the mess of a patch, may have 
rushed it ;)


diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c

new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273

--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+


Do you need -fopenmp-simd for this?

Nope, I keep forgetting that you only need it for pragmas.

Dealt with the other comments too.

Any thoughts on changing gimple_call_internal_fn  instead? My main 
argument against is that IFN_MASK_CALL should not appear outside of 
ifconvert and vectorizer. On the other hand, we may inspect the flags 
elsewhere in the vectorizer (now or in the future) and changing 
gimple_call_internal_fn would prevent the need to handle the IFN 
separately elsewhere.


Kind Regards,
Andrediff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
new file mode 100644
index 
..e7ed56ca75470464307d0d266dacfa0d8d6e43c1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,22 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+
+int __attribute__ ((__simd__, const)) fn (int);
+
+void test (int * __restrict__ a, int * __restrict__ b, int n)
+{
+  for (int i = 0; i < n; ++i)
+{
+  int a_;
+  if (b[i] > 0)
+a_ = fn (b[i]);
+  else
+a_ = b[i] + 5;
+  a[i] = a_;
+}
+}
+
+/* { dg-final { scan-tree-dump-not {loop contains function calls or data 
references} "vect" } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 
6d3b7c2290e4db9c1168a4c763facb481157c97c..689aaeed72282bb0da2a17e19fb923a06e8d62fa
 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "vr-values.h"
 #include "range-op.h"
 #include "tree-ssa-loop-ivopts.h"
+#include "calls.h"
 
 static struct datadep_stats
 {
@@ -5816,6 +5817,15 @@ get_references_in_stmt (gimple *stmt, vec *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ break;
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl
+ || (flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }
break;
  default:
clobbers_memory = true;
@@ -5852,7 +5862,7 @@ get_references_in_stmt (gimple *stmt, vec *references)
 }
   else if (stmt_code == GIMPLE_CALL)
 {
-  unsigned i, n;
+  unsigned i = 0, n;
   tree ptr, type;
   unsigned int align;
 
@@ -5879,13 +5889,16 @@ get_references_in_stmt (gimple *stmt, vec *references)
   ptr);
references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
+   gcc_fallthrough ();
  default:
break;
  }
 
   op0 = gimple_call_lhs (stmt);
   n = gimple_call_num_args (stmt);
-  for (i = 0; i < n; i++)
+  for (; i < n; i++)
{
  op1 = gimple_call_arg (stmt, i);
 


Re: [PATCH] testsuite: Avoid uninit var in pr60510.f [PR111427]

2023-09-27 Thread Richard Biener
On Wed, Sep 27, 2023 at 7:39 AM Kewen.Lin  wrote:
>
> Hi,
>
> The uninitialized variable a in pr60510.f can cause some
> random failures as exposed in PR111427, see the details
> there.  This patch is to make it initialized accordingly.
>
> As verified, it can fix the reported -m32 failures on
> P7 and P8 BE.  It's also tested well on powerpc64-linux-gnu
> P9 and powerpc64le-linux-gnu P9 and P10.
>
> Is it ok for trunk?

OK.

Richard.

> BR,
> Kewen
> -
>
> PR testsuite/111427
>
> gcc/testsuite/ChangeLog:
>
> * gfortran.dg/vect/pr60510.f (test): Init variable a.
> ---
>  gcc/testsuite/gfortran.dg/vect/pr60510.f | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/gcc/testsuite/gfortran.dg/vect/pr60510.f 
> b/gcc/testsuite/gfortran.dg/vect/pr60510.f
> index ecd50dd5586..6cae82acece 100644
> --- a/gcc/testsuite/gfortran.dg/vect/pr60510.f
> +++ b/gcc/testsuite/gfortran.dg/vect/pr60510.f
> @@ -17,6 +17,7 @@
>
>program test
>real*8 x(1024),y(1024),a
> +  a = 0.0
>do i=1,1024
>  x(i) = i
>  y(i) = i+1
> --
> 2.35.4


RE: [PATCH] ifcvt: Fix comments

2023-09-27 Thread Li, Pan2
Committed, thanks Richard.

Pan

-Original Message-
From: Richard Biener  
Sent: Wednesday, September 27, 2023 3:18 PM
To: Juzhe-Zhong 
Cc: gcc-patches@gcc.gnu.org; richard.sandif...@arm.com; jeffreya...@gmail.com
Subject: Re: [PATCH] ifcvt: Fix comments

On Wed, 27 Sep 2023, Juzhe-Zhong wrote:

> Fix comments since original comment is confusing.

OK

> gcc/ChangeLog:
> 
>   * tree-if-conv.cc (is_cond_scalar_reduction): Fix comments.
> 
> ---
>  gcc/tree-if-conv.cc | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 799f071965e..a8c915913ae 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1747,11 +1747,11 @@ is_cond_scalar_reduction (gimple *phi, gimple 
> **reduc, tree arg_0, tree arg_1,
>  
>   and convert to
>  
> - reduc_2 = PHI <0, reduc_3>
> - tmp1 = (unsigned type)reduce_1;
> + reduc_2 = PHI <0, reduc_1>
> + tmp1 = (unsigned type)reduc_1;
>   ifcvt = cond_expr ? rhs2 : 0
>   tmp2 = tmp1 +/- ifcvt;
> - reduce_1 = (signed type)tmp2;  */
> + reduc_1 = (signed type)tmp2;  */
>  
>if (CONVERT_EXPR_CODE_P (reduction_op))
>  {
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH] ifcvt: Fix comments

2023-09-27 Thread Richard Biener
On Wed, 27 Sep 2023, Juzhe-Zhong wrote:

> Fix comments since original comment is confusing.

OK

> gcc/ChangeLog:
> 
>   * tree-if-conv.cc (is_cond_scalar_reduction): Fix comments.
> 
> ---
>  gcc/tree-if-conv.cc | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
> index 799f071965e..a8c915913ae 100644
> --- a/gcc/tree-if-conv.cc
> +++ b/gcc/tree-if-conv.cc
> @@ -1747,11 +1747,11 @@ is_cond_scalar_reduction (gimple *phi, gimple 
> **reduc, tree arg_0, tree arg_1,
>  
>   and convert to
>  
> - reduc_2 = PHI <0, reduc_3>
> - tmp1 = (unsigned type)reduce_1;
> + reduc_2 = PHI <0, reduc_1>
> + tmp1 = (unsigned type)reduc_1;
>   ifcvt = cond_expr ? rhs2 : 0
>   tmp2 = tmp1 +/- ifcvt;
> - reduce_1 = (signed type)tmp2;  */
> + reduc_1 = (signed type)tmp2;  */
>  
>if (CONVERT_EXPR_CODE_P (reduction_op))
>  {
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH] vec.h: Make some ops work with non-trivially copy constructible and/or destructible types

2023-09-27 Thread Richard Biener
On Wed, 27 Sep 2023, Jakub Jelinek wrote:

> Hi!
> 
> We have some very limited support for non-POD types in vec.h
> (in particular grow_cleared will invoke default ctors on the
> cleared elements and vector copying invokes copy ctors).
> 
> My pending work on wide_int/widest_int which makes those two
> non-trivially default/copy constructible, assignable and destructible
> shows this isn't enough though.
> In particular the uses of it in irange shows that quick_push
> still uses just assignment operator rather than copy construction
> and we never invoke destructors on anything.
> 
> The following patch does that for quick_push (copy construction
> using placement new rather than assignment, for trivially copy
> constructible types I think it should be the same) and invokes
> destructors (only if non-trivially destructible) in pop, release
> and truncate.  Now as discussed last night on IRC, the pop case
> is problematic, because our pop actually does two things,
> it decreases length (so the previous last element should be destructed)
> but also returns a reference to it.  We have some 300+ uses of this
> and the reference rather than returning it by value is useful at least
> for the elements which are (larger) POD structures, so I'm not
> prepared to change that.  Though obviously for types with non-trivial
> destructors returning a reference to just destructed element is not
> a good idea.  So, this patch for that case only makes pop return void
> instead and any users wishing to get the last element need to use last ()
> and pop () separately (currently there are none).
> 
> Note, a lot of vec.h operations is still not friendly for non-POD types,
> I've added a comment for quick_insert and ordered_remove, but qsort is
> such a case as well and in fact most others too.  For non-POD, I'd say
> with this patch it is safe just to {quick,safe}_grow_cleared (but not
> just *_grow), {quick,safe}_push, pop, truncate, release, copy
> them around and ops which do not add/remove any elements or move them
> around.  And obviously using non-trivially destructible types in
> va_gc vectors is undesirable as well.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK I guess.  Can you summarize the limitations for non-POD types
in the big comment at the start of vec.h?  (can we put in static_asserts
in the places that obviously do not work?)

Thanks,
Richard.

> 2023-09-27  Jakub Jelinek  
> 
>   * vec.h (vec_destruct): New function template.
>   (release): Use it for non-trivially destructible T.
>   (truncate): Likewise.
>   (quick_push): Perform a placement new into slot
>   instead of assignment.
>   (quick_insert, ordered_remove): Note that they aren't suitable
>   for non-PODs.
>   (pop): For non-trivially destructible T return void
>   rather than T & and destruct the popped element.
>   * edit-context.cc (class line_event): Move definition earlier.
> 
> --- gcc/vec.h.jj  2023-09-26 16:44:30.637902359 +0200
> +++ gcc/vec.h 2023-09-26 21:17:30.366534474 +0200
> @@ -185,6 +185,16 @@ extern void dump_vec_loc_statistics (voi
>  /* Hashtable mapping vec addresses to descriptors.  */
>  extern htab_t vec_mem_usage_hash;
>  
> +/* Destruct N elements in DST.  */
> +
> +template 
> +inline void
> +vec_destruct (T *dst, unsigned n)
> +{
> +  for ( ; n; ++dst, --n)
> +dst->~T ();
> +}
> +
>  /* Control data for vectors.  This contains the number of allocated
> and used slots inside a vector.  */
>  
> @@ -310,6 +320,9 @@ va_heap::release (vec<T, A, vl_embed> *&v)
>   if (v == NULL)
>  return;
>  
> +  if (!std::is_trivially_destructible <T>::value)
> +vec_destruct (v->address (), v->length ());
> +
>if (GATHER_STATISTICS)
>  v->m_vecpfx.release_overhead (v, elt_size * v->allocated (),
> v->allocated (), true);
> @@ -588,7 +601,10 @@ public:
>void splice (const vec &);
>void splice (const vec *src);
>T *quick_push (const T &);
> -  T &pop (void);
> +  using pop_ret_type
> +    = typename std::conditional <std::is_trivially_destructible <T>::value,
> +				 T &, void>::type;
> +  pop_ret_type pop (void);
>void truncate (unsigned);
>void quick_insert (unsigned, const T &);
>void ordered_remove (unsigned);
> @@ -1005,19 +1021,24 @@ vec::quick_push (const T
>  {
>gcc_checking_assert (space (1));
>T *slot =  ()[m_vecpfx.m_num++];
> -  *slot = obj;
> +  ::new (static_cast(slot)) T (obj);
>return slot;
>  }
>  
>  
> -/* Pop and return the last element off the end of the vector.  */
> +/* Pop and return a reference to the last element off the end of the
> +   vector.  If T has non-trivial destructor, this method just pops
> +   the element and returns void type.  */
>  
> template<typename T, typename A>
> -inline T &
> +inline typename vec<T, A, vl_embed>::pop_ret_type
> vec<T, A, vl_embed>::pop (void)
>  {
>gcc_checking_assert (length () > 0);
> -  return address ()[--m_vecpfx.m_num];
> +  T  = address ()[--m_vecpfx.m_num];
> +  if 

RE: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 << signbit(x)) [PR109154]

2023-09-27 Thread Richard Biener
On Wed, 27 Sep 2023, Tamar Christina wrote:

> > -Original Message-
> > From: Andrew Pinski 
> > Sent: Wednesday, September 27, 2023 2:17 AM
> > To: Tamar Christina 
> > Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de;
> > j...@ventanamicro.com
> > Subject: Re: [PATCH]middle-end match.pd: optimize fneg (fabs (x)) to x | (1 
> > <<
> > signbit(x)) [PR109154]
> > 
> > On Tue, Sep 26, 2023 at 5:51?PM Tamar Christina 
> > wrote:
> > >
> > > Hi All,
> > >
> > > For targets that allow conversion between int and float modes this
> > > adds a new optimization transforming fneg (fabs (x)) into x | (1 <<
> > > signbit(x)).  Such sequences are common in scientific code working with
> > gradients.
> > >
> > > The transformed instruction if the target has an inclusive-OR that
> > > takes an immediate is both shorter and faster.  For those that don't
> > > the immediate has to be separately constructed but this still ends up
> > > being faster as the immediate construction is not on the critical path.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> > 
> > I think this should be part of isel instead of match.
> > Maybe we could use genmatch to generate the code that does the
> > transformations but this does not belong as part of match really.
> 
> I disagree.. I don't think this belongs in isel. Isel is for structural 
> transformations.
> If there is a case for something else I'd imagine backwardprop is a better 
> choice.
> 
> But I don't see why it doesn't belong here considering it *is* a mathematical 
> optimization
> and the file has plenty of transformations such as mask optimizations and 
> vector conditional
> rewriting.

But the mathematical transform would more generally be
fneg (fabs (x)) -> copysign (x, -1.) and that can be optimally expanded
at RTL expansion time?

Richard.

> Regards,
> Tamar
> 
> > 
> > Thanks,
> > Andrew
> > 
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > PR tree-optimization/109154
> > > * match.pd: Add new neg+abs rule.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR tree-optimization/109154
> > > * gcc.target/aarch64/fneg-abs_1.c: New test.
> > > * gcc.target/aarch64/fneg-abs_2.c: New test.
> > > * gcc.target/aarch64/fneg-abs_3.c: New test.
> > > * gcc.target/aarch64/fneg-abs_4.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_1.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_2.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_3.c: New test.
> > > * gcc.target/aarch64/sve/fneg-abs_4.c: New test.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > >
> > 39c7ea1088f25538ed8bd26ee89711566141a71f..8ebde06dcd4b26d69482
> > 6cffad0f
> > > b17e1136600a 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -9476,3 +9476,57 @@ and,
> > >}
> > >(if (full_perm_p)
> > > (vec_perm (op@3 @0 @1) @3 @2))
> > > +
> > > +/* Transform fneg (fabs (X)) -> X | 1 << signbit (X).  */
> > > +
> > > +(simplify
> > > + (negate (abs @0))
> > > + (if (FLOAT_TYPE_P (type)
> > > +  /* We have to delay this rewriting till after forward prop because
> > otherwise
> > > +it's harder to do trigonometry optimizations. e.g. cos(-fabs(x)) 
> > > is not
> > > +matched in one go.  Instead cos (-x) is matched first followed by
> > cos(|x|).
> > > +The bottom op approach makes this rule match first and it's not 
> > > untill
> > > +fwdprop that we match top down.  There are manu such 
> > > simplications
> > so we
> > > +delay this optimization till later on.  */
> > > +  && canonicalize_math_after_vectorization_p ())
> > > +  (with {
> > > +tree itype = unsigned_type_for (type);
> > > +machine_mode mode = TYPE_MODE (type);
> > > +const struct real_format *float_fmt = FLOAT_MODE_FORMAT (mode);
> > > +auto optab = VECTOR_TYPE_P (type) ? optab_vector : optab_default; }
> > > +   (if (float_fmt
> > > +   && float_fmt->signbit_rw >= 0
> > > +   && targetm.can_change_mode_class (TYPE_MODE (itype),
> > > + TYPE_MODE (type), ALL_REGS)
> > > +&& target_supports_op_p (itype, BIT_IOR_EXPR, optab))
> > > +(with { wide_int wone = wi::one (element_precision (type));
> > > +   int sbit = float_fmt->signbit_rw;
> > > +   auto stype = VECTOR_TYPE_P (type) ? TREE_TYPE (itype) : itype;
> > > +   tree sign_bit = wide_int_to_tree (stype, wi::lshift (wone, 
> > > sbit));}
> > > + (view_convert:type
> > > +  (bit_ior (view_convert:itype @0)
> > > +  { build_uniform_cst (itype, sign_bit); } )))
> > > +
> > > +/* Repeat the same but for conditional negate.  */
> > > +
> > > +(simplify
> > > + (IFN_COND_NEG @1 (abs @0) @2)
> > > + (if (FLOAT_TYPE_P (type))
> > > +  (with {
> > > +tree itype = 

[PATCH] ifcvt: Fix comments

2023-09-27 Thread Juzhe-Zhong
Fix comments since original comment is confusing.

gcc/ChangeLog:

* tree-if-conv.cc (is_cond_scalar_reduction): Fix comments.

---
 gcc/tree-if-conv.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 799f071965e..a8c915913ae 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1747,11 +1747,11 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, 
tree arg_0, tree arg_1,
 
  and convert to
 
- reduc_2 = PHI <0, reduc_3>
- tmp1 = (unsigned type)reduce_1;
+ reduc_2 = PHI <0, reduc_1>
+ tmp1 = (unsigned type)reduc_1;
  ifcvt = cond_expr ? rhs2 : 0
  tmp2 = tmp1 +/- ifcvt;
- reduce_1 = (signed type)tmp2;  */
+ reduc_1 = (signed type)tmp2;  */
 
   if (CONVERT_EXPR_CODE_P (reduction_op))
 {
-- 
2.36.3



Re: [PATCH]AArch64 Add movi for 0 moves for scalar types [PR109154]

2023-09-27 Thread Richard Sandiford
Tamar Christina  writes:
> Hi All,
>
> Following the Neoverse N/V and Cortex-A optimization guides SIMD 0 immediates
> should be created with a movi of 0.
>
> At the moment we generate an `fmov .., xzr` which is slower and requires a
> GP -> FP transfer.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   PR tree-optimization/109154
>   * config/aarch64/aarch64.md (*mov_aarch64, *movsi_aarch64,
>   *movdi_aarch64): Add new w -> Z case.
>   * config/aarch64/iterators.md (Vbtype): Add QI and HI.
>
> gcc/testsuite/ChangeLog:
>
>   PR tree-optimization/109154
>   * gcc.target/aarch64/fneg-abs_2.c: Updated.
>   * gcc.target/aarch64/fneg-abs_4.c: Updated.

OK, thanks.

Richard

> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> b51f979dba12b726bff0c1109b75c6d2c7ae41ab..60c92213c75a2a4c18a6b59ae52fe45d1e872718
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1232,6 +1232,7 @@ (define_insn "*mov_aarch64"
>"(register_operand (operands[0], mode)
>  || aarch64_reg_or_zero (operands[1], mode))"
>{@ [cons: =0, 1; attrs: type, arch]
> + [w, Z; neon_move  , simd  ] movi\t%0., #0
>   [r, r; mov_reg, * ] mov\t%w0, %w1
>   [r, M; mov_imm, * ] mov\t%w0, %1
>   [w, D; neon_move  , simd  ] << 
> aarch64_output_scalar_simd_mov_immediate (operands[1], mode);
> @@ -1289,6 +1290,7 @@ (define_insn_and_split "*movsi_aarch64"
>"(register_operand (operands[0], SImode)
>  || aarch64_reg_or_zero (operands[1], SImode))"
>{@ [cons: =0, 1; attrs: type, arch, length]
> + [w  , Z  ; neon_move, simd, 4] movi\t%0.2d, #0
>   [r k, r  ; mov_reg  , *   , 4] mov\t%w0, %w1
>   [r  , k  ; mov_reg  , *   , 4] ^
>   [r  , M  ; mov_imm  , *   , 4] mov\t%w0, %1
> @@ -1322,6 +1324,7 @@ (define_insn_and_split "*movdi_aarch64"
>"(register_operand (operands[0], DImode)
>  || aarch64_reg_or_zero (operands[1], DImode))"
>{@ [cons: =0, 1; attrs: type, arch, length]
> + [w, Z  ; neon_move, simd, 4] movi\t%0.2d, #0
>   [r, r  ; mov_reg  , *   , 4] mov\t%x0, %x1
>   [k, r  ; mov_reg  , *   , 4] mov\t%0, %x1
>   [r, k  ; mov_reg  , *   , 4] mov\t%x0, %1
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 
> 2451d8c2cd8e2da6ac8339eed9bc975cf203fa4c..d17becc37e230684beaee3c69e2a0f0ce612eda5
>  100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1297,6 +1297,7 @@ (define_mode_attr Vbtype [(V8QI "8b")  (V16QI "16b")
> (V4SF "16b") (V2DF  "16b")
> (DI   "8b")  (DF"8b")
> (SI   "8b")  (SF"8b")
> +   (QI   "8b")  (HI"8b")
> (V4BF "8b")  (V8BF  "16b")])
>  
>  ;; Advanced SIMD vector structure to element modes.
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c 
> b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> index 
> fb14ec3e2210e0feeff80f2410d777d3046a9f78..5e253d3059cfc9b93bd0865e6eaed1231eba19bd
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> @@ -20,7 +20,7 @@ float32_t f1 (float32_t a)
>  
>  /*
>  ** f2:
> -**   fmovd[0-9]+, xzr
> +**   moviv[0-9]+.2d, #0
>  **   fnegv[0-9]+.2d, v[0-9]+.2d
>  **   orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>  **   ret
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c 
> b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> index 
> 4ea0105f6c0a9756070bcc60d34f142f53d8242c..c86fe3e032c9e5176467841ce1a679ea47bbd531
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c
> @@ -8,7 +8,7 @@
>  
>  /*
>  ** negabs:
> -**   fmovd[0-9]+, xzr
> +**   moviv31.2d, #0
>  **   fnegv[0-9]+.2d, v[0-9]+.2d
>  **   orr v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
>  **   ret


  1   2   >