[gcc r15-1071] AArch64: correct constraint on Upl early clobber alternatives

2024-06-06 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:afe85f8e22a703280b17c701f3490d89337f674a

commit r15-1071-gafe85f8e22a703280b17c701f3490d89337f674a
Author: Tamar Christina 
Date:   Thu Jun 6 14:35:48 2024 +0100

AArch64: correct constraint on Upl early clobber alternatives

I made an oversight in the previous patch, where I added a ?Upa
alternative to the Upl cases.  This causes it to create the tie
with the larger register file rather than with the constrained one.

This fixes the affected patterns.

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (@aarch64_pred_cmp,
*cmp_cc, *cmp_ptest,
@aarch64_pred_cmp_wide,
*aarch64_pred_cmp_wide_cc,
*aarch64_pred_cmp_wide_ptest): Fix Upl tie 
alternative.
* config/aarch64/aarch64-sve2.md 
(@aarch64_pred_): Fix
Upl tie alternative.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md  | 64 +++---
 gcc/config/aarch64/aarch64-sve2.md |  2 +-
 2 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index d902bce62fd..d69db34016a 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8134,13 +8134,13 @@
  UNSPEC_PRED_Z))
(clobber (reg:CC_NZC CC_REGNUM))]
   "TARGET_SVE"
-  {@ [ cons: =0 , 1   , 3 , 4; attrs: pred_clobber ]
- [  , Upl , w , ; yes ] 
cmp\t%0., %1/z, %3., #%4
- [ ?Upa , 0Upl, w , ; yes ] ^
- [ Upa  , Upl , w , ; no  ] ^
- [  , Upl , w , w; yes ] 
cmp\t%0., %1/z, %3., %4.
- [ ?Upa , 0Upl, w , w; yes ] ^
- [ Upa  , Upl , w , w; no  ] ^
+  {@ [ cons: =0 , 1  , 3 , 4; attrs: pred_clobber ]
+ [  , Upl, w , ; yes ] 
cmp\t%0., %1/z, %3., #%4
+ [ ?Upl , 0  , w , ; yes ] ^
+ [ Upa  , Upl, w , ; no  ] ^
+ [  , Upl, w , w; yes ] 
cmp\t%0., %1/z, %3., %4.
+ [ ?Upl , 0  , w , w; yes ] ^
+ [ Upa  , Upl, w , w; no  ] ^
   }
 )
 
@@ -8170,13 +8170,13 @@
  UNSPEC_PRED_Z))]
   "TARGET_SVE
&& aarch64_sve_same_pred_for_ptest_p ([4], [6])"
-  {@ [ cons: =0 , 1, 2 , 3; attrs: pred_clobber ]
- [  ,  Upl , w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
- [ ?Upa ,  0Upl, w , ; yes ] ^
- [ Upa  ,  Upl , w , ; no  ] ^
- [  ,  Upl , w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
- [ ?Upa ,  0Upl, w , w; yes ] ^
- [ Upa  ,  Upl , w , w; no  ] ^
+  {@ [ cons: =0 , 1   , 2 , 3; attrs: pred_clobber ]
+ [  ,  Upl, w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
+ [ ?Upl ,  0  , w , ; yes ] ^
+ [ Upa  ,  Upl, w , ; no  ] ^
+ [  ,  Upl, w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
+ [ ?Upl ,  0  , w , w; yes ] ^
+ [ Upa  ,  Upl, w , w; no  ] ^
   }
   "&& !rtx_equal_p (operands[4], operands[6])"
   {
@@ -8205,12 +8205,12 @@
   "TARGET_SVE
&& aarch64_sve_same_pred_for_ptest_p ([4], [6])"
   {@ [ cons: =0, 1, 2 , 3; attrs: pred_clobber ]
- [ ,  Upl , w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
- [ ?Upa,  0Upl, w , ; yes ] ^
- [ Upa ,  Upl , w , ; no  ] ^
- [ ,  Upl , w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
- [ ?Upa,  0Upl, w , w; yes ] ^
- [ Upa ,  Upl , w , w; no  ] ^
+ [ ,  Upl, w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
+ [ ?Upl,  0  , w , ; yes ] ^
+ [ Upa ,  Upl, w , ; no  ] ^
+ [ ,  Upl, w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
+ [ ?Upl,  0  , w , w; yes ] ^
+ [ Upa ,  Upl, w , w; no  ] ^
   }
   "&& !rtx_equal_p (operands[4], operands[6])"
   {
@@ -8263,10 +8263,10 @@
  UNSPEC_PRED_Z))
(clobber (reg:CC_NZC CC_REGNUM))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1, 2, 3, 4; attrs: pred_clobber ]
- [ ,  Upl ,  , w, w; yes ] 
cmp\t%0., %1/z, %3., %4.d
- [ ?Upa,  0Upl,  , w, w; yes ] ^
- [ Upa ,  Upl ,  , w, w; no  ] ^
+  {@ [ cons: =0, 1   , 2, 3, 4; attrs: pred_clobber ]
+ 

[PATCH]AArch64: correct constraint on Upl early clobber alternatives

2024-06-06 Thread Tamar Christina
Hi All,

I made an oversight in the previous patch, where I added a ?Upa
alternative to the Upl cases.  This causes it to create the tie
with the larger register file rather than with the constrained one.

This fixes the affected patterns.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Built SPECCPU 2017 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (@aarch64_pred_cmp,
*cmp_cc, *cmp_ptest,
@aarch64_pred_cmp_wide,
*aarch64_pred_cmp_wide_cc,
*aarch64_pred_cmp_wide_ptest): Fix Upl tie alternative.
* config/aarch64/aarch64-sve2.md (@aarch64_pred_): Fix
Upl tie alternative.

---
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
d902bce62fde88b6d85f8d71f305e7fc76a4d34e..d69db34016a55b4324faa129a3ac1f47227ba776
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8134,13 +8134,13 @@ (define_insn "@aarch64_pred_cmp"
  UNSPEC_PRED_Z))
(clobber (reg:CC_NZC CC_REGNUM))]
   "TARGET_SVE"
-  {@ [ cons: =0 , 1   , 3 , 4; attrs: pred_clobber ]
- [  , Upl , w , ; yes ] 
cmp\t%0., %1/z, %3., #%4
- [ ?Upa , 0Upl, w , ; yes ] ^
- [ Upa  , Upl , w , ; no  ] ^
- [  , Upl , w , w; yes ] 
cmp\t%0., %1/z, %3., %4.
- [ ?Upa , 0Upl, w , w; yes ] ^
- [ Upa  , Upl , w , w; no  ] ^
+  {@ [ cons: =0 , 1  , 3 , 4; attrs: pred_clobber ]
+ [  , Upl, w , ; yes ] 
cmp\t%0., %1/z, %3., #%4
+ [ ?Upl , 0  , w , ; yes ] ^
+ [ Upa  , Upl, w , ; no  ] ^
+ [  , Upl, w , w; yes ] 
cmp\t%0., %1/z, %3., %4.
+ [ ?Upl , 0  , w , w; yes ] ^
+ [ Upa  , Upl, w , w; no  ] ^
   }
 )
 
@@ -8170,13 +8170,13 @@ (define_insn_and_rewrite "*cmp_cc"
  UNSPEC_PRED_Z))]
   "TARGET_SVE
&& aarch64_sve_same_pred_for_ptest_p ([4], [6])"
-  {@ [ cons: =0 , 1, 2 , 3; attrs: pred_clobber ]
- [  ,  Upl , w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
- [ ?Upa ,  0Upl, w , ; yes ] ^
- [ Upa  ,  Upl , w , ; no  ] ^
- [  ,  Upl , w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
- [ ?Upa ,  0Upl, w , w; yes ] ^
- [ Upa  ,  Upl , w , w; no  ] ^
+  {@ [ cons: =0 , 1   , 2 , 3; attrs: pred_clobber ]
+ [  ,  Upl, w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
+ [ ?Upl ,  0  , w , ; yes ] ^
+ [ Upa  ,  Upl, w , ; no  ] ^
+ [  ,  Upl, w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
+ [ ?Upl ,  0  , w , w; yes ] ^
+ [ Upa  ,  Upl, w , w; no  ] ^
   }
   "&& !rtx_equal_p (operands[4], operands[6])"
   {
@@ -8205,12 +8205,12 @@ (define_insn_and_rewrite "*cmp_ptest"
   "TARGET_SVE
&& aarch64_sve_same_pred_for_ptest_p ([4], [6])"
   {@ [ cons: =0, 1, 2 , 3; attrs: pred_clobber ]
- [ ,  Upl , w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
- [ ?Upa,  0Upl, w , ; yes ] ^
- [ Upa ,  Upl , w , ; no  ] ^
- [ ,  Upl , w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
- [ ?Upa,  0Upl, w , w; yes ] ^
- [ Upa ,  Upl , w , w; no  ] ^
+ [ ,  Upl, w , ; yes ] 
cmp\t%0., %1/z, %2., #%3
+ [ ?Upl,  0  , w , ; yes ] ^
+ [ Upa ,  Upl, w , ; no  ] ^
+ [ ,  Upl, w , w; yes ] 
cmp\t%0., %1/z, %2., %3.
+ [ ?Upl,  0  , w , w; yes ] ^
+ [ Upa ,  Upl, w , w; no  ] ^
   }
   "&& !rtx_equal_p (operands[4], operands[6])"
   {
@@ -8263,10 +8263,10 @@ (define_insn "@aarch64_pred_cmp_wide"
  UNSPEC_PRED_Z))
(clobber (reg:CC_NZC CC_REGNUM))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1, 2, 3, 4; attrs: pred_clobber ]
- [ ,  Upl ,  , w, w; yes ] 
cmp\t%0., %1/z, %3., %4.d
- [ ?Upa,  0Upl,  , w, w; yes ] ^
- [ Upa ,  Upl ,  , w, w; no  ] ^
+  {@ [ cons: =0, 1   , 2, 3, 4; attrs: pred_clobber ]
+ [ ,  Upl,  , w, w; yes ] 
cmp\t%0., %1/z, %3., %4.d
+ [ ?Upl,  0  ,  , w, w; yes ] ^
+ [ Upa ,  Upl,  , w, w; no  ] ^
   }
 )
 
@@ -8298,10 +8298,10 @@ (define_insn 

[gcc r15-1041] AArch64: enable new predicate tuning for Neoverse cores.

2024-06-05 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:3eb9f6eab9802d5ae65ead6b1f2ae6fe0833e06e

commit r15-1041-g3eb9f6eab9802d5ae65ead6b1f2ae6fe0833e06e
Author: Tamar Christina 
Date:   Wed Jun 5 19:32:16 2024 +0100

AArch64: enable new predicate tuning for Neoverse cores.

This enables the new tuning flag for Neoverse V1, Neoverse V2 and Neoverse 
N2.
It is kept off for generic codegen.

Note the reason for the +sve even though the tests are in aarch64-sve.exp is
that if the testsuite is run with a forced SVE-off option, e.g.
-march=armv8-a+nosve, then the intrinsics end up being disabled because the
-march is preferred over the -mcpu even though the -mcpu comes later.

This prevents the tests from failing in such runs.

gcc/ChangeLog:

* config/aarch64/tuning_models/neoversen2.h (neoversen2_tunings): 
Add
AARCH64_EXTRA_TUNE_AVOID_PRED_RMW.
* config/aarch64/tuning_models/neoversev1.h (neoversev1_tunings): 
Add
AARCH64_EXTRA_TUNE_AVOID_PRED_RMW.
* config/aarch64/tuning_models/neoversev2.h (neoversev2_tunings): 
Add
AARCH64_EXTRA_TUNE_AVOID_PRED_RMW.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/pred_clobber_1.c: New test.
* gcc.target/aarch64/sve/pred_clobber_2.c: New test.
* gcc.target/aarch64/sve/pred_clobber_3.c: New test.
* gcc.target/aarch64/sve/pred_clobber_4.c: New test.

Diff:
---
 gcc/config/aarch64/tuning_models/neoversen2.h  |  3 ++-
 gcc/config/aarch64/tuning_models/neoversev1.h  |  3 ++-
 gcc/config/aarch64/tuning_models/neoversev2.h  |  3 ++-
 .../gcc.target/aarch64/sve/pred_clobber_1.c| 22 +
 .../gcc.target/aarch64/sve/pred_clobber_2.c| 22 +
 .../gcc.target/aarch64/sve/pred_clobber_3.c| 23 ++
 .../gcc.target/aarch64/sve/pred_clobber_4.c| 22 +
 7 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h 
b/gcc/config/aarch64/tuning_models/neoversen2.h
index 7e799bbe762..be9a48ac3ad 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -236,7 +236,8 @@ static const struct tune_params neoversen2_tunings =
   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),   /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS   /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h 
b/gcc/config/aarch64/tuning_models/neoversev1.h
index 9363f2ad98a..0fc41ce6a41 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -227,7 +227,8 @@ static const struct tune_params neoversev1_tunings =
   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),   /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),   /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS/* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h 
b/gcc/config/aarch64/tuning_models/neoversev2.h
index bc01ed767c9..f76e4ef358f 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -236,7 +236,8 @@ static const struct tune_params neoversev2_tunings =
   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),   /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS   /* stp_policy_model.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c
new file mode 100644
index 000..25129e8d6f2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=neoverse-n2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC target "+sve"
+
+#include 
+
+extern void use(svbool_t);
+
+/*
+** foo:
+** ...
+** ptrue   p([1-3]).b, all
+** cmplo   p0.h, p\1/z, z

[gcc r15-1040] AArch64: add new alternative with early clobber to patterns

2024-06-05 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:2de3bbde1ebea8689f3596967769f66bf903458e

commit r15-1040-g2de3bbde1ebea8689f3596967769f66bf903458e
Author: Tamar Christina 
Date:   Wed Jun 5 19:31:39 2024 +0100

AArch64: add new alternative with early clobber to patterns

This patch adds new alternatives to the patterns which are affected.  The 
new
alternatives with the conditional early clobbers are added before the normal
ones in order for LRA to prefer them in the event that we have enough free
registers to accommodate them.

In case register pressure is too high the normal alternatives will be
preferred before a reload is considered, as we would rather have the tie than
a spill.

Tests are in the next patch.
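
As an illustration of the kind of code this affects (a sketch modelled on the
pred_clobber_* tests added later in the series, not part of this patch):

#pragma GCC target "+sve"

#include <arm_sve.h>

extern void use (svbool_t);

void
foo (svuint16_t a, uint16_t b)
{
  /* With the early-clobber alternative enabled, the compare result can be
     given a fresh predicate register instead of being tied to the ptrue
     governing predicate, avoiding a predicate-to-predicate move.  */
  svbool_t p = svcmplt_n_u16 (svptrue_b16 (), a, b);
  use (p);
}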

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (and3,
@aarch64_pred__z, *3_cc,
*3_ptest, aarch64_pred__z,
*3_cc, *3_ptest,
aarch64_pred__z, *3_cc,
*3_ptest, @aarch64_pred_cmp,
*cmp_cc, *cmp_ptest,
@aarch64_pred_cmp_wide,
*aarch64_pred_cmp_wide_cc,
*aarch64_pred_cmp_wide_ptest, @aarch64_brk,
*aarch64_brk_cc, *aarch64_brk_ptest,
@aarch64_brk, *aarch64_brk_cc,
*aarch64_brk_ptest, aarch64_rdffr_z, *aarch64_rdffr_z_ptest,
*aarch64_rdffr_ptest, *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add
new early clobber
alternative.
* config/aarch64/aarch64-sve2.md
(@aarch64_pred_): Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md  | 178 +
 gcc/config/aarch64/aarch64-sve2.md |   6 +-
 2 files changed, 124 insertions(+), 60 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index ca4d435e705..d902bce62fd 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1161,8 +1161,10 @@
  (reg:VNx16BI FFRT_REGNUM)
  (match_operand:VNx16BI 1 "register_operand")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffr\t%0.b, %1/z
+  {@ [ cons: =0, 1   ; attrs: pred_clobber ]
+ [ , Upa ; yes ] rdffr\t%0.b, %1/z
+ [ ?Upa, 0Upa; yes ] ^
+ [ Upa , Upa ; no  ] ^
   }
 )
 
@@ -1179,8 +1181,10 @@
  UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1   ; attrs: pred_clobber ]
+ [ , Upa ; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, 0Upa; yes ] ^
+ [ Upa , Upa ; no  ] ^
   }
 )
 
@@ -1195,8 +1199,10 @@
  UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1   ; attrs: pred_clobber ]
+ [ , Upa ; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, 0Upa; yes ] ^
+ [ Upa , Upa ; no  ] ^
   }
 )
 
@@ -1216,8 +1222,10 @@
  (reg:VNx16BI FFRT_REGNUM)
  (match_dup 1)))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1   ; attrs: pred_clobber ]
+ [ , Upa ; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, 0Upa; yes ] ^
+ [ Upa , Upa ; no  ] ^
   }
 )
 
@@ -1233,8 +1241,10 @@
(set (match_operand:VNx16BI 0 "register_operand")
(reg:VNx16BI FFRT_REGNUM))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1   ; attrs: pred_clobber ]
+ [ , Upa ; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, 0Upa; yes ] ^
+ [ Upa , Upa ; no  ] ^
   }
 )
 
@@ -6651,8 +6661,10 @@
(and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
  (match_operand:PRED_ALL 2 "register_operand")))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1  , 2   ]
- [ Upa , Upa, Upa ] and\t%0.b, %1/z, %2.b, %2.b
+  {@ [ cons: =0, 1   , 2   ; attrs: pred_clobber ]
+ [ , Upa , Upa ; yes ] and\t%0.b, %1/z, %2.b, %2.b
+ [ ?Upa, 0Upa, 0Upa; yes ] ^
+ [ Upa , Upa , Upa ; no  ] ^
   }
 )
 
@@ -6679,8 +6691,10 @@
(match_operand:PRED_ALL 3 "register_operand"))
  (match_operand:PRED_ALL 1 "register_operand")))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1  , 2  , 3   ]
- [ Upa , Upa, Upa, Upa ] \t%0.b, %1/z, %2.b, %3.b
+  {@ [ cons: =0, 1   , 2   , 3   ; att

[gcc r15-1039] AArch64: add new tuning param and attribute for enabling conditional early clobber

2024-06-05 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:35f17c680ca650f8658994f857358e5a529c0b93

commit r15-1039-g35f17c680ca650f8658994f857358e5a529c0b93
Author: Tamar Christina 
Date:   Wed Jun 5 19:31:11 2024 +0100

AArch64: add new tuning param and attribute for enabling conditional early 
clobber

This adds a new tuning parameter AARCH64_EXTRA_TUNE_AVOID_PRED_RMW for 
AArch64 to
allow us to conditionally enable the early clobber alternatives based on the
tuning models.

gcc/ChangeLog:

* config/aarch64/aarch64-tuning-flags.def
(AVOID_PRED_RMW): New.
* config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New.
* config/aarch64/aarch64.md (pred_clobber): New.
(arch_enabled): Use it.

Diff:
---
 gcc/config/aarch64/aarch64-tuning-flags.def |  4 
 gcc/config/aarch64/aarch64.h|  5 +
 gcc/config/aarch64/aarch64.md   | 18 --
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index d5bcaebce77..a9f48f5d3d4 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", 
AVOID_CROSS_LOOP_FMA)
 
 AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
 
+/* Enable is the target prefers to use a fresh register for predicate outputs
+   rather than re-use an input predicate register.  */
+AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index bbf11faaf4b..0997b82dbc0 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = 
AARCH64_FL_SM_OFF;
 enabled through +gcs.  */
 #define TARGET_GCS (AARCH64_ISA_GCS)
 
+/* Prefer different predicate registers for the output of a predicated
+   operation over re-using an existing input predicate.  */
+#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
+&& (aarch64_tune_params.extra_tuning_flags \
+& AARCH64_EXTRA_TUNE_AVOID_PRED_RMW))
 
 /* Standard register usage.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 9dff2d7a2b0..389a1906e23 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -445,6 +445,10 @@
 ;; target-independent code.
 (define_attr "is_call" "no,yes" (const_string "no"))
 
+;; Indicates whether we want to enable the pattern with an optional early
+;; clobber for SVE predicates.
+(define_attr "pred_clobber" "any,no,yes" (const_string "any"))
+
 ;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
 ;; registers.
@@ -460,7 +464,17 @@
 
 (define_attr "arch_enabled" "no,yes"
   (if_then_else
-(ior
+(and
+  (ior
+   (and
+ (eq_attr "pred_clobber" "no")
+ (match_test "!TARGET_SVE_PRED_CLOBBER"))
+   (and
+ (eq_attr "pred_clobber" "yes")
+ (match_test "TARGET_SVE_PRED_CLOBBER"))
+   (eq_attr "pred_clobber" "any"))
+
+  (ior
(eq_attr "arch" "any")
 
(and (eq_attr "arch" "rcpc8_4")
@@ -488,7 +502,7 @@
 (match_test "TARGET_SVE"))
 
(and (eq_attr "arch" "sme")
-(match_test "TARGET_SME")))
+(match_test "TARGET_SME"
 (const_string "yes")
 (const_string "no")))


[gcc r15-1038] AArch64: convert several predicate patterns to new compact syntax

2024-06-05 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:fd4898891ae0c73d6b7aa433cd1ef4539aaa2457

commit r15-1038-gfd4898891ae0c73d6b7aa433cd1ef4539aaa2457
Author: Tamar Christina 
Date:   Wed Jun 5 19:30:39 2024 +0100

AArch64: convert several predicate patterns to new compact syntax

This converts the single alternative patterns to the new compact syntax such
that when I add the new alternatives it's clearer what's being changed.

Note that this will spew out a bunch of warnings from geninsn as it'll warn 
that
@ is useless for a single alternative pattern.  These are not fatal so won't
break the build and are only temporary.

No change in functionality is expected with this patch.

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (and3,
@aarch64_pred__z, *3_cc,
*3_ptest, aarch64_pred__z,
*3_cc, *3_ptest,
aarch64_pred__z, *3_cc,
*3_ptest, *cmp_ptest,
@aarch64_pred_cmp_wide,
*aarch64_pred_cmp_wide_cc,
*aarch64_pred_cmp_wide_ptest, *aarch64_brk_cc,
*aarch64_brk_ptest, @aarch64_brk,
*aarch64_brk_cc, *aarch64_brk_ptest, 
aarch64_rdffr_z,
*aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, *aarch64_rdffr_z_cc,
*aarch64_rdffr_cc): Convert to compact syntax.
* config/aarch64/aarch64-sve2.md
(@aarch64_pred_): Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md  | 262 ++---
 gcc/config/aarch64/aarch64-sve2.md |  12 +-
 2 files changed, 161 insertions(+), 113 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 0434358122d..ca4d435e705 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1156,76 +1156,86 @@
 
 ;; Likewise with zero predication.
 (define_insn "aarch64_rdffr_z"
-  [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+  [(set (match_operand:VNx16BI 0 "register_operand")
(and:VNx16BI
  (reg:VNx16BI FFRT_REGNUM)
- (match_operand:VNx16BI 1 "register_operand" "Upa")))]
+ (match_operand:VNx16BI 1 "register_operand")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffr\t%0.b, %1/z"
+  {@ [ cons: =0, 1   ]
+ [ Upa , Upa ] rdffr\t%0.b, %1/z
+  }
 )
 
 ;; Read the FFR to test for a fault, without using the predicate result.
 (define_insn "*aarch64_rdffr_z_ptest"
   [(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
- [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ [(match_operand:VNx16BI 1 "register_operand")
   (match_dup 1)
   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
   (and:VNx16BI
 (reg:VNx16BI FFRT_REGNUM)
 (match_dup 1))]
  UNSPEC_PTEST))
-   (clobber (match_scratch:VNx16BI 0 "=Upa"))]
+   (clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffrs\t%0.b, %1/z"
+  {@ [ cons: =0, 1   ]
+ [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  }
 )
 
 ;; Same for unpredicated RDFFR when tested with a known PTRUE.
 (define_insn "*aarch64_rdffr_ptest"
   [(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
- [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ [(match_operand:VNx16BI 1 "register_operand")
   (match_dup 1)
   (const_int SVE_KNOWN_PTRUE)
   (reg:VNx16BI FFRT_REGNUM)]
  UNSPEC_PTEST))
-   (clobber (match_scratch:VNx16BI 0 "=Upa"))]
+   (clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffrs\t%0.b, %1/z"
+  {@ [ cons: =0, 1   ]
+ [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  }
 )
 
 ;; Read the FFR with zero predication and test the result.
 (define_insn "*aarch64_rdffr_z_cc"
   [(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
- [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ [(match_operand:VNx16BI 1 "register_operand")
   (match_dup 1)
   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
   (and:VNx16BI
 (reg:VNx16BI FFRT_REGNUM)
 (match_dup 1))]
  UNSPEC_PTEST))
-   (set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+   (set (match_operand:VNx16BI 0 "register_operand")
(and:VNx16BI
  (reg:VNx16BI FFRT_REGNUM)
  (match_dup 1)))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffrs\t%0.b, %1/z"
+  {@ [ cons: =0, 1   ]
+ [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  }
 )
 
 ;; Same for unpredicated RDFFR when tested with a known PTRUE.
 (define_insn "*aarch64_rdffr_cc"
   [(set (re

RE: [PATCH] Rearrange SLP nodes with duplicate statements. [PR98138]

2024-06-05 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, June 5, 2024 9:07 AM
> To: Manolis Tsamis 
> Cc: gcc-patches@gcc.gnu.org; Christoph Müllner ;
> Kewen . Lin ; Philipp Tomsich ;
> Tamar Christina ; Jiangning Liu
> 
> Subject: Re: [PATCH] Rearrange SLP nodes with duplicate statements. [PR98138]
> 
> On Tue, 4 Jun 2024, Manolis Tsamis wrote:
> 
> > This change adds a function that checks for SLP nodes with multiple 
> > occurrences
> > of the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the 
> > node
> > so that there are no duplicates. A vec_perm is then introduced to recreate 
> > the
> > original ordering. These duplicates can appear due to how two_operators 
> > nodes
> > are handled, and they prevent vectorization in some cases.
> 
> So the trick is that when we have two operands we elide duplicate lanes
> so we can do discovery for a single combined operand instead which we
> then decompose into the required two again.  That's a nice one.
> 
> But as implemented this will fail SLP discovery if the combined operand
> fails discovery possibly because of divergence in downstream defs.  That
> is, it doesn't fall back to separate discovery.  I suspect the situation
> of duplicate lanes isn't common but then I would also suspect that
> divergence _is_ common.

I think we should also look at the cases where vectorization itself failed
because the generated tree ends up with an unsupported load.

i.e. in this particular case we would have failed SLP at a later step.
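
For readers of the archive, a minimal sketch (not the patch itself) of the
lane-deduplication idea described above: collapse repeated lanes such as
{A, B, A, B} into a unique set and record the permutation that recreates the
original order, which the patch then materializes as a vec_perm.  The helper
name and the pointer-based lane representation are purely illustrative:

#include <stddef.h>

/* Map each original lane to its position in the deduplicated lane set and
   return the number of unique lanes.  */
static size_t
dedup_lanes (void *const *lanes, size_t n, void **unique, size_t *perm)
{
  size_t n_unique = 0;
  for (size_t i = 0; i < n; i++)
    {
      size_t j;
      for (j = 0; j < n_unique; j++)
	if (unique[j] == lanes[i])
	  break;
      if (j == n_unique)
	unique[n_unique++] = lanes[i];
      /* Lane i of the original node is lane j of the deduplicated node.  */
      perm[i] = j;
    }
  return n_unique;
}

For {A, B, A, B} this gives the unique set {A, B} and perm = {0, 1, 0, 1};
discovery then runs on the two-lane operand, and a vec_perm with those
indices restores the original lane order.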

> 
> The discovery code is already quite complex with the way it possibly
> swaps operands of lanes, fitting in this as another variant to try (first)
> is likely going to be a bit awkward.  A way out might be to split the
> function or to make the re-try in the caller which could indicate whether
> to apply this pattern trick or not.  That said - can you try to get
> data on how often the trick applies and discovery succeeds and how
> often discovery fails but discovery would succeed without applying the
> pattern (say, on SPEC)?
> 
> I also suppose instead of hardcoding three patterns for a fixed
> size it should be possible to see there's
> only (at most) half unique lanes in both operands (and one less in one
> operand if the number of lanes is odd) and compute the un-swizzling lane
> permutes during this discovery, removing the need of the explicit enum
> and open-coding each case?
> 
> Another general note is that trying (and then undoing on fail) such tricks
> eats at the discovery limit we have in place to avoid exponential run-off
> in exactly these degenerate cases.

I suppose this is typically a case where changing to merging multiple
single-lane SLPs instead of creating the multi-lane graph in one go would
make things easier?

Isn't SLP discovery computationally expensive since it has to create the full
graph in one go, whereas with merging you just rotate some subgraphs or
eventually just keep the single lane separate?

Cheers,
Tamar

> 
> Thanks,
> Richard.
> 
> > This targets the vectorization of the SPEC2017 x264 pixel_satd functions.
> > In some processors a larger than 10% improvement on x264 has been observed.
> >
> > See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138
> >
> > gcc/ChangeLog:
> >
> > * tree-vect-slp.cc (enum slp_oprnd_pattern): new enum for
> rearrangement
> > patterns.
> > (try_rearrange_oprnd_info): Detect if a node corresponds to one of the
> > patterns.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/vect-slp-two-operator.c: New test.
> >
> > Signed-off-by: Manolis Tsamis 
> > ---
> >
> >  .../aarch64/vect-slp-two-operator.c   |  42 
> >  gcc/tree-vect-slp.cc  | 234 ++
> >  2 files changed, 276 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
> b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
> > new file mode 100644
> > index 000..2db066a0b6e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
> > @@ -0,0 +1,42 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect -fdump-tree-vect-
> details" } */
> > +
> > +typedef unsigned char uint8_t;
> > +typedef unsigned int uint32_t;
> > +
> > +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
> > +int t0 = s0 + s1;\
> > +int t1 = s0 - s1;\
> > +int t2 = s2 + s3;\
>

RE: [PATCH] [RFC] lower SLP load permutation to interleaving

2024-06-05 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, June 4, 2024 3:33 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Sandiford ; Tamar Christina
> 
> Subject: [PATCH] [RFC] lower SLP load permutation to interleaving
> 
> The following emulates classical interleaving for SLP load permutes
> that we are unlikely handling natively.  This is to handle cases
> where interleaving (or load/store-lanes) is the optimal choice for
> vectorizing even when we are doing that within SLP.  An example
> would be
> 
> void foo (int * __restrict a, int * b)
> {
>   for (int i = 0; i < 16; ++i)
> {
>   a[4*i + 0] = b[4*i + 0] * 3;
>   a[4*i + 1] = b[4*i + 1] + 3;
>   a[4*i + 2] = (b[4*i + 2] * 3 + 3);
>   a[4*i + 3] = b[4*i + 3] * 3;
> }
> }
> 
> where currently the SLP store is merging four single-lane SLP
> sub-graphs but none of the loads in it can be code-generated
> with V4SImode vectors and a VF of four as the permutes would need
> three vectors.
> 
> The patch introduces a lowering phase after SLP discovery but
> before SLP pattern recognition or permute optimization that
> analyzes all loads from the same dataref group and creates an
> interleaving scheme starting from an unpermuted load.
> 
> What can be handled is quite restrictive, matching only a subset
> of the non-SLP interleaving cases (the power-of-two group size
> ones, in addition only cases without gaps).  The interleaving
> vectorization in addition can handle size 3 and 5 - but I am not
> sure if it's possible to do that in a VL agnostic way.  It
> should be still possible to set up the SLP graph in a way that
> a load-lane could be matched from SLP pattern recognition.
> 
> As said gaps are currently not handled - for SLP we have a
> representational issue that SLP_TREE_SCALAR_STMTS for "gap lanes"
> would need to be filled in some way (even if we just push NULL).
> 
> The patch misses multi-level even/odd handling as well as CSEing
> intermediate generated permutes.  Both is quite straight-forward
> to add, but eventually there's a better or more general strategy
> for lowering?  The main goal of the patch is to avoid falling
> back to non-SLP for cases the interleaving code handles.

I guess not handling CSEing the intermediate permutes only really
matters for pattern matching?  Those could be eliminated in optimize_slp?
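
For context, a minimal sketch of the even/odd ("classical interleaving")
scheme described in the quoted text, assuming GCC vector extensions and
__builtin_shufflevector (available since GCC 12).  The function names are
illustrative only, and note that the quoted text says multi-level even/odd
handling is not yet implemented in the patch:

typedef int v4si __attribute__ ((vector_size (16)));

/* One even/odd step: split the concatenation of A and B into its
   even-indexed and odd-indexed elements.  */
static inline void
even_odd_step (v4si a, v4si b, v4si *even, v4si *odd)
{
  /* Indices 0..3 select from A, 4..7 from B.  */
  *even = __builtin_shufflevector (a, b, 0, 2, 4, 6);
  *odd  = __builtin_shufflevector (a, b, 1, 3, 5, 7);
}

/* Recover the four lanes of the group b[4*i + 0..3], i = 0..3, from
   unpermuted contiguous loads using log2(4) = 2 even/odd levels.  */
static void
deinterleave4 (const int *b, v4si lane[4])
{
  v4si x0, x1, x2, x3;
  __builtin_memcpy (&x0, b + 0, sizeof x0);   /* { b0  b1  b2  b3  } */
  __builtin_memcpy (&x1, b + 4, sizeof x1);   /* { b4  b5  b6  b7  } */
  __builtin_memcpy (&x2, b + 8, sizeof x2);   /* { b8  b9  b10 b11 } */
  __builtin_memcpy (&x3, b + 12, sizeof x3);  /* { b12 b13 b14 b15 } */

  v4si e01, o01, e23, o23;
  even_odd_step (x0, x1, &e01, &o01);  /* { b0 b2 b4 b6 } and { b1 b3 b5 b7 } */
  even_odd_step (x2, x3, &e23, &o23);

  even_odd_step (e01, e23, &lane[0], &lane[2]);  /* lane 0 = { b0 b4 b8 b12 } */
  even_odd_step (o01, o23, &lane[1], &lane[3]);  /* lane 1 = { b1 b5 b9 b13 } */
}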

> 
> Comments and suggestions welcome, esp. what representation
> you'd think is suitable for SLP pattern matching to
> load/store-lane and how to represent that?  Maybe this lowering
> should happen directly in vect_lower_load_permutations?

I like this representation personally; I'd say having the permute explicit,
at least until optimize_slp, would make pattern matching easier.

We wouldn't need hacks such as optimize_load_redistribution.
In that sense, does it make sense to eventually just lower all permuted
loads?

Cheers,
Tamar

> 
> Thanks,
> Richard.
> 
>   * tree-vect-slp.cc (vllp_cmp): New function.
>   (vect_lower_load_permutations): Likewise.
>   (vect_analyze_slp): Call it.
> ---
>  gcc/tree-vect-slp.cc | 279
> +++
>  1 file changed, 279 insertions(+)
> 
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 7e3d0107b4e..766b773452f 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -3839,6 +3839,279 @@ vect_analyze_slp_instance (vec_info *vinfo,
>return res;
>  }
> 
> +/* qsort comparator ordering SLP load nodes.  */
> +
> +static int
> +vllp_cmp (const void *a_, const void *b_)
> +{
> +  const slp_tree a = *(const slp_tree *)a_;
> +  const slp_tree b = *(const slp_tree *)b_;
> +  stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
> +  stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
> +  if (STMT_VINFO_GROUPED_ACCESS (a0)
> +  && STMT_VINFO_GROUPED_ACCESS (b0)
> +  && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
> +{
> +  /* Same group, order after lanes used.  */
> +  if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
> + return 1;
> +  else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
> + return -1;
> +  else
> + {
> +   /* Try to order loads using the same lanes together, breaking
> +  the tie with the lane number that first differs.  */
> +   if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
> +   && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
> + return 0;
> +   else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
> +&& !SLP_TREE_LOAD_PERMUTATION (b).exists ())
> + return 1;
> +   else if (!SLP_TREE_LOAD_PERMUTATION (a).exist

RE: [PATCH 3/4]AArch64: add new alternative with early clobber to patterns

2024-05-28 Thread Tamar Christina


> -Original Message-
> From: Richard Sandiford 
> Sent: Wednesday, May 22, 2024 12:24 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH 3/4]AArch64: add new alternative with early clobber to
> patterns
> 
> Tamar Christina  writes:
> >> -Original Message-
> >> From: Richard Sandiford 
> >> Sent: Wednesday, May 22, 2024 10:48 AM
> >> To: Tamar Christina 
> >> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> >> ; Marcus Shawcroft
> >> ; ktkac...@gcc.gnu.org
> >> Subject: Re: [PATCH 3/4]AArch64: add new alternative with early clobber to
> >> patterns
> >>
> >> Tamar Christina  writes:
> >> > Hi All,
> >> >
> >> > This patch adds new alternatives to the patterns which are affected.  
> >> > The new
> >> > alternatives with the conditional early clobbers are added before the 
> >> > normal
> >> > ones in order for LRA to prefer them in the event that we have enough 
> >> > free
> >> > registers to accommodate them.
> >> >
> >> > In case register pressure is too high the normal alternatives will be 
> >> > preferred
> >> > before a reload is considered as we rather have the tie than a spill.
> >> >
> >> > Tests are in the next patch.
> >> >
> >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >> >
> >> > Ok for master?
> >> >
> >> > Thanks,
> >> > Tamar
> >> >
> >> > gcc/ChangeLog:
> >> >
> >> >  * config/aarch64/aarch64-sve.md (and3,
> >> >  @aarch64_pred__z, *3_cc,
> >> >  *3_ptest, aarch64_pred__z,
> >> >  *3_cc, *3_ptest,
> >> >  aarch64_pred__z, *3_cc,
> >> >  *3_ptest, @aarch64_pred_cmp,
> >> >  *cmp_cc, *cmp_ptest,
> >> >  @aarch64_pred_cmp_wide,
> >> >  *aarch64_pred_cmp_wide_cc,
> >> >  *aarch64_pred_cmp_wide_ptest,
> >> @aarch64_brk,
> >> >  *aarch64_brk_cc, *aarch64_brk_ptest,
> >> >  @aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest,
> >> >  *aarch64_brk_cc, *aarch64_brk_ptest,
> >> >  aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest,
> >> >  *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber
> >> >  alternative.
> >> >  * config/aarch64/aarch64-sve2.md
> >> >  (@aarch64_pred_): Likewise.
> >> >
> >> > ---
> >> > diff --git a/gcc/config/aarch64/aarch64-sve.md
> b/gcc/config/aarch64/aarch64-
> >> sve.md
> >> > index
> >>
> e3085c0c636f1317409bbf3b5fbaf5342a2df1f6..8fdc1bc3cd43acfcd675a18350c
> >> 297428c85fe46 100644
> >> > --- a/gcc/config/aarch64/aarch64-sve.md
> >> > +++ b/gcc/config/aarch64/aarch64-sve.md
> >> > @@ -1161,8 +1161,10 @@ (define_insn "aarch64_rdffr_z"
> >> >(reg:VNx16BI FFRT_REGNUM)
> >> >(match_operand:VNx16BI 1 "register_operand")))]
> >> >"TARGET_SVE && TARGET_NON_STREAMING"
> >> > -  {@ [ cons: =0, 1   ]
> >> > - [ Upa , Upa ] rdffr\t%0.b, %1/z
> >> > +  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
> >> > + [ , Upa; yes ] rdffr\t%0.b, %1/z
> >> > + [ ?Upa, Upa; yes ] ^
> >> > + [ Upa , Upa; *   ] ^
> >> >}
> >> >  )
> >>
> >> Sorry for not explaining it very well, but in the previous review I 
> >> suggested:
> >>
> >> > The gather-like approach would be something like:
> >> >
> >> >  [  , Upl , w , ; yes ]
> >> cmp\t%0., %1/z, %3., #%4
> >> >  [ ?Upl , 0   , w , ; yes ] ^
> >> >  [ Upa  , Upl , w , ; no  ] ^
> >> >  [  , Upl , w , w; yes ] 
> >> > cmp\t%0.,
> %1/z,
> >> %3., %4.
> >> >  [ ?Upl , 0   , w , w; yes ] ^
> >> >  [ Upa  , Upl , w , w; no  ] ^
> >> >
> >> > with:
> >> >
> >> >   (define_attr "pred_clobber" "any,no,yes" (const_string "any"))
> >>
&

RE: [PATCH 2/4]AArch64: add new tuning param and attribute for enabling conditional early clobber

2024-05-28 Thread Tamar Christina
> -Original Message-
> From: Tamar Christina 
> Sent: Wednesday, May 22, 2024 10:29 AM
> To: Richard Sandiford 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; ktkac...@gcc.gnu.org
> Subject: RE: [PATCH 2/4]AArch64: add new tuning param and attribute for
> enabling conditional early clobber
> 
> >
> > Sorry for the bike-shedding, but how about something like "avoid_pred_rmw"?
> > (I'm open to other suggestions.)  Just looking for something that describes
> > either the architecture or the end result that we want to achieve.
> > And preferable something fairly short :)
> >
> > avoid_* would be consistent with the existing "avoid_cross_loop_fma".
> >
> > > +
> > >  #undef AARCH64_EXTRA_TUNING_OPTION
> > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> > > index
> >
> bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d5
> > 6b46c74084ba7c3c 100644
> > > --- a/gcc/config/aarch64/aarch64.h
> > > +++ b/gcc/config/aarch64/aarch64.h
> > > @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE =
> > AARCH64_FL_SM_OFF;
> > >  enabled through +gcs.  */
> > >  #define TARGET_GCS (AARCH64_ISA_GCS)
> > >
> > > +/*  Prefer different predicate registers for the output of a predicated 
> > > operation
> > over
> > > +re-using an existing input predicate.  */
> > > +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
> > > +  && (aarch64_tune_params.extra_tuning_flags \
> > > +  &
> > AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST))
> > >
> > >  /* Standard register usage.  */
> > >
> > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> > > index
> >
> dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a
> > 53473b478c5ddba82 100644
> > > --- a/gcc/config/aarch64/aarch64.md
> > > +++ b/gcc/config/aarch64/aarch64.md
> > > @@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string
> > "any"))
> > >  ;; target-independent code.
> > >  (define_attr "is_call" "no,yes" (const_string "no"))
> > >
> > > +;; Indicates whether we want to enable the pattern with an optional early
> > > +;; clobber for SVE predicates.
> > > +(define_attr "pred_clobber" "no,yes" (const_string "no"))
> > > +
> > >  ;; [For compatibility with Arm in pipeline models]
> > >  ;; Attribute that specifies whether or not the instruction touches fp
> > >  ;; registers.
> > > @@ -461,7 +465,8 @@ (define_attr "fp" "no,yes"
> > >  (define_attr "arch_enabled" "no,yes"
> > >(if_then_else
> > >  (ior
> > > - (eq_attr "arch" "any")
> > > + (and (eq_attr "arch" "any")
> > > +  (eq_attr "pred_clobber" "no"))
> > >
> > >   (and (eq_attr "arch" "rcpc8_4")
> > >(match_test "AARCH64_ISA_RCPC8_4"))
> > > @@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes"
> > >(match_test "TARGET_SVE"))
> > >
> > >   (and (eq_attr "arch" "sme")
> > > -  (match_test "TARGET_SME")))
> > > +  (match_test "TARGET_SME"))
> > > +
> > > + (and (eq_attr "pred_clobber" "yes")
> > > +  (match_test "TARGET_SVE_PRED_CLOBBER")))
> >
> > IMO it'd be bettero handle pred_clobber separately from arch, as a new
> > top-level AND:
> >
> >   (and
> > (ior
> >   (eq_attr "pred_clobber" "no")
> >   (match_test "!TARGET_..."))
> > (ior
> >   ...existing arch tests...))
> >
> 

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-tuning-flags.def
(AVOID_PRED_RMW): New.
* config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New.
* config/aarch64/aarch64.md (pred_clobber): New.
(arch_enabled): Use it.

-- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index 
d5bcaebce770f0b217aac783063d39135f754c77..a9f48f5d3d4ea32f

RE: [PATCH 3/4]AArch64: add new alternative with early clobber to patterns

2024-05-22 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Wednesday, May 22, 2024 10:48 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH 3/4]AArch64: add new alternative with early clobber to
> patterns
> 
> Tamar Christina  writes:
> > Hi All,
> >
> > This patch adds new alternatives to the patterns which are affected.  The 
> > new
> > alternatives with the conditional early clobbers are added before the normal
> > ones in order for LRA to prefer them in the event that we have enough free
> > registers to accommodate them.
> >
> > In case register pressure is too high the normal alternatives will be 
> > preferred
> > before a reload is considered as we rather have the tie than a spill.
> >
> > Tests are in the next patch.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-sve.md (and3,
> > @aarch64_pred__z, *3_cc,
> > *3_ptest, aarch64_pred__z,
> > *3_cc, *3_ptest,
> > aarch64_pred__z, *3_cc,
> > *3_ptest, @aarch64_pred_cmp,
> > *cmp_cc, *cmp_ptest,
> > @aarch64_pred_cmp_wide,
> > *aarch64_pred_cmp_wide_cc,
> > *aarch64_pred_cmp_wide_ptest,
> @aarch64_brk,
> > *aarch64_brk_cc, *aarch64_brk_ptest,
> > @aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest,
> > *aarch64_brk_cc, *aarch64_brk_ptest,
> > aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest,
> > *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber
> > alternative.
> > * config/aarch64/aarch64-sve2.md
> > (@aarch64_pred_): Likewise.
> >
> > ---
> > diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-
> sve.md
> > index
> e3085c0c636f1317409bbf3b5fbaf5342a2df1f6..8fdc1bc3cd43acfcd675a18350c
> 297428c85fe46 100644
> > --- a/gcc/config/aarch64/aarch64-sve.md
> > +++ b/gcc/config/aarch64/aarch64-sve.md
> > @@ -1161,8 +1161,10 @@ (define_insn "aarch64_rdffr_z"
> >   (reg:VNx16BI FFRT_REGNUM)
> >   (match_operand:VNx16BI 1 "register_operand")))]
> >"TARGET_SVE && TARGET_NON_STREAMING"
> > -  {@ [ cons: =0, 1   ]
> > - [ Upa , Upa ] rdffr\t%0.b, %1/z
> > +  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
> > + [ , Upa; yes ] rdffr\t%0.b, %1/z
> > + [ ?Upa, Upa; yes ] ^
> > + [ Upa , Upa; *   ] ^
> >}
> >  )
> 
> Sorry for not explaining it very well, but in the previous review I suggested:
> 
> > The gather-like approach would be something like:
> >
> >  [  , Upl , w , ; yes ]
> cmp\t%0., %1/z, %3., #%4
> >  [ ?Upl , 0   , w , ; yes ] ^
> >  [ Upa  , Upl , w , ; no  ] ^
> >  [  , Upl , w , w; yes ] 
> > cmp\t%0., %1/z,
> %3., %4.
> >  [ ?Upl , 0   , w , w; yes ] ^
> >  [ Upa  , Upl , w , w; no  ] ^
> >
> > with:
> >
> >   (define_attr "pred_clobber" "any,no,yes" (const_string "any"))
> 
> (with emphasis on the last line).  What I didn't say explicitly is
> that "no" should require !TARGET_SVE_PRED_CLOBBER.
> 
> The premise of that review was that we shouldn't enable things like:
> 
>  [ Upa  , Upl , w , w; no  ] ^
> 
> for TARGET_SVE_PRED_CLOBBER since it contradicts the earlyclobber
> alternative.  So we should enable either the pred_clobber=yes
> alternatives or the pred_clobber=no alternatives, but not both.
> 
> The default "any" is then for other non-predicate instructions that
> don't care about TARGET_SVE_PRED_CLOBBER either way.
> 
> In contrast, this patch makes pred_clobber=yes enable the alternatives
> that correctly describe the restriction (good!) but then also enables
> the normal alternatives too, which IMO makes the semantics unclear.

Sure, the reason I still had that is because this ICEs under high register
pressure:

  {@ [ cons: =0 , 1   , 3 , 4; attrs: pred_clobber ]
 [  , Upl , w , ; yes ] 
cmp\t%0., %1/z, %3., #%4
 [ ?Upa , 0   , w , ; yes ] ^
 [ Upa  , Upl , w , ; no  ] ^
 [  

[PATCH 3/4]AArch64: add new alternative with early clobber to patterns

2024-05-22 Thread Tamar Christina
Hi All,

This patch adds new alternatives to the patterns which are affected.  The new
alternatives with the conditional early clobbers are added before the normal
ones in order for LRA to prefer them in the event that we have enough free
registers to accommodate them.

In case register pressure is too high the normal alternatives will be preferred
before a reload is considered, as we would rather have the tie than a spill.

Tests are in the next patch.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (and3,
@aarch64_pred__z, *3_cc,
*3_ptest, aarch64_pred__z,
*3_cc, *3_ptest,
aarch64_pred__z, *3_cc,
*3_ptest, @aarch64_pred_cmp,
*cmp_cc, *cmp_ptest,
@aarch64_pred_cmp_wide,
*aarch64_pred_cmp_wide_cc,
*aarch64_pred_cmp_wide_ptest, @aarch64_brk,
*aarch64_brk_cc, *aarch64_brk_ptest,
@aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest,
*aarch64_brk_cc, *aarch64_brk_ptest,
aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest,
*aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber
alternative.
* config/aarch64/aarch64-sve2.md
(@aarch64_pred_): Likewise.

---
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
e3085c0c636f1317409bbf3b5fbaf5342a2df1f6..8fdc1bc3cd43acfcd675a18350c297428c85fe46
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1161,8 +1161,10 @@ (define_insn "aarch64_rdffr_z"
  (reg:VNx16BI FFRT_REGNUM)
  (match_operand:VNx16BI 1 "register_operand")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffr\t%0.b, %1/z
+  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
+ [ , Upa; yes ] rdffr\t%0.b, %1/z
+ [ ?Upa, Upa; yes ] ^
+ [ Upa , Upa; *   ] ^
   }
 )
 
@@ -1179,8 +1181,10 @@ (define_insn "*aarch64_rdffr_z_ptest"
  UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
+ [ , Upa; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, Upa; yes ] ^
+ [ Upa , Upa; *   ] ^
   }
 )
 
@@ -1195,8 +1199,10 @@ (define_insn "*aarch64_rdffr_ptest"
  UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
+ [ , Upa; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, Upa; yes ] ^
+ [ Upa , Upa; *   ] ^
   }
 )
 
@@ -1216,8 +1222,10 @@ (define_insn "*aarch64_rdffr_z_cc"
  (reg:VNx16BI FFRT_REGNUM)
  (match_dup 1)))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
+ [ , Upa; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, Upa; yes ] ^
+ [ Upa , Upa; *   ] ^
   }
 )
 
@@ -1233,8 +1241,10 @@ (define_insn "*aarch64_rdffr_cc"
(set (match_operand:VNx16BI 0 "register_operand")
(reg:VNx16BI FFRT_REGNUM))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
+ [ , Upa; yes ] rdffrs\t%0.b, %1/z
+ [ ?Upa, Upa; yes ] ^
+ [ Upa , Upa; *   ] ^
   }
 )
 
@@ -6651,8 +6661,10 @@ (define_insn "and3"
(and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
  (match_operand:PRED_ALL 2 "register_operand")))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1  , 2   ]
- [ Upa , Upa, Upa ] and\t%0.b, %1/z, %2.b, %2.b
+  {@ [ cons: =0, 1  , 2  ; attrs: pred_clobber ]
+ [ , Upa, Upa; yes ] and\t%0.b, %1/z, %2.b, %2.b
+ [ ?Upa, Upa, Upa; yes ] ^
+ [ Upa , Upa, Upa; *   ] ^
   }
 )
 
@@ -6679,8 +6691,10 @@ (define_insn "@aarch64_pred__z"
(match_operand:PRED_ALL 3 "register_operand"))
  (match_operand:PRED_ALL 1 "register_operand")))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1  , 2  , 3   ]
- [ Upa , Upa, Upa, Upa ] \t%0.b, %1/z, %2.b, %3.b
+  {@ [ cons: =0, 1  , 2  , 3  ; attrs: pred_clobber ]
+ [ , Upa, Upa, Upa; yes ] \t%0.b, %1/z, 
%2.b, %3.b
+ [ ?Upa, Upa, Upa, Upa; yes ] ^
+ [ Upa , Upa, Upa, Upa; *   ] ^
   }
 )
 
@@ -6703,8 +6717,10 @@ (define_insn "*3_cc"
(and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3))
  

[PATCH 4/4]AArch64: enable new predicate tuning for Neoverse cores.

2024-05-22 Thread Tamar Christina
Hi All,

This enables the new tuning flag for Neoverse V1, Neoverse V2 and Neoverse N2.
It is kept off for generic codegen.

Note the reason for the +sve even though the tests are in aarch64-sve.exp is
that if the testsuite is run with a forced SVE-off option, e.g.
-march=armv8-a+nosve, then the intrinsics end up being disabled because the
-march is preferred over the -mcpu even though the -mcpu comes later.

This prevents the tests from failing in such runs.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/tuning_models/neoversen2.h (neoversen2_tunings): Add
AARCH64_EXTRA_TUNE_AVOID_PRED_RMW.
* config/aarch64/tuning_models/neoversev1.h (neoversev1_tunings): Add
AARCH64_EXTRA_TUNE_AVOID_PRED_RMW.
* config/aarch64/tuning_models/neoversev2.h (neoversev2_tunings): Add
AARCH64_EXTRA_TUNE_AVOID_PRED_RMW.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/pred_clobber_1.c: New test.
* gcc.target/aarch64/sve/pred_clobber_2.c: New test.
* gcc.target/aarch64/sve/pred_clobber_3.c: New test.
* gcc.target/aarch64/sve/pred_clobber_4.c: New test.
* gcc.target/aarch64/sve/pred_clobber_5.c: New test.

---
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h 
b/gcc/config/aarch64/tuning_models/neoversen2.h
index 
7e799bbe762fe862e31befed50e54040a7fd1f2f..be9a48ac3adc097f967c217fe09dcac194d7d14f
 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -236,7 +236,8 @@ static const struct tune_params neoversen2_tunings =
   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),   /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS   /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h 
b/gcc/config/aarch64/tuning_models/neoversev1.h
index 
9363f2ad98a5279cc99f2f9b1509ba921d582e84..0fc41ce6a41b3135fa06d2bda1f517fdf4f8dbcf
 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -227,7 +227,8 @@ static const struct tune_params neoversev1_tunings =
   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),   /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),   /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS/* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h 
b/gcc/config/aarch64/tuning_models/neoversev2.h
index 
bc01ed767c9b690504eb98456402df5d9d64eee3..f76e4ef358f7dfb9c7d7b470ea7240eaa2120f8e
 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -236,7 +236,8 @@ static const struct tune_params neoversev2_tunings =
   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),   /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS   /* stp_policy_model.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c
new file mode 100644
index 
..934a00a38531c5fd4139d99ff33414904b2c104f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=neoverse-n2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC target "+sve"
+
+#include 
+
+extern void use(svbool_t);
+
+/*
+** foo:
+** ...
+** ptrue   p([1-9][0-9]?).b, all
+** cmplo   p0.h, p\1/z, z0.h, z[0-9]+.h
+** ...
+*/
+void foo (svuint16_t a, uint16_t b)
+{
+svbool_t p0 = svcmplt_n_u16 (svptrue_b16 (), a, b);
+use (p0);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c
new file mode 100644
index 
..58badb66a43b1ac50eeec153b9cac44fc831b145
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=neoverse-v2" } */
+/* { 

RE: [PATCH 2/4]AArch64: add new tuning param and attribute for enabling conditional early clobber

2024-05-22 Thread Tamar Christina
> 
> Sorry for the bike-shedding, but how about something like "avoid_pred_rmw"?
> (I'm open to other suggestions.)  Just looking for something that describes
> either the architecture or the end result that we want to achieve.
> And preferable something fairly short :)
> 
> avoid_* would be consistent with the existing "avoid_cross_loop_fma".
> 
> > +
> >  #undef AARCH64_EXTRA_TUNING_OPTION
> > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> > index
> bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d5
> 6b46c74084ba7c3c 100644
> > --- a/gcc/config/aarch64/aarch64.h
> > +++ b/gcc/config/aarch64/aarch64.h
> > @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE =
> AARCH64_FL_SM_OFF;
> >  enabled through +gcs.  */
> >  #define TARGET_GCS (AARCH64_ISA_GCS)
> >
> > +/*  Prefer different predicate registers for the output of a predicated 
> > operation
> over
> > +re-using an existing input predicate.  */
> > +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
> > +&& (aarch64_tune_params.extra_tuning_flags \
> > +&
> AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST))
> >
> >  /* Standard register usage.  */
> >
> > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> > index
> dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a
> 53473b478c5ddba82 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string
> "any"))
> >  ;; target-independent code.
> >  (define_attr "is_call" "no,yes" (const_string "no"))
> >
> > +;; Indicates whether we want to enable the pattern with an optional early
> > +;; clobber for SVE predicates.
> > +(define_attr "pred_clobber" "no,yes" (const_string "no"))
> > +
> >  ;; [For compatibility with Arm in pipeline models]
> >  ;; Attribute that specifies whether or not the instruction touches fp
> >  ;; registers.
> > @@ -461,7 +465,8 @@ (define_attr "fp" "no,yes"
> >  (define_attr "arch_enabled" "no,yes"
> >(if_then_else
> >  (ior
> > -   (eq_attr "arch" "any")
> > +   (and (eq_attr "arch" "any")
> > +(eq_attr "pred_clobber" "no"))
> >
> > (and (eq_attr "arch" "rcpc8_4")
> >  (match_test "AARCH64_ISA_RCPC8_4"))
> > @@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes"
> >  (match_test "TARGET_SVE"))
> >
> > (and (eq_attr "arch" "sme")
> > -(match_test "TARGET_SME")))
> > +(match_test "TARGET_SME"))
> > +
> > +   (and (eq_attr "pred_clobber" "yes")
> > +(match_test "TARGET_SVE_PRED_CLOBBER")))
> 
> IMO it'd be bettero handle pred_clobber separately from arch, as a new
> top-level AND:
> 
>   (and
> (ior
>   (eq_attr "pred_clobber" "no")
>   (match_test "!TARGET_..."))
> (ior
>   ...existing arch tests...))
> 

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-tuning-flags.def
(AVOID_PRED_RMW): New.
* config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New.
* config/aarch64/aarch64.md (pred_clobber): New.
(arch_enabled): Use it.

-- inline copy of patch --

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index 
d5bcaebce770f0b217aac783063d39135f754c77..a9f48f5d3d4ea32fbf53086ba21eab4bc65b6dcb
 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", 
AVOID_CROSS_LOOP_FMA)
 
 AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
 
+/* Enable if the target prefers to use a fresh register for predicate outputs
+   rather than re-use an input predicate register.  */
+AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
bbf11faaf4b4340956094a983f8b0dc2649b2d27..e7669e65d7dae5df2ba42c265079b1856a5c382b
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = 
AARCH64_FL_SM_OFF;
 enabled through +gcs.  */
 #define TARGET_GCS (AARCH64_ISA_GCS)
 
+/*  Prefer different predicate registers for the output of a predicated 
operation over
+re-using an existing input predicate.  */
+#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
+&& (aarch64_tune_params.extra_tuning_flags \
+& AARCH64_EXTRA_TUNE_AVOID_PRED_RMW))
 
 /* Standard register usage.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
dbde066f7478bec51a8703b017ea553aa98be309..52e5adba4172e14b794b5df9394e58ce49ef8b7f
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ 

RE: [RFC] Merge strathegy for all-SLP vectorizer

2024-05-21 Thread Tamar Christina via Gcc



> -Original Message-
> From: Richard Biener 
> Sent: Friday, May 17, 2024 1:54 PM
> To: Richard Sandiford 
> Cc: Richard Biener via Gcc ; Tamar Christina
> 
> Subject: Re: [RFC] Merge strathegy for all-SLP vectorizer
> 
> On Fri, 17 May 2024, Richard Sandiford wrote:
> 
> > Richard Biener via Gcc  writes:
> > > Hi,
> > >
> > > I'd like to discuss how to go forward with getting the vectorizer to
> > > all-SLP for this stage1.  While there is a personal branch with my
> > > ongoing work (users/rguenth/vect-force-slp) branches haven't proved
> > > themselves working well for collaboration.
> >

Yeah, it's hard to keep rebasing and building on top of.

> > Speaking for myself, the problem hasn't been so much the branch as
> > lack of time.  I've been pretty swamped the last eight months of so
> > (except for the time that I took off, which admittedly was quite a
> > bit!), and so I never even got around to properly reading and replying
> > to your message after the Cauldron.  It's been on the "this is important,
> > I should make time to read and understand it properly" list all this time.
> > Sorry about that. :(
> >
> > I'm hoping to have time to work/help out on SLP stuff soon.
> >
> > > The branch isn't ready to be merged in full but I have been picking
> > > improvements to trunk last stage1 and some remaining bits in the past
> > > weeks.  I have refrained from merging code paths that cannot be
> > > exercised on trunk.
> > >
> > > There are two important set of changes on the branch, both critical
> > > to get more testing on non-x86 targets.
> > >
> > >  1. enable single-lane SLP discovery
> > >  2. avoid splitting store groups (9315bfc661432c3 and 4336060fe2db8ec
> > > if you fetch the branch)
> > >

For no# is there a param or is it just the default?  I can run these through
regression today.

> > > The first point is also most annoying on the testsuite since doing
> > > SLP instead of interleaving changes what we dump and thus tests
> > > start to fail in random ways when you switch between both modes.
> > > On the branch single-lane SLP discovery is gated with
> > > --param vect-single-lane-slp.
> > >
> > > The branch has numerous changes to enable single-lane SLP for some
> > > code paths that have SLP not implemented and where I did not bother
> > > to try supporting multi-lane SLP at this point.  It also adds more
> > > SLP discovery entry points.
> > >
> > > I'm not sure how to try merging these pieces to allow others to
> > > more easily help out.  One possibility is to merge
> > > --param vect-single-lane-slp defaulted off and pick dependent
> > > changes even when they cause testsuite regressions with
> > > vect-single-lane-slp=1.  Alternatively adjust the testsuite by
> > > adding --param vect-single-lane-slp=0 and default to 1
> > > (or keep the default).

I guess which one is better depends on whether the parameter goes
away this release? If so I think we should just leave them broken for
now and fix them up when it's the default?

> >
> > FWIW, this one sounds good to me (the default to 1 version).
> > I.e. mechanically add --param vect-single-lane-slp=0 to any tests
> > that fail with the new default.  That means that the test that need
> > fixing are easily greppable for anyone who wants to help.  Sometimes
> > it'll just be a test update.  Sometimes it will be new vectoriser code.
> 
> OK.  Meanwhile I figured the most important part is 2. from above
> since that enables the single-lane in a grouped access (also covering
> single element interleaving).  This will cover all problematical cases
> with respect to vectorizing loads and stores.  It also has less
> testsuite fallout, mainly because we have a lot less coverage for
> grouped stores without SLP.
> 
> So I'll see to produce a mergeable patch for part 2 and post that
> for review next week.

Sounds good!

Thanks for getting the ball rolling on this.
It would be useful to have it in trunk indeed, off by default for now
sounds good because then I can work on trunk for the SLP support
for early break as well.

Cheers,
Tamar

> 
> Thanks,
> Richard.
> 
> > Thanks,
> > Richard
> >
> > > Or require a clean testsuite with
> > > --param vect-single-lane-slp defaulted to 1 but keep the --param
> > > for debugging (and allow FAILs with 0).
> > >
> > > For fun I merged just single-lane discovery of non-grouped stores
> > > and have that enabled by

RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.

2024-05-20 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Wednesday, May 15, 2024 10:31 PM
> To: Tamar Christina 
> Cc: Richard Biener ; gcc-patches@gcc.gnu.org; nd
> ; Richard Earnshaw ; Marcus
> Shawcroft ; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH 0/4]AArch64: support conditional early clobbers on certain
> operations.
> 
> Tamar Christina  writes:
> >> >> On Wed, May 15, 2024 at 12:29 PM Tamar Christina
> >> >>  wrote:
> >> >> >
> >> >> > Hi All,
> >> >> >
> >> >> > Some Neoverse Software Optimization Guides (SWoG) have a clause that
> state
> >> >> > that for predicated operations that also produce a predicate it is 
> >> >> > preferred
> >> >> > that the codegen should use a different register for the destination 
> >> >> > than
> that
> >> >> > of the input predicate in order to avoid a performance overhead.
> >> >> >
> >> >> > This of course has the problem that it increases register pressure 
> >> >> > and so
> >> should
> >> >> > be done with care.  Additionally not all micro-architectures have this
> >> >> > consideration and so it shouldn't be done as a default thing.
> >> >> >
> >> >> > The patch series adds support for doing conditional early clobbers 
> >> >> > through
> a
> >> >> > combination of new alternatives and attributes to control their 
> >> >> > availability.
> >> >>
> >> >> You could have two alternatives, one with early clobber and one with
> >> >> a matching constraint where you'd disparage the matching constraint one?
> >> >>
> >> >
> >> > Yeah, that's what I do, though there's no need to disparage the non-early
> clobber
> >> > alternative as the early clobber alternative will naturally get a 
> >> > penalty if it
> needs a
> >> > reload.
> >>
> >> But I think Richard's suggestion was to disparage the one with a matching
> >> constraint (not the earlyclobber), to reflect the increased cost of
> >> reusing the register.
> >>
> >> We did take that approach for gathers, e.g.:
> >>
> >>  [, Z,   w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s]
> >>  [?w, Z,   0, Ui1, Ui1, Upl] ^
> >>
> >> The (supposed) advantage is that, if register pressure is so tight
> >> that using matching registers is the only alternative, we still
> >> have the opportunity to do that, as a last resort.
> >>
> >> Providing only an earlyclobber version means that using the same
> >> register is prohibited outright.  If no other register is free, the RA
> >> would need to spill something else to free up a temporary register.
> >> And it might then do the equivalent of (pseudo-code):
> >>
> >>   not p1.b, ..., p0.b
> >>   mov p0.d, p1.d
> >>
> >> after spilling what would otherwise have occupied p1.  In that
> >> situation it would be better use:
> >>
> >>   not p0.b, ..., p0.b
> >>
> >> and not introduce the spill of p1.
> >
> > I think I understood what Richi meant, but I thought it was already working 
> > that
> way.
> 
> The suggestion was to use matching constraints (like "0") though,
> whereas the patch doesn't.  I think your argument is that you don't
> need to use matching constraints.  But that's different from the
> suggestion (and from how we handle gathers).
> 
> I was going to say in response to patch 3 (but got distracted, sorry):
> I don't think we should have:
> 
>, Upa, ...
>Upa, Upa, ...
> 
> (taken from the pure logic ops) enabled at the same time.  Even though
> it works for the testcases, I don't think it has well-defined semantics.
> 
> The problem is that, taken on its own, the second alternative says that
> matching operands are free.  And fundamentally, I don't think the costs
> *must* take the earlyclobber alternative over the non-earlyclobber one
> (when costing during IRA, for instance).  In principle, the cheapest
> is best.
> 
> The aim of the gather approach is to make each alternative correct in
> isolation.  In:
> 
>   [, Z,   w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s]
>   [?w, Z,   0, Ui1, Ui1, Upl] ^
> 
> the second alternative says that it is possible to have operands 0
> and 2 be the same vector register, but using that version has th

RE: [PATCH v3] Match: Extract ternary_integer_types_match_p helper func [NFC]

2024-05-20 Thread Tamar Christina



> -Original Message-
> From: pan2...@intel.com 
> Sent: Tuesday, May 21, 2024 2:13 AM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> ; richard.guent...@gmail.com; Pan Li
> 
> Subject: [PATCH v3] Match: Extract ternary_integer_types_match_p helper func
> [NFC]
> 
> From: Pan Li 
> 
> There are sorts of match pattern for SAT related cases,  there will be
> some duplicated code to check the dest, op_0, op_1 are same tree types.
> Aka ternary tree type matches.  Thus, extract one helper function to
> do this and avoid match code duplication.
> 
> The below test suites are passed for this patch:
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 regression test.
> 
> gcc/ChangeLog:
> 
>   * match.pd: Leverage helper func for SAT_ADD match.
>   * tree.cc (ternary_integer_types_match_p): New func impl to
>   check if ternary tree types are all integer.
>   * tree.h (ternary_integer_types_match_p): New func decl.
> 

Thanks, looks good to me! You still need approval from a maintainer..

Cheers,
Tamar

> Signed-off-by: Pan Li 
> ---
>  gcc/match.pd | 28 +++-
>  gcc/tree.cc  | 16 
>  gcc/tree.h   |  5 +
>  3 files changed, 28 insertions(+), 21 deletions(-)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0f9c34fa897..cff67c84498 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -39,7 +39,8 @@ along with GCC; see the file COPYING3.  If not see
> HONOR_NANS
> uniform_vector_p
> expand_vec_cmp_expr_p
> -   bitmask_inv_cst_vector_p)
> +   bitmask_inv_cst_vector_p
> +   ternary_integer_types_match_p)
> 
>  /* Operator lists.  */
>  (define_operator_list tcc_comparison
> @@ -3046,38 +3047,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned Saturation Add */
>  (match (usadd_left_part_1 @0 @1)
>   (plus:c @0 @1)
> - (if (INTEGRAL_TYPE_P (type)
> -  && TYPE_UNSIGNED (TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@1)
> + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED
> (type
> 
>  (match (usadd_left_part_2 @0 @1)
>   (realpart (IFN_ADD_OVERFLOW:c @0 @1))
> - (if (INTEGRAL_TYPE_P (type)
> -  && TYPE_UNSIGNED (TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@1)
> + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED
> (type
> 
>  (match (usadd_right_part_1 @0 @1)
>   (negate (convert (lt (plus:c @0 @1) @0)))
> - (if (INTEGRAL_TYPE_P (type)
> -  && TYPE_UNSIGNED (TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@1)
> + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED
> (type
> 
>  (match (usadd_right_part_1 @0 @1)
>   (negate (convert (gt @0 (plus:c @0 @1
> - (if (INTEGRAL_TYPE_P (type)
> -  && TYPE_UNSIGNED (TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@1)
> + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED
> (type
> 
>  (match (usadd_right_part_2 @0 @1)
>   (negate (convert (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1))
> integer_zerop)))
> - (if (INTEGRAL_TYPE_P (type)
> -  && TYPE_UNSIGNED (TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@1)
> + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED
> (type
> 
>  /* We cannot merge or overload usadd_left_part_1 and usadd_left_part_2
> because the sub part of left_part_2 cannot work with right_part_1.
> diff --git a/gcc/tree.cc b/gcc/tree.cc
> index 6564b002dc1..b59d42c3e47 100644
> --- a/gcc/tree.cc
> +++ b/gcc/tree.cc
> @@ -10622,6 +10622,22 @@ uniform_integer_cst_p (tree t)
>return NULL_TREE;
>  }
> 
> +/* Check if the types T1,  T2 and T3 are effectively the same integer type.
> +   If T1,  T2 or T3 is not a type, the test applies to their TREE_TYPE.  */
> +
> +bool
> +ternary_integer_types_match_p (tree t1, tree t2, tree t3)
> +{
> +  t1 = TYPE_P (t1) ? t1 : TREE_TYPE (t1);
> +  t2 = TYPE_P (t2) ? t2 : TREE_TYPE (t2);
> +  t3 = TYPE_P (t3) ? t3 : TREE_TYPE (t3);
> +
> +  if (!INTEGRAL_TYPE_P (t1) || !INTEGRAL_TYPE_P (t2) || !INTEGRAL_TYPE_P
> (t3))
> +return false;
> +
> +  return types_compatible_p (t1, t2) && types_compatible_p (t2

RE: [PATCH v1 1/2] Match: Support branch form for unsigned SAT_ADD

2024-05-20 Thread Tamar Christina
Hi Pan,

> -Original Message-
> From: pan2...@intel.com 
> Sent: Monday, May 20, 2024 12:01 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> ; richard.guent...@gmail.com; Pan Li
> 
> Subject: [PATCH v1 1/2] Match: Support branch form for unsigned SAT_ADD
> 
> From: Pan Li 
> 
> This patch would like to support the branch form for unsigned
> SAT_ADD.  For example as below:
> 
> uint64_t
> sat_add (uint64_t x, uint64_t y)
> {
>   return (uint64_t) (x + y) >= x ? (x + y) : -1;
> }
> 
> Different to the branchless version,  we leverage the simplify to
> convert the branch version of SAT_ADD into branchless if and only
> if the backend has supported the IFN_SAT_ADD.  Thus,  the backend has
> the ability to choose branch or branchless implementation of .SAT_ADD.
> For example,  some target can take care of branches code more optimally.
> 
> When the target implement the IFN_SAT_ADD for unsigned and before this
> patch:
> uint64_t sat_add_u_1_uint64_t (uint64_t x, uint64_t y)
> {
>   long unsigned int _1;
>   uint64_t _2;
>   __complex__ long unsigned int _6;
>   long unsigned int _7;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _6 = .ADD_OVERFLOW (x_3(D), y_4(D));
>   _1 = REALPART_EXPR <_6>;
>   _7 = IMAGPART_EXPR <_6>;
>   if (_7 == 0)
> goto ; [65.00%]
>   else
> goto ; [35.00%]
> ;;succ:   4
> ;;3
> 
> ;;   basic block 3, loop depth 0
> ;;pred:   2
> ;;succ:   4
> 
> ;;   basic block 4, loop depth 0
> ;;pred:   3
> ;;2
>   # _2 = PHI <18446744073709551615(3), _1(2)>
>   return _2;
> ;;succ:   EXIT
> 
> }
> 
> After this patch:
> uint64_t sat_add (uint64_t x, uint64_t y)
> {
>   long unsigned int _9;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _9 = .SAT_ADD (x_3(D), y_4(D)); [tail call]
>   return _9;
> ;;succ:   EXIT
> }
> 
> The below test suites are passed for this patch:
> * The x86 bootstrap test.
> * The x86 fully regression test.
> * The riscv fully regression test.
> 
> gcc/ChangeLog:
> 
>   * match.pd: Add new simplify to convert branch SAT_ADD into
>   branchless,  if and only if backend implement the IFN.
> 
> Signed-off-by: Pan Li 
> ---
>  gcc/match.pd | 18 ++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0f9c34fa897..0547b57b3a3 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3094,6 +3094,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  (match (unsigned_integer_sat_add @0 @1)
>   (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
> 
> +#if GIMPLE
> +
> +/* Simplify the branch version of SAT_ADD into branchless if and only if
> +   the backend has supported the IFN_SAT_ADD.  Thus, the backend has the
> +   ability to choose branch or branchless implementation of .SAT_ADD.  */
> +
> +(simplify
> + (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep)
> +  (if (direct_internal_fn_supported_p (IFN_SAT_ADD, type,
> OPTIMIZE_FOR_BOTH))
> +   (bit_ior @2 (negate (convert (lt @2 @0))
> +
> +(simplify
> + (cond (le @0 (plus:c@2 @0 @1)) @2 integer_minus_onep)
> +  (if (direct_internal_fn_supported_p (IFN_SAT_ADD, type,
> OPTIMIZE_FOR_BOTH))
> +   (bit_ior @2 (negate (convert (lt @2 @0))
> +
> +#endif

Thanks, this looks good to me!

I'll leave it up to Richard to approve,
Richard: The reason for the direct_internal_fn_supported_p check is that some
targets said that they currently handle the branch version better due to the
lack of some types.  At the time I reasoned it was just a target expansion bug,
but didn't hear anything back.

To be honest, it feels to me like we should do this unconditionally, and just
have the targets that prefer the branch version handle it during expand, since
the patch series provides a canonicalized version now.

This means we can also better support targets that have the vector optab but
not the scalar one, as the above check would fail for those targets.
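For illustration, a rough C sketch (mine, not from the patch) of why the
canonicalization itself is safe: both shapes compute the same unsigned
saturating add, so the only real question is which shape a given target
expands more efficiently.

#include <stdint.h>

/* Branch form, as typically written in user code.  */
uint64_t
sat_add_branch (uint64_t x, uint64_t y)
{
  return (uint64_t) (x + y) >= x ? (x + y) : UINT64_MAX;
}

/* Branchless form, the shape the existing patterns recognize.  */
uint64_t
sat_add_branchless (uint64_t x, uint64_t y)
{
  uint64_t sum = x + y;
  /* On overflow sum < x, so the mask is all-ones and the result
     saturates; otherwise the mask is zero and sum is returned.  */
  return sum | -(uint64_t) (sum < x);
}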

What do you think?

Thanks,
Tamar

> +
>  /* x >  y  &&  x != XXX_MIN  -->  x > y
> x >  y  &&  x == XXX_MIN  -->  false . */
>  (for eqne (eq ne)
> --
> 2.34.1



RE: [PATCH v1] Match: Extract integer_types_ternary_match helper to avoid code dup [NFC]

2024-05-20 Thread Tamar Christina
> -Original Message-
> From: pan2...@intel.com 
> Sent: Sunday, May 19, 2024 5:17 AM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> ; richard.guent...@gmail.com; Pan Li
> 
> Subject: [PATCH v1] Match: Extract integer_types_ternary_match helper to avoid
> code dup [NFC]
> 
> From: Pan Li 
> 
> There are sorts of match pattern for SAT related cases,  there will be
> some duplicated code to check the dest, op_0, op_1 are same tree types.
> Aka ternary tree type matches.  Thus, extract one helper function to
> do this and avoid match code duplication.
> 
> The below test suites are passed for this patch:
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 regression test.
> 
> gcc/ChangeLog:
> 
>   * generic-match-head.cc (integer_types_ternary_match): New helper
>   function to check ternary tree type matches or not.
>   * gimple-match-head.cc (integer_types_ternary_match): Ditto but
>   for match.
>   * match.pd: Leverage above helper function to avoid code dup.
> 

Nice cleanup!

This function isn't part of the machinery of match.pd and is instead part of a 
pattern.
For these things we usually put them in tree.h/tree.cc and declare them at the 
top
of match.pd in the "define_predicates" list.

This will also allow you to get rid of the code duplication.  In addition, such
functions which return a true/false value we consider predicates, and we name
them ending with _p.

See e.g. bitmask_inv_cst_vector_p which is also defined in tree.h/tree.cc.

Cheers,
Tamar

> Signed-off-by: Pan Li 
> ---
>  gcc/generic-match-head.cc | 17 +
>  gcc/gimple-match-head.cc  | 17 +
>  gcc/match.pd  | 25 +
>  3 files changed, 39 insertions(+), 20 deletions(-)
> 
> diff --git a/gcc/generic-match-head.cc b/gcc/generic-match-head.cc
> index 0d3f648fe8d..cdd48c7a5cc 100644
> --- a/gcc/generic-match-head.cc
> +++ b/gcc/generic-match-head.cc
> @@ -59,6 +59,23 @@ types_match (tree t1, tree t2)
>return TYPE_MAIN_VARIANT (t1) == TYPE_MAIN_VARIANT (t2);
>  }
> 
> +/* Routine to determine if the types T1,  T2 and T3 are effectively
> +   the same integer type for GENERIC.  If T1,  T2 or T3 is not a type,
> +   the test applies to their TREE_TYPE.  */
> +
> +static inline bool
> +integer_types_ternary_match (tree t1, tree t2, tree t3)
> +{
> +  t1 = TYPE_P (t1) ? t1 : TREE_TYPE (t1);
> +  t2 = TYPE_P (t2) ? t2 : TREE_TYPE (t2);
> +  t3 = TYPE_P (t3) ? t3 : TREE_TYPE (t3);
> +
> +  if (!INTEGRAL_TYPE_P (t1) || !INTEGRAL_TYPE_P (t2) || !INTEGRAL_TYPE_P
> (t3))
> +return false;
> +
> +  return types_match (t1, t2) && types_match (t1, t3);
> +}
> +
>  /* Return if T has a single use.  For GENERIC, we assume this is
> always true.  */
> 
> diff --git a/gcc/gimple-match-head.cc b/gcc/gimple-match-head.cc
> index 5f8a1a1ad8e..91f2e56b8ef 100644
> --- a/gcc/gimple-match-head.cc
> +++ b/gcc/gimple-match-head.cc
> @@ -79,6 +79,23 @@ types_match (tree t1, tree t2)
>return types_compatible_p (t1, t2);
>  }
> 
> +/* Routine to determine if the types T1,  T2 and T3 are effectively
> +   the same integer type for GIMPLE.  If T1,  T2 or T3 is not a type,
> +   the test applies to their TREE_TYPE.  */
> +
> +static inline bool
> +integer_types_ternary_match (tree t1, tree t2, tree t3)
> +{
> +  t1 = TYPE_P (t1) ? t1 : TREE_TYPE (t1);
> +  t2 = TYPE_P (t2) ? t2 : TREE_TYPE (t2);
> +  t3 = TYPE_P (t3) ? t3 : TREE_TYPE (t3);
> +
> +  if (!INTEGRAL_TYPE_P (t1) || !INTEGRAL_TYPE_P (t2) || !INTEGRAL_TYPE_P
> (t3))
> +return false;
> +
> +  return types_match (t1, t2) && types_match (t1, t3);
> +}
> +
>  /* Return if T has a single use.  For GIMPLE, we also allow any
> non-SSA_NAME (ie constants) and zero uses to cope with uses
> that aren't linked up yet.  */
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 0f9c34fa897..b291e34bbe4 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3046,38 +3046,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* Unsigned Saturation Add */
>  (match (usadd_left_part_1 @0 @1)
>   (plus:c @0 @1)
> - (if (INTEGRAL_TYPE_P (type)
> -  && TYPE_UNSIGNED (TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@0))
> -  && types_match (type, TREE_TYPE (@1)
> + (if (TYPE_UNSIGNED (type) && integer_types_ternary_match (type, @0, @1
> 
>  (match (usadd_left_part_2 @0 @1)
>   (realpart (IFN_ADD_OVERFLOW:c @0 @1))
> - (if (INTEGRAL_TYPE_P (type)
> -  && TYPE_UNSIGNED (TREE_TYPE (@0))
> -  && 

RE: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

2024-05-17 Thread Tamar Christina
Hi Pan,

> 
> Hi Tamar,
> 
> I am trying to add more shape(s) like below branch version for SAT_ADD. I 
> suspect
> that widening_mul may not be the best place to take care of this shape.
> Because after_dom_children almost works on bb but we actually need to find the
> def/use cross the bb.

It actually already does this, see for example optimize_spaceship which 
optimizes
across basic blocks. However...

> 
> Thus, is there any suggestion for branch shape? Add new simplify to match.pd
> works well but it is not recommended per previous discussion.

The previous objection was to introducing the IFNs through match.pd; it doesn't
mean we can't use match.pd to force the versions with branches into branchless
code so the existing patterns can deal with them as-is.

...in this case something like this:

#if GIMPLE
(simplify
 (cond (ge (plus:c@3 @0 @1) @0) @3 integer_minus_onep)
  (if (direct_internal_fn_supported_p (...))
   (bit_ior @3 (negate (...)
#endif

Works better I think.

That is, for targets where we know we can optimize it later on, or do something
with it in the vectorizer, we canonicalize it.  The reason I have it guarded with
the IFN is that some target maintainers objected to replacing the branch code
with branchless code, as their targets can deal with branches more optimally.

Cheers,
Tamar
> 
> Thanks a lot for help!
> 
> Pan
> 
> ---Source code-
> 
> #define SAT_ADD_U_1(T) \
> T sat_add_u_1_##T(T x, T y) \
> { \
>   return (T)(x + y) >= x ? (x + y) : -1; \
> }
> 
> SAT_ADD_U_1(uint16_t)
> 
> ---Gimple-
> 
> uint16_t sat_add_u_1_uint16_t (uint16_t x, uint16_t y)
> {
>   short unsigned int _1;
>   uint16_t _2;
> 
>[local count: 1073741824]:
>   _1 = x_3(D) + y_4(D);
>   if (_1 >= x_3(D))
> goto ; [65.00%]
>   else
> goto ; [35.00%]
> 
>[local count: 697932184]:
> 
>    [local count: 1073741824]:
>   # _2 = PHI <65535(2), _1(3)>
>   return _2;
> }
> 
> Pan
> 
> -Original Message-
> From: Tamar Christina 
> Sent: Wednesday, May 15, 2024 5:12 PM
> To: Li, Pan2 ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com;
> Liu, Hongtao 
> Subject: RE: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned
> scalar int
> 
> Hi Pan,
> 
> Thanks!
> 
> > -Original Message-
> > From: pan2...@intel.com 
> > Sent: Wednesday, May 15, 2024 3:14 AM
> > To: gcc-patches@gcc.gnu.org
> > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> > ; richard.guent...@gmail.com;
> > hongtao@intel.com; Pan Li 
> > Subject: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned
> scalar
> > int
> >
> > From: Pan Li 
> >
> > This patch would like to add the middle-end presentation for the
> > saturation add.  Aka set the result of add to the max when overflow.
> > It will take the pattern similar as below.
> >
> > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> >
> > Take uint8_t as example, we will have:
> >
> > * SAT_ADD (1, 254)   => 255.
> > * SAT_ADD (1, 255)   => 255.
> > * SAT_ADD (2, 255)   => 255.
> > * SAT_ADD (255, 255) => 255.
> >
> > Given below example for the unsigned scalar integer uint64_t:
> >
> > uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> > {
> >   return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> > }
> >
> > Before this patch:
> > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> > {
> >   long unsigned int _1;
> >   _Bool _2;
> >   long unsigned int _3;
> >   long unsigned int _4;
> >   uint64_t _7;
> >   long unsigned int _10;
> >   __complex__ long unsigned int _11;
> >
> > ;;   basic block 2, loop depth 0
> > ;;pred:   ENTRY
> >   _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
> >   _1 = REALPART_EXPR <_11>;
> >   _10 = IMAGPART_EXPR <_11>;
> >   _2 = _10 != 0;
> >   _3 = (long unsigned int) _2;
> >   _4 = -_3;
> >   _7 = _1 | _4;
> >   return _7;
> > ;;succ:   EXIT
> >
> > }
> >
> > After this patch:
> > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> > {
> >   uint64_t _7;
> >
> > ;;   basic block 2, loop depth 0
> > ;;pred:   ENTRY
> >   _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
> >   return _7;
> > ;;succ:   EXIT
> > }
> >
> > The below tests are passed for this patch:
> > 1. The riscv fully regression tests.
> > 3. The x86 bootstrap tests.

RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer

2024-05-17 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Friday, May 17, 2024 10:46 AM
> To: Tamar Christina 
> Cc: Victor Do Nascimento ; gcc-
> patc...@gcc.gnu.org; Richard Sandiford ; Richard
> Earnshaw ; Victor Do Nascimento
> 
> Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in
> autovectorizer
> 
> On Fri, May 17, 2024 at 11:05 AM Tamar Christina
>  wrote:
> >
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Friday, May 17, 2024 6:51 AM
> > > To: Victor Do Nascimento 
> > > Cc: gcc-patches@gcc.gnu.org; Richard Sandiford
> ;
> > > Richard Earnshaw ; Victor Do Nascimento
> > > 
> > > Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in
> > > autovectorizer
> > >
> > > On Thu, May 16, 2024 at 4:40 PM Victor Do Nascimento
> > >  wrote:
> > > >
> > > > From: Victor Do Nascimento 
> > > >
> > > > At present, the compiler offers the `{u|s|us}dot_prod_optab' direct
> > > > optabs for dealing with vectorizable dot product code sequences.  The
> > > > consequence of using a direct optab for this is that backend-pattern
> > > > selection is only ever able to match against one datatype - Either
> > > > that of the operands or of the accumulated value, never both.
> > > >
> > > > With the introduction of the 2-way (un)signed dot-product insn [1][2]
> > > > in AArch64 SVE2, the existing direct opcode approach is no longer
> > > > sufficient for full specification of all the possible dot product
> > > > machine instructions to be matched to the code sequence; a dot product
> > > > resulting in VNx4SI may result from either dot products on VNx16QI or
> > > > VNx8HI values for the 4- and 2-way dot product operations, respectively.
> > > >
> > > > This means that the following example fails autovectorization:
> > > >
> > > > uint32_t foo(int n, uint16_t* data) {
> > > >   uint32_t sum = 0;
> > > >   for (int i=0; i < n; i++) {
> > > > sum += data[i] * data[i];
> > > >   }
> > > >   return sum;
> > > > }
> > > >
> > > > To remedy the issue a new optab is added, tentatively named
> > > > `udot_prod_twoway_optab', whose selection is dependent upon checking
> > > > of both input and output types involved in the operation.
> > >
> > > I don't like this too much.  I'll note we document dot_prod as
> > >
> > > @cindex @code{sdot_prod@var{m}} instruction pattern
> > > @item @samp{sdot_prod@var{m}}
> > >
> > > Compute the sum of the products of two signed elements.
> > > Operand 1 and operand 2 are of the same mode. Their
> > > product, which is of a wider mode, is computed and added to operand 3.
> > > Operand 3 is of a mode equal or wider than the mode of the product. The
> > > result is placed in operand 0, which is of the same mode as operand 3.
> > > @var{m} is the mode of operand 1 and operand 2.
> > >
> > > with no restriction on the wider mode but we don't specify it which is
> > > bad design.  This should have been a convert optab with two modes
> > > from the start - adding a _twoway variant is just a hack.
> >
> > We did discuss this at the time we started implementing it.  There was two
> > options, one was indeed to change it to a convert dot_prod optab, but doing
> > this means we have to update every target that uses it.
> >
> > Now that means 3 ISAs for AArch64, Arm, Arc, c6x, 2 for x86, loongson and
> altivec.
> >
> > Which sure could be possible, but there's also every use in the backends 
> > that
> need
> > to be updated, and tested, which for some targets we don't even know how to
> begin.
> >
> > So it seems very hard to correct dotprod to a convert optab now.
> 
> It's still the correct way to go.  At _least_ your new pattern should
> have been this,
> otherwise what do you do when you have two-way, four-way and eight-way
> variants?
> Add yet another optab?

I guess that's fair, but having the new optab only be a convert optab resulted
in messy code, as everywhere you must check for both variants.

Additionally, that optab would then overlap with the existing optabs since, as
you say, the documentation only says it's of a wider type and doesn't indicate
precision.

So to avoid issues down the line, if the new optab isn't acceptable then we'll
have to do a wholesale conversion.

> 
> Another thing is that when you do it 

RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer

2024-05-17 Thread Tamar Christina
> -Original Message-
> From: Hongtao Liu 
> Sent: Friday, May 17, 2024 3:14 AM
> To: Victor Do Nascimento 
> Cc: gcc-patches@gcc.gnu.org; Richard Sandiford ;
> Richard Earnshaw ; Victor Do Nascimento
> 
> Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in
> autovectorizer
> 
> > >
> > Sorry to chime in, for x86 backend, we defined usdot_prodv16hi, and
> > 2-way dot_prod operations can be generated
> >
> This is the link https://godbolt.org/z/hcWr64vx3, x86 define
> udot_prodv16qi/udot_prod8hi and both 2-way and 4-way dot_prod
> instructions are generated
> 

That's not the same: the 2-way vs 4-way dot_prod issue here is that
e.g. udot_prod8hi can reduce to either DImode or SImode.
udot_prod8hi does not have enough information to distinguish the two, and in
RTL you can't overload the names.  So this is about the ISA having instructions
that overlap on the source mode of the instruction.
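As a rough C illustration (mine, not from the patch): both of the reductions
below start from the same uint16_t source elements, so an optab keyed only on
the source mode cannot tell them apart, even though one wants an SImode and the
other a DImode accumulator element.

#include <stdint.h>

/* 2-way: 16-bit elements, products accumulated into a 32-bit sum.  */
uint32_t
dot_two_way (int n, const uint16_t *data)
{
  uint32_t sum = 0;
  for (int i = 0; i < n; i++)
    sum += (uint32_t) data[i] * data[i];
  return sum;
}

/* 4-way: the same 16-bit elements, accumulated into a 64-bit sum.  */
uint64_t
dot_four_way (int n, const uint16_t *data)
{
  uint64_t sum = 0;
  for (int i = 0; i < n; i++)
    sum += (uint64_t) data[i] * data[i];
  return sum;
}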

Tamar

> 
> --
> BR,
> Hongtao


RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer

2024-05-17 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Friday, May 17, 2024 6:51 AM
> To: Victor Do Nascimento 
> Cc: gcc-patches@gcc.gnu.org; Richard Sandiford ;
> Richard Earnshaw ; Victor Do Nascimento
> 
> Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in
> autovectorizer
> 
> On Thu, May 16, 2024 at 4:40 PM Victor Do Nascimento
>  wrote:
> >
> > From: Victor Do Nascimento 
> >
> > At present, the compiler offers the `{u|s|us}dot_prod_optab' direct
> > optabs for dealing with vectorizable dot product code sequences.  The
> > consequence of using a direct optab for this is that backend-pattern
> > selection is only ever able to match against one datatype - Either
> > that of the operands or of the accumulated value, never both.
> >
> > With the introduction of the 2-way (un)signed dot-product insn [1][2]
> > in AArch64 SVE2, the existing direct opcode approach is no longer
> > sufficient for full specification of all the possible dot product
> > machine instructions to be matched to the code sequence; a dot product
> > resulting in VNx4SI may result from either dot products on VNx16QI or
> > VNx8HI values for the 4- and 2-way dot product operations, respectively.
> >
> > This means that the following example fails autovectorization:
> >
> > uint32_t foo(int n, uint16_t* data) {
> >   uint32_t sum = 0;
> >   for (int i=0; i < n; i++) {
> > sum += data[i] * data[i];
> >   }
> >   return sum;
> > }
> >
> > To remedy the issue a new optab is added, tentatively named
> > `udot_prod_twoway_optab', whose selection is dependent upon checking
> > of both input and output types involved in the operation.
> 
> I don't like this too much.  I'll note we document dot_prod as
> 
> @cindex @code{sdot_prod@var{m}} instruction pattern
> @item @samp{sdot_prod@var{m}}
> 
> Compute the sum of the products of two signed elements.
> Operand 1 and operand 2 are of the same mode. Their
> product, which is of a wider mode, is computed and added to operand 3.
> Operand 3 is of a mode equal or wider than the mode of the product. The
> result is placed in operand 0, which is of the same mode as operand 3.
> @var{m} is the mode of operand 1 and operand 2.
> 
> with no restriction on the wider mode but we don't specify it which is
> bad design.  This should have been a convert optab with two modes
> from the start - adding a _twoway variant is just a hack.

We did discuss this at the time we started implementing it.  There were two
options: one was indeed to change it to a convert dot_prod optab, but doing
this means we have to update every target that uses it.

Now that means 3 ISAs for AArch64, Arm, Arc, c6x, 2 for x86, loongson and 
altivec.

Which sure could be possible, but there's also every use in the backends that
needs to be updated and tested, which for some targets we don't even know how
to begin.

So it seems very hard to correct dotprod to a convert optab now.

Tamar

> 
> Richard.
> 
> > In order to minimize changes to the existing codebase,
> > `optab_for_tree_code' is renamed `optab_for_tree_code_1' and a new
> > argument is added to its signature - `const_tree otype', allowing type
> > information to be specified for both input and output types.  The
> > existing nterface is retained by defining a new `optab_for_tree_code',
> > which serves as a shim to `optab_for_tree_code_1', passing old
> > parameters as-is and setting the new `optype' argument to `NULL_TREE'.
> >
> > For DOT_PROD_EXPR tree codes, we can call `optab_for_tree_code_1'
> > directly, passing it both types, adding the internal logic to the
> > function to distinguish between competing optabs.
> >
> > Finally, necessary changes are made to `expand_widen_pattern_expr' to
> > ensure the new icode can be correctly selected, given the new optab.
> >
> > [1] https://developer.arm.com/documentation/ddi0602/2024-03/SVE-
> Instructions/UDOT--2-way--vectors---Unsigned-integer-dot-product-
> > [2] https://developer.arm.com/documentation/ddi0602/2024-03/SVE-
> Instructions/SDOT--2-way--vectors---Signed-integer-dot-product-
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-sve2.md 
> > (@aarch64_sve_dotvnx4sivnx8hi):
> > renamed to `dot_prod_twoway_vnx8hi'.
> > * config/aarch64/aarch64-sve-builtins-base.cc (svdot_impl.expand):
> > update icodes used in line with above rename.
> > * optabs-tree.cc (optab_for_tree_code_1): Renamed
> > `optab_for_tree_code' and added new argument.
> > (optab_for_tree_code): Now a call to `optab_for_tree_code_1'.
> > * optabs-tree.h (optab_for_tree_code_1): New.
> > * optabs.cc (expand_widen_pattern_expr): Expand support for
> > DOT_PROD_EXPR patterns.
> > * optabs.def (udot_prod_twoway_optab): New.
> > (sdot_prod_twoway_optab): Likewise.
> > * tree-vect-patterns.cc (vect_supportable_direct_optab_p): Add
> > support for misc optabs that use two modes.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * 

RE: [PATCH] middle-end: Drop __builtin_pretech calls in autovectorization [PR114061]'

2024-05-16 Thread Tamar Christina
Hi,

> -Original Message-
> From: Victor Do Nascimento 
> Sent: Thursday, May 16, 2024 2:57 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Sandiford ; Richard Earnshaw
> ; Victor Do Nascimento
> 
> Subject: [PATCH] middle-end: Drop __builtin_pretech calls in autovectorization
> [PR114061]'
> 
> At present the autovectorizer fails to vectorize simple loops
> involving calls to `__builtin_prefetch'.  A simple example of such
> loop is given below:
> 
> void foo(double * restrict a, double * restrict b, int n){
>   int i;
>   for(i=0; i<n; i++){
> a[i] = a[i] + b[i];
> __builtin_prefetch(&(b[i+8]));
>   }
> }
> 
> The failure stems from two issues:
> 
> 1. Given that it is typically not possible to fully reason about a
>function call due to the possibility of side effects, the
>autovectorizer does not attempt to vectorize loops which make such
>calls.
> 
>Given the memory reference passed to `__builtin_prefetch', in the
>absence of assurances about its effect on the passed memory
>location the compiler deems the function unsafe to vectorize,
>marking it as clobbering memory in `vect_find_stmt_data_reference'.
>This leads to the failure in autovectorization.
> 
> 2. Notwithstanding the above issue, though the prefetch statement
>would be classed as `vect_unused_in_scope', the loop invariant that
>is used in the address of the prefetch is the scalar loop's and not
>the vector loop's IV. That is, it still uses `i' and not `vec_iv'
>because the instruction wasn't vectorized, causing DCE to think the
>value is live, such that we now have both the vector and scalar loop
>invariant actively used in the loop.
> 
> This patch addresses both of these:
> 
> 1. About the issue regarding the memory clobber, data prefetch does
>not generate faults if its address argument is invalid and does not
>write to memory.  Therefore, it does not alter the internal state
>of the program or its control flow under any circumstance.  As
>such, it is reasonable that the function be marked as not affecting
>memory contents.
> 
>To achieve this, we add the necessary logic to
>`get_references_in_stmt' to ensure that builtin functions are given
>given the same treatment as internal functions.  If the gimple call
>is to a builtin function and its function code is
>`BUILT_IN_PREFETCH', we mark `clobbers_memory' as false.
> 
> 2. Finding precedence in the way clobber statements are handled,
>whereby the vectorizer drops these from both the scalar and
>vectorized versions of a given loop, we choose to drop prefetch
>hints in a similar fashion.  This seems appropriate given how
>software prefetch hints are typically ignored by processors across
>architectures, as they seldom lead to performance gain over their
>hardware counterparts.
> 
>PR target/114061
> 
> gcc/ChangeLog:
> 
>   * tree-data-ref.cc (get_references_in_stmt): set
>   `clobbers_memory' to false for __builtin_prefetch.
>   * tree-vect-loop.cc (vect_transform_loop): Drop all
>   __builtin_prefetch calls from loops.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/vect/vect-prefetch-drop.c: New test.
> ---
>  gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c | 14 ++
>  gcc/tree-data-ref.cc   |  9 +
>  gcc/tree-vect-loop.cc  |  7 ++-
>  3 files changed, 29 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c
> b/gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c
> new file mode 100644
> index 000..57723a8c972
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-additional-options "-march=-O3 -march=armv9.2-a+sve -fdump-tree-
> vect-details" { target { aarch64*-*-* } } } */
> +

See the review about two-way dotprod for comments on this.
However this specific test does not need to check for any assembly instructions.

You're going from being unable to vectorize a function, to being able to 
vectorize
It.

So the `vectorized 1 loops` check is sufficient, then this will work for all 
targets.
This requires a check on vect_double (see gcc/testsuite/lib/target-supports.exp)

I'd also change the loop to just use int, as more targets will support 
vectorizing
those, (and of course at a vect_int check instead)

> +void foo(double * restrict a, double * restrict b, int n){
> +  int i;
> +  for(i=0; i<n; i++){
> +    a[i] = a[i] + b[i];
> +__builtin_prefetch(&(b[i+8]));
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "prfm" } } */
> +/* { dg-final { scan-assembler "fadd\tz\[0-9\]+.d, p\[0-9\]+/m, z\[0-9\]+.d, 
> z\[0-
> 9\]+.d" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
> diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
> 

RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer

2024-05-16 Thread Tamar Christina
Hi Victor,

> -Original Message-
> From: Victor Do Nascimento 
> Sent: Thursday, May 16, 2024 3:39 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Sandiford ; Richard Earnshaw
> ; Victor Do Nascimento
> 
> Subject: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer
> 
> From: Victor Do Nascimento 
> 
> At present, the compiler offers the `{u|s|us}dot_prod_optab' direct
> optabs for dealing with vectorizable dot product code sequences.  The
> consequence of using a direct optab for this is that backend-pattern
> selection is only ever able to match against one datatype - Either
> that of the operands or of the accumulated value, never both.
> 
> With the introduction of the 2-way (un)signed dot-product insn [1][2]
> in AArch64 SVE2, the existing direct opcode approach is no longer
> sufficient for full specification of all the possible dot product
> machine instructions to be matched to the code sequence; a dot product
> resulting in VNx4SI may result from either dot products on VNx16QI or
> VNx8HI values for the 4- and 2-way dot product operations, respectively.
> 
> This means that the following example fails autovectorization:
> 
> uint32_t foo(int n, uint16_t* data) {
>   uint32_t sum = 0;
>   for (int i=0; i < n; i++) {
> sum += data[i] * data[i];
>   }
>   return sum;
> }
> 
> To remedy the issue a new optab is added, tentatively named
> `udot_prod_twoway_optab', whose selection is dependent upon checking
> of both input and output types involved in the operation.
> 
> In order to minimize changes to the existing codebase,
> `optab_for_tree_code' is renamed `optab_for_tree_code_1' and a new
> argument is added to its signature - `const_tree otype', allowing type
> information to be specified for both input and output types.  The
> existing nterface is retained by defining a new `optab_for_tree_code',
> which serves as a shim to `optab_for_tree_code_1', passing old
> parameters as-is and setting the new `optype' argument to `NULL_TREE'.
> 
> For DOT_PROD_EXPR tree codes, we can call `optab_for_tree_code_1'
> directly, passing it both types, adding the internal logic to the
> function to distinguish between competing optabs.
> 
> Finally, necessary changes are made to `expand_widen_pattern_expr' to
> ensure the new icode can be correctly selected, given the new optab.
> 
> [1] https://developer.arm.com/documentation/ddi0602/2024-03/SVE-
> Instructions/UDOT--2-way--vectors---Unsigned-integer-dot-product-
> [2] https://developer.arm.com/documentation/ddi0602/2024-03/SVE-
> Instructions/SDOT--2-way--vectors---Signed-integer-dot-product-
> 
> gcc/ChangeLog:
> 
>   * config/aarch64/aarch64-sve2.md
> (@aarch64_sve_dotvnx4sivnx8hi):
>   renamed to `dot_prod_twoway_vnx8hi'.
>   * config/aarch64/aarch64-sve-builtins-base.cc (svdot_impl.expand):
>   update icodes used in line with above rename.

Please split the target specific bits from the target agnostic parts.
I.e. this patch series should be split in two.

>   * optabs-tree.cc (optab_for_tree_code_1): Renamed
>   `optab_for_tree_code' and added new argument.
>   (optab_for_tree_code): Now a call to `optab_for_tree_code_1'.
>   * optabs-tree.h (optab_for_tree_code_1): New.
>   * optabs.cc (expand_widen_pattern_expr): Expand support for
>   DOT_PROD_EXPR patterns.
>   * optabs.def (udot_prod_twoway_optab): New.
>   (sdot_prod_twoway_optab): Likewise.
>   * tree-vect-patterns.cc (vect_supportable_direct_optab_p): Add
>   support for misc optabs that use two modes.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/vect/vect-dotprod-twoway.c: New.
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc  |  4 ++--
>  gcc/config/aarch64/aarch64-sve2.md|  2 +-
>  gcc/optabs-tree.cc| 23 --
>  gcc/optabs-tree.h |  2 ++
>  gcc/optabs.cc |  2 +-
>  gcc/optabs.def|  2 ++
>  .../gcc.dg/vect/vect-dotprod-twoway.c | 24 +++
>  gcc/tree-vect-patterns.cc |  2 +-
>  8 files changed, 54 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c
> 
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 0d2edf3f19e..e457db09f66 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -764,8 +764,8 @@ public:
>icode = (e.type_suffix (0).float_p
>  ? CODE_FOR_aarch64_sve_fdotvnx4sfvnx8hf
>  : e.type_suffix (0).unsigned_p
> -? CODE_FOR_aarch64_sve_udotvnx4sivnx8hi
> -: CODE_FOR_aarch64_sve_sdotvnx4sivnx8hi);
> +? CODE_FOR_udot_prod_twoway_vnx8hi
> +: CODE_FOR_sdot_prod_twoway_vnx8hi);
>  return e.use_unpred_insn (icode);
>}
>  };
> diff --git 

RE: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit

2024-05-16 Thread Tamar Christina
> -Original Message-
> From: pan2...@intel.com 
> Sent: Thursday, May 16, 2024 5:06 AM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> ; richard.guent...@gmail.com; Richard Sandiford
> ; Pan Li 
> Subject: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit
> 
> From: Pan Li 
> 
> This patch adds early break auto-vectorization support for target which
> use length on partial vectorization.  Consider this following example:
> 
> unsigned vect_a[802];
> unsigned vect_b[802];
> 
> void test (unsigned x, int n)
> {
>   for (int i = 0; i < n; i++)
>   {
> vect_b[i] = x + i;
> 
> if (vect_a[i] > x)
>   break;
> 
> vect_a[i] = x;
>   }
> }
> 
> We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias).
> And then the IR of RVV looks like below:
> 
>   ...
>   _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
>   _55 = (int) _87;
>   ...
>   mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
>   vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
> {0, ... }, _87, 0);
>   if (vec_len_mask_72 != { 0, ... })
> goto ; [5.50%]
>   else
> goto ; [94.50%]
> 
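In other words, with the all-ones / all-zeros operands used here, the generated
length mask behaves like the following scalar C sketch (an illustration of this
particular use only, not a general definition of VCOND_MASK_LEN):

#include <stdint.h>

/* A lane of the result is active iff the compare mask is set and the
   lane index is below the effective length (len + bias).  */
static void
len_mask_model (int8_t *out, const int8_t *mask, int nunits,
                int len, int bias)
{
  for (int i = 0; i < nunits; i++)
    out[i] = (mask[i] && i < len + bias) ? -1 : 0;
}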
> The below tests are passed for this patch:
> 1. The riscv fully regression tests.
> 2. The x86 bootstrap tests.
> 3. The x86 fully regression tests.
> 
> gcc/ChangeLog:
> 
>   * tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
>   handling for one or multiple stmt.
> 
> gcc/ChangeLog:
> 
>   * tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen
>   the loop len mask.
>   * tree-vect-stmts.cc (vectorizable_early_exit): Invoke the
>   vect_gen_loop_len_mask for 1 or more stmt(s).
>   * tree-vectorizer.h (vect_gen_loop_len_mask): New func decl
>   for vect_gen_loop_len_mask.
> 

Thanks, this version looks good to me!

You'll need Richi's review still.

Cheers,
Tamar

> Signed-off-by: Pan Li 
> ---
>  gcc/tree-vect-loop.cc  | 27 +++
>  gcc/tree-vect-stmts.cc | 17 +++--
>  gcc/tree-vectorizer.h  |  4 
>  3 files changed, 46 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 361aec06488..83c0544b6aa 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -11416,6 +11416,33 @@ vect_get_loop_len (loop_vec_info loop_vinfo,
> gimple_stmt_iterator *gsi,
>return loop_len;
>  }
> 
> +/* Generate the tree for the loop len mask and return it.  Given the lens,
> +   nvectors, vectype, index and factor to gen the len mask as below.
> +
> +   tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
> +*/
> +tree
> +vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
> + gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
> + unsigned int nvectors, tree vectype, tree stmt,
> + unsigned int index, unsigned int factor)
> +{
> +  tree all_one_mask = build_all_ones_cst (vectype);
> +  tree all_zero_mask = build_zero_cst (vectype);
> +  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, 
> index,
> + factor);
> +  tree bias = build_int_cst (intQI_type_node,
> +  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo));
> +  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL,
> "vec_len_mask");
> +  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
> + all_one_mask, all_zero_mask, len,
> + bias);
> +  gimple_call_set_lhs (call, len_mask);
> +  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
> +
> +  return len_mask;
> +}
> +
>  /* Scale profiling counters by estimation for LOOP which is vectorized
> by factor VF.
> If FLAT is true, the loop we started with had unrealistically flat
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index b8a71605f1b..672959501bb 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -12895,7 +12895,9 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  ncopies = vect_get_num_copies (loop_vinfo, vectype);
> 
>vec_loop_masks *masks = _VINFO_MASKS (loop_vinfo);
> +  vec_loop_lens *lens = _VINFO_LENS (loop_vinfo);
>bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
> 
>/* Now build the new conditional.  Pattern gimple_conds get dropped 

RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.

2024-05-15 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Wednesday, May 15, 2024 10:31 PM
> To: Tamar Christina 
> Cc: Richard Biener ; gcc-patches@gcc.gnu.org; nd
> ; Richard Earnshaw ; Marcus
> Shawcroft ; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH 0/4]AArch64: support conditional early clobbers on certain
> operations.
> 
> Tamar Christina  writes:
> >> >> On Wed, May 15, 2024 at 12:29 PM Tamar Christina
> >> >>  wrote:
> >> >> >
> >> >> > Hi All,
> >> >> >
> >> >> > Some Neoverse Software Optimization Guides (SWoG) have a clause that
> state
> >> >> > that for predicated operations that also produce a predicate it is 
> >> >> > preferred
> >> >> > that the codegen should use a different register for the destination 
> >> >> > than
> that
> >> >> > of the input predicate in order to avoid a performance overhead.
> >> >> >
> >> >> > This of course has the problem that it increases register pressure 
> >> >> > and so
> >> should
> >> >> > be done with care.  Additionally not all micro-architectures have this
> >> >> > consideration and so it shouldn't be done as a default thing.
> >> >> >
> >> >> > The patch series adds support for doing conditional early clobbers 
> >> >> > through
> a
> >> >> > combination of new alternatives and attributes to control their 
> >> >> > availability.
> >> >>
> >> >> You could have two alternatives, one with early clobber and one with
> >> >> a matching constraint where you'd disparage the matching constraint one?
> >> >>
> >> >
> >> > Yeah, that's what I do, though there's no need to disparage the non-early
> clobber
> >> > alternative as the early clobber alternative will naturally get a 
> >> > penalty if it
> needs a
> >> > reload.
> >>
> >> But I think Richard's suggestion was to disparage the one with a matching
> >> constraint (not the earlyclobber), to reflect the increased cost of
> >> reusing the register.
> >>
> >> We did take that approach for gathers, e.g.:
> >>
> >>  [, Z,   w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s]
> >>  [?w, Z,   0, Ui1, Ui1, Upl] ^
> >>
> >> The (supposed) advantage is that, if register pressure is so tight
> >> that using matching registers is the only alternative, we still
> >> have the opportunity to do that, as a last resort.
> >>
> >> Providing only an earlyclobber version means that using the same
> >> register is prohibited outright.  If no other register is free, the RA
> >> would need to spill something else to free up a temporary register.
> >> And it might then do the equivalent of (pseudo-code):
> >>
> >>   not p1.b, ..., p0.b
> >>   mov p0.d, p1.d
> >>
> >> after spilling what would otherwise have occupied p1.  In that
> >> situation it would be better use:
> >>
> >>   not p0.b, ..., p0.b
> >>
> >> and not introduce the spill of p1.
> >
> > I think I understood what Richi meant, but I thought it was already working 
> > that
> way.
> 
> The suggestion was to use matching constraints (like "0") though,
> whereas the patch doesn't.  I think your argument is that you don't
> need to use matching constraints.  But that's different from the
> suggestion (and from how we handle gathers).
> 
> I was going to say in response to patch 3 (but got distracted, sorry):
> I don't think we should have:
> 
>, Upa, ...
>Upa, Upa, ...
> 
> (taken from the pure logic ops) enabled at the same time.  Even though
> it works for the testcases, I don't think it has well-defined semantics.
> 
> The problem is that, taken on its own, the second alternative says that
> matching operands are free.  And fundamentally, I don't think the costs
> *must* take the earlyclobber alternative over the non-earlyclobber one
> (when costing during IRA, for instance).  In principle, the cheapest
> is best.
> 
> The aim of the gather approach is to make each alternative correct in
> isolation.  In:
> 
>   [, Z,   w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s]
>   [?w, Z,   0, Ui1, Ui1, Upl] ^
> 
> the second alternative says that it is possible to have operands 0
> and 2 be the same vector register, but using that version has the

RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.

2024-05-15 Thread Tamar Christina
> >> On Wed, May 15, 2024 at 12:29 PM Tamar Christina
> >>  wrote:
> >> >
> >> > Hi All,
> >> >
> >> > Some Neoverse Software Optimization Guides (SWoG) have a clause that 
> >> > state
> >> > that for predicated operations that also produce a predicate it is 
> >> > preferred
> >> > that the codegen should use a different register for the destination 
> >> > than that
> >> > of the input predicate in order to avoid a performance overhead.
> >> >
> >> > This of course has the problem that it increases register pressure and so
> should
> >> > be done with care.  Additionally not all micro-architectures have this
> >> > consideration and so it shouldn't be done as a default thing.
> >> >
> >> > The patch series adds support for doing conditional early clobbers 
> >> > through a
> >> > combination of new alternatives and attributes to control their 
> >> > availability.
> >>
> >> You could have two alternatives, one with early clobber and one with
> >> a matching constraint where you'd disparage the matching constraint one?
> >>
> >
> > Yeah, that's what I do, though there's no need to disparage the non-early 
> > clobber
> > alternative as the early clobber alternative will naturally get a penalty 
> > if it needs a
> > reload.
> 
> But I think Richard's suggestion was to disparage the one with a matching
> constraint (not the earlyclobber), to reflect the increased cost of
> reusing the register.
> 
> We did take that approach for gathers, e.g.:
> 
>  [&w, Z,   w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s]
>  [?w, Z,   0, Ui1, Ui1, Upl] ^
> 
> The (supposed) advantage is that, if register pressure is so tight
> that using matching registers is the only alternative, we still
> have the opportunity to do that, as a last resort.
> 
> Providing only an earlyclobber version means that using the same
> register is prohibited outright.  If no other register is free, the RA
> would need to spill something else to free up a temporary register.
> And it might then do the equivalent of (pseudo-code):
> 
>   not p1.b, ..., p0.b
>   mov p0.d, p1.d
> 
> after spilling what would otherwise have occupied p1.  In that
> situation it would be better to use:
> 
>   not p0.b, ..., p0.b
> 
> and not introduce the spill of p1.

I think I understood what Richi meant, but I thought it was already working 
that way.
i.e. as one of the testcases I had:

> aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 
> -ffixed-p[1-15]

foo:
mov z31.h, w0
ptrue   p0.b, all
cmplo   p0.h, p0/z, z0.h, z31.h
b   use

and reload did not force a spill.
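
For reference, pred-clobber.c itself is not shown in the thread.  A minimal
sketch that is consistent with the quoted assembly and with the
pred_clobber_1.c test added later in the series could look like the following;
the file and function names here are assumptions, not the actual testcase:

#include <stdint.h>
#include <arm_sve.h>

extern void use (svbool_t);

/* A predicated compare that also produces a predicate; on tuned cores the
   result should prefer a register other than the ptrue input.  */
void foo (svuint16_t a, uint16_t b)
{
  svbool_t p = svcmplt_n_u16 (svptrue_b16 (), a, b);
  use (p);
}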

My understanding of how this works, and how it seems to be working, is that
reload costs the alternatives from front to back; the cheapest one wins and it
stops evaluating the rest.

The early clobber case is first and preferred; however, when it's not
possible, i.e. it requires a non-pseudo reload, the reload cost is added to
the alternative.

However you're right that in the following testcase:

-mcpu=neoverse-n2 -ffixed-p1 -ffixed-p2 -ffixed-p3 -ffixed-p4 -ffixed-p5 
-ffixed-p6 -ffixed-p7 -ffixed-p8 -ffixed-p9 -ffixed-p10 -ffixed-p11 -ffixed-p12 
-ffixed-p12 -ffixed-p13 -ffixed-p14 -ffixed-p14 -fdump-rtl-reload

i.e. giving it an extra free register inexplicably causes a spill:

foo:
addvl   sp, sp, #-1
mov z31.h, w0
ptrue   p0.b, all
str p15, [sp]
cmplo   p15.h, p0/z, z0.h, z31.h
mov p0.b, p15.b
ldr p15, [sp]
addvl   sp, sp, #1
b   use

so that's unexpected and very weird, as p15 has no defined value.

Now adding the ? as suggested to the non-early clobber alternative does not
fix it, and my mental model for how this is supposed to work does not quite
line up.  Why would making the non-clobber alternative even more expensive
help during high register pressure?  With that suggestion the above case does
not get fixed, and the following case

-mcpu=neoverse-n2 -ffixed-p1 -ffixed-p2 -ffixed-p3 -ffixed-p4 -ffixed-p5 
-ffixed-p6 -ffixed-p7 -ffixed-p8 -ffixed-p9 -ffixed-p10 -ffixed-p11 -ffixed-p12 
-ffixed-p12 -ffixed-p13 -ffixed-p14 -ffixed-p15 -fdump-rtl-reload

ICEs:

pred-clobber.c: In function 'foo':
pred-clobber.c:9:1: error: unable to find a register to spill
9 | }
  | ^
pred-clobber.c:9:1: error: this is the insn:
(insn 10 22 19 2 (parallel [
(set (reg:VNx8BI 110 [104])
(unspec:VNx8BI [
(reg:VNx8BI 112)
(const_int 1 [0x1])
   

RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.

2024-05-15 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Wednesday, May 15, 2024 12:20 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; ktkac...@gcc.gnu.org; Richard Sandiford
> 
> Subject: Re: [PATCH 0/4]AArch64: support conditional early clobbers on certain
> operations.
> 
> On Wed, May 15, 2024 at 12:29 PM Tamar Christina
>  wrote:
> >
> > Hi All,
> >
> > Some Neoverse Software Optimization Guides (SWoG) have a clause that states
> > that for predicated operations that also produce a predicate it is preferred
> > that the codegen should use a different register for the destination than 
> > that
> > of the input predicate in order to avoid a performance overhead.
> >
> > This of course has the problem that it increases register pressure and so 
> > should
> > be done with care.  Additionally not all micro-architectures have this
> > consideration and so it shouldn't be done as a default thing.
> >
> > The patch series adds support for doing conditional early clobbers through a
> > combination of new alternatives and attributes to control their 
> > availability.
> 
> You could have two alternatives, one with early clobber and one with
> a matching constraint where you'd disparage the matching constraint one?
> 

Yeah, that's what I do, though there's no need to disparage the non-early 
clobber
alternative as the early clobber alternative will naturally get a penalty if it 
needs a
reload.

Cheers,
Tamar

> > On high register pressure we also use LRA's costing to prefer not to use the
> > alternative and instead just use the tie as this is preferable to a reload.
> >
> > Concretely this patch series does:
> >
> > > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2
> >
> > foo:
> > mov z31.h, w0
> > ptrue   p3.b, all
> > cmplo   p0.h, p3/z, z0.h, z31.h
> > b   use
> >
> > > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n1+sve
> >
> > foo:
> > mov z31.h, w0
> > ptrue   p0.b, all
> > cmplo   p0.h, p0/z, z0.h, z31.h
> > b   use
> >
> > > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 -
> ffixed-p[1-15]
> >
> > foo:
> > mov z31.h, w0
> > ptrue   p0.b, all
> > cmplo   p0.h, p0/z, z0.h, z31.h
> > b   use
> >
> > Testcases for the changes are in the last patch of the series.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Thanks,
> > Tamar
> >
> > ---
> >
> > --


RE: [PATCH 2/4]AArch64: add new tuning param and attribute for enabling conditional early clobber

2024-05-15 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Wednesday, May 15, 2024 11:56 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; ktkac...@gcc.gnu.org
> Subject: Re: [PATCH 2/4]AArch64: add new tuning param and attribute for
> enabling conditional early clobber
> 
> Tamar Christina  writes:
> > Hi All,
> >
> > This adds a new tuning parameter EARLY_CLOBBER_SVE_PRED_DEST for AArch64
> to
> > allow us to conditionally enable the early clobber alternatives based on the
> > tuning models.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-tuning-flags.def
> > (EARLY_CLOBBER_SVE_PRED_DEST): New.
> > * config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New.
> > * config/aarch64/aarch64.md (pred_clobber): New.
> > (arch_enabled): Use it.
> >
> > ---
> > diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def
> b/gcc/config/aarch64/aarch64-tuning-flags.def
> > index
> d5bcaebce770f0b217aac783063d39135f754c77..49fbad3ff28bc82b25c61ac50
> 1ccf533ec4b4c3f 100644
> > --- a/gcc/config/aarch64/aarch64-tuning-flags.def
> > +++ b/gcc/config/aarch64/aarch64-tuning-flags.def
> > @@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION
> ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
> >
> >  AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma",
> FULLY_PIPELINED_FMA)
> >
> > +/* Enable if the target prefers to use a fresh register for predicate outputs
> > +   rather than re-use an input predicate register.  */
> > +AARCH64_EXTRA_TUNING_OPTION ("early_clobber_sve_pred_dest",
> EARLY_CLOBBER_SVE_PRED_DEST)
> 
> Sorry for the bike-shedding, but how about something like "avoid_pred_rmw"?
> (I'm open to other suggestions.)  Just looking for something that describes
> either the architecture or the end result that we want to achieve.
> And preferable something fairly short :)
> 
> avoid_* would be consistent with the existing "avoid_cross_loop_fma".

Sure, happy to, it's something we initially struggled with naming internally
as well.
It sounds like there's precedent for the avoid_ naming, so happy to use it.

Will respin with it.

Thanks,
Tamar

> 
> > +
> >  #undef AARCH64_EXTRA_TUNING_OPTION
> > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> > index
> bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d5
> 6b46c74084ba7c3c 100644
> > --- a/gcc/config/aarch64/aarch64.h
> > +++ b/gcc/config/aarch64/aarch64.h
> > @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE =
> AARCH64_FL_SM_OFF;
> >  enabled through +gcs.  */
> >  #define TARGET_GCS (AARCH64_ISA_GCS)
> >
> > +/*  Prefer different predicate registers for the output of a predicated 
> > operation
> over
> > +re-using an existing input predicate.  */
> > +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
> > +&& (aarch64_tune_params.extra_tuning_flags \
> > +&
> AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST))
> >
> >  /* Standard register usage.  */
> >
> > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> > index
> dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a
> 53473b478c5ddba82 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string
> "any"))
> >  ;; target-independent code.
> >  (define_attr "is_call" "no,yes" (const_string "no"))
> >
> > +;; Indicates whether we want to enable the pattern with an optional early
> > +;; clobber for SVE predicates.
> > +(define_attr "pred_clobber" "no,yes" (const_string "no"))
> > +
> >  ;; [For compatibility with Arm in pipeline models]
> >  ;; Attribute that specifies whether or not the instruction touches fp
> >  ;; registers.
> > @@ -461,7 +465,8 @@ (define_attr "fp" "no,yes"
> >  (define_attr "arch_enabled" "no,yes"
> >(if_then_else
> >  (ior
> > -   (eq_attr "arch" "any")
> > +   (and (eq_attr "arch" "any")
> > +(eq_attr "pred_clobber" "no"))
> >
> > (and (eq_attr "arch" "rcpc8_4")
> >  (match_test "AARCH64_ISA_RCPC8_4"))
> > @@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes"
> >  (match_test "TARGET_SVE"))
> >
> > (and (eq_attr "arch" "sme")
> > -(match_test "TARGET_SME")))
> > +(match_test "TARGET_SME"))
> > +
> > +   (and (eq_attr "pred_clobber" "yes")
> > +(match_test "TARGET_SVE_PRED_CLOBBER")))
> 
> IMO it'd be better to handle pred_clobber separately from arch, as a new
> top-level AND:
> 
>   (and
> (ior
>   (eq_attr "pred_clobber" "no")
>   (match_test "!TARGET_..."))
> (ior
>   ...existing arch tests...))
> 
> Thanks,
> Richard


[PATCH 3/4]AArch64: add new alternative with early clobber to patterns

2024-05-15 Thread Tamar Christina
Hi All,

This patch adds new alternatives to the patterns which are affected.  The new
alternatives with the conditional early clobbers are added before the normal
ones in order for LRA to prefer them in the event that we have enough free
registers to accommodate them.

In case register pressure is too high the normal alternatives will be
preferred before a reload is considered, as we would rather have the tie than
a spill.

Tests are in the next patch.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (and3,
@aarch64_pred__z, *3_cc,
*3_ptest, aarch64_pred__z,
*3_cc, *3_ptest,
aarch64_pred__z, *3_cc,
*3_ptest, @aarch64_pred_cmp,
*cmp_cc, *cmp_ptest,
@aarch64_pred_cmp_wide,
*aarch64_pred_cmp_wide_cc,
*aarch64_pred_cmp_wide_ptest, @aarch64_brk,
*aarch64_brk_cc, *aarch64_brk_ptest,
@aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest,
*aarch64_brk_cc, *aarch64_brk_ptest,
aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest,
*aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber
alternative.
* config/aarch64/aarch64-sve2.md
(@aarch64_pred_): Likewise.

---
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
839ab0627747d7a49bef7b0192ee9e7a42587ca0..93ec59e58afee260b85082c472db2abfea7386b6
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1161,8 +1161,9 @@ (define_insn "aarch64_rdffr_z"
  (reg:VNx16BI FFRT_REGNUM)
  (match_operand:VNx16BI 1 "register_operand")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffr\t%0.b, %1/z
+  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
+ [ , Upa; yes ] rdffr\t%0.b, %1/z
+ [ Upa , Upa; *   ] ^
   }
 )
 
@@ -1179,8 +1180,9 @@ (define_insn "*aarch64_rdffr_z_ptest"
  UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1  , 2 ]
- [ Upa , Upa,   ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  , 2; attrs: pred_clobber ]
+ [ , Upa,  ; yes ] rdffrs\t%0.b, %1/z
+ [ Upa , Upa,  ; *   ] ^
   }
 )
 
@@ -1195,8 +1197,9 @@ (define_insn "*aarch64_rdffr_ptest"
  UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1   ]
- [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  ; attrs: pred_clobber ]
+ [ , Upa; yes ] rdffrs\t%0.b, %1/z
+ [ Upa , Upa; *   ] ^
   }
 )
 
@@ -1216,8 +1219,9 @@ (define_insn "*aarch64_rdffr_z_cc"
  (reg:VNx16BI FFRT_REGNUM)
  (match_dup 1)))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1  , 2 ]
- [ Upa , Upa,   ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  , 2; attrs: pred_clobber ]
+ [ , Upa,  ; yes ] rdffrs\t%0.b, %1/z
+ [ Upa , Upa,  ; *   ] ^
   }
 )
 
@@ -1233,8 +1237,9 @@ (define_insn "*aarch64_rdffr_cc"
(set (match_operand:VNx16BI 0 "register_operand")
(reg:VNx16BI FFRT_REGNUM))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  {@ [ cons: =0, 1  , 2 ]
- [ Upa , Upa,   ] rdffrs\t%0.b, %1/z
+  {@ [ cons: =0, 1  , 2; attrs: pred_clobber ]
+ [ , Upa,  ; yes ] rdffrs\t%0.b, %1/z
+ [ Upa , Upa,  ; *   ] ^
   }
 )
 
@@ -6651,8 +6656,9 @@ (define_insn "and3"
(and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand")
  (match_operand:PRED_ALL 2 "register_operand")))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1  , 2   ]
- [ Upa , Upa, Upa ] and\t%0.b, %1/z, %2.b, %2.b
+  {@ [ cons: =0, 1  , 2  ; attrs: pred_clobber ]
+ [ , Upa, Upa; yes ] and\t%0.b, %1/z, %2.b, %2.b
+ [ Upa , Upa, Upa; *   ] ^
   }
 )
 
@@ -6679,8 +6685,9 @@ (define_insn "@aarch64_pred__z"
(match_operand:PRED_ALL 3 "register_operand"))
  (match_operand:PRED_ALL 1 "register_operand")))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1  , 2  , 3   ]
- [ Upa , Upa, Upa, Upa ] \t%0.b, %1/z, %2.b, %3.b
+  {@ [ cons: =0, 1  , 2  , 3  ; attrs: pred_clobber ]
+ [ , Upa, Upa, Upa; yes ] \t%0.b, %1/z, 
%2.b, %3.b
+ [ Upa , Upa, Upa, Upa; *   ] ^
   }
 )
 
@@ -6703,8 +6710,9 @@ (define_insn "*3_cc"
(and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3))
  (match_dup 4)))]
   "TARGET_SVE"
-  {@ [ cons: =0, 1  , 2  , 3  , 4, 5 ]
- [ Upa , Upa, Upa, Upa,  ,   ] s\t%0.b, %1/z, %2.b, %3.b
+  {@ [ cons: =0, 1  , 2  , 3  , 4, 5; attrs: pred_clobber ]
+ [ , Upa, Upa, Upa,  ,  ; yes ] s\t%0.b, 
%1/z, %2.b, %3.b
+  

[PATCH 4/4]AArch64: enable new predicate tuning for Neoverse cores.

2024-05-15 Thread Tamar Christina
Hi All,

This enables the new tuning flag for Neoverse V1, Neoverse V2 and Neoverse N2.
It is kept off for generic codegen.

Note the reason for the +sve even though the tests are in aarch64-sve.exp is
that if the testsuite is run with a forced SVE-off option, e.g.
-march=armv8-a+nosve, then the intrinsics end up being disabled because the
-march is preferred over the -mcpu even though the -mcpu comes later.

This prevents the tests from failing in such runs.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/tuning_models/neoversen2.h (neoversen2_tunings): Add
AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST.
* config/aarch64/tuning_models/neoversev1.h (neoversev1_tunings): Add
AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST.
* config/aarch64/tuning_models/neoversev2.h (neoversev2_tunings): Add
AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/pred_clobber_1.c: New test.
* gcc.target/aarch64/sve/pred_clobber_2.c: New test.
* gcc.target/aarch64/sve/pred_clobber_3.c: New test.
* gcc.target/aarch64/sve/pred_clobber_4.c: New test.
* gcc.target/aarch64/sve/pred_clobber_5.c: New test.

---
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h 
b/gcc/config/aarch64/tuning_models/neoversen2.h
index 
7e799bbe762fe862e31befed50e54040a7fd1f2f..0d8f3f6be67f3583b00473bef97ea3ae4fcea4ec
 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -236,7 +236,8 @@ static const struct tune_params neoversen2_tunings =
   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST),  /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS   /* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h 
b/gcc/config/aarch64/tuning_models/neoversev1.h
index 
9363f2ad98a5279cc99f2f9b1509ba921d582e84..d28d0b1c0498ed250b0a93ca69720fe10c65c93d
 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -227,7 +227,8 @@ static const struct tune_params neoversev1_tunings =
   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
-   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),   /* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
+   | AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST),  /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS/* stp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h 
b/gcc/config/aarch64/tuning_models/neoversev2.h
index 
bc01ed767c9b690504eb98456402df5d9d64eee3..3b2f9797bd777e73ca9c21501fa97448d96cb65e
 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -236,7 +236,8 @@ static const struct tune_params neoversev2_tunings =
   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
-   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags.  */
+   | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
+   | AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST),  /* tune_flags.  */
   _prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
   AARCH64_LDP_STP_POLICY_ALWAYS   /* stp_policy_model.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c
new file mode 100644
index 
..934a00a38531c5fd4139d99ff33414904b2c104f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=neoverse-n2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC target "+sve"
+
+#include 
+
+extern void use(svbool_t);
+
+/*
+** foo:
+** ...
+** ptrue   p([1-9][0-9]?).b, all
+** cmplo   p0.h, p\1/z, z0.h, z[0-9]+.h
+** ...
+*/
+void foo (svuint16_t a, uint16_t b)
+{
+svbool_t p0 = svcmplt_n_u16 (svptrue_b16 (), a, b);
+use (p0);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c
new file mode 100644
index 
..58badb66a43b1ac50eeec153b9cac44fc831b145
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c
@@ -0,0 +1,22 @@
+/* { dg-do compile 

[PATCH 1/4]AArch64: convert several predicate patterns to new compact syntax

2024-05-15 Thread Tamar Christina
Hi All,

This converts the single alternative patterns to the new compact syntax such
that when I add the new alternatives it's clearer what's being changed.

Note that this will spew out a bunch of warnings from geninsn as it'll warn that
@ is useless for a single alternative pattern.  These are not fatal so won't
break the build and are only temporary.

No change in functionality is expected with this patch.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (and3,
@aarch64_pred__z, *3_cc,
*3_ptest, aarch64_pred__z,
*3_cc, *3_ptest,
aarch64_pred__z, *3_cc,
*3_ptest, *cmp_ptest,
@aarch64_pred_cmp_wide,
*aarch64_pred_cmp_wide_cc,
*aarch64_pred_cmp_wide_ptest, *aarch64_brk_cc,
*aarch64_brk_ptest, @aarch64_brk, *aarch64_brkn_cc,
*aarch64_brkn_ptest, *aarch64_brk_cc,
*aarch64_brk_ptest, aarch64_rdffr_z, *aarch64_rdffr_z_ptest,
*aarch64_rdffr_ptest, *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Convert
to compact syntax.
* config/aarch64/aarch64-sve2.md
(@aarch64_pred_): Likewise.

---
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
0434358122d2fde71bd0e0f850338e739e9be02c..839ab0627747d7a49bef7b0192ee9e7a42587ca0
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1156,76 +1156,86 @@ (define_insn "aarch64_rdffr"
 
 ;; Likewise with zero predication.
 (define_insn "aarch64_rdffr_z"
-  [(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+  [(set (match_operand:VNx16BI 0 "register_operand")
(and:VNx16BI
  (reg:VNx16BI FFRT_REGNUM)
- (match_operand:VNx16BI 1 "register_operand" "Upa")))]
+ (match_operand:VNx16BI 1 "register_operand")))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffr\t%0.b, %1/z"
+  {@ [ cons: =0, 1   ]
+ [ Upa , Upa ] rdffr\t%0.b, %1/z
+  }
 )
 
 ;; Read the FFR to test for a fault, without using the predicate result.
 (define_insn "*aarch64_rdffr_z_ptest"
   [(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
- [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ [(match_operand:VNx16BI 1 "register_operand")
   (match_dup 1)
   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
   (and:VNx16BI
 (reg:VNx16BI FFRT_REGNUM)
 (match_dup 1))]
  UNSPEC_PTEST))
-   (clobber (match_scratch:VNx16BI 0 "=Upa"))]
+   (clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffrs\t%0.b, %1/z"
+  {@ [ cons: =0, 1  , 2 ]
+ [ Upa , Upa,   ] rdffrs\t%0.b, %1/z
+  }
 )
 
 ;; Same for unpredicated RDFFR when tested with a known PTRUE.
 (define_insn "*aarch64_rdffr_ptest"
   [(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
- [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ [(match_operand:VNx16BI 1 "register_operand")
   (match_dup 1)
   (const_int SVE_KNOWN_PTRUE)
   (reg:VNx16BI FFRT_REGNUM)]
  UNSPEC_PTEST))
-   (clobber (match_scratch:VNx16BI 0 "=Upa"))]
+   (clobber (match_scratch:VNx16BI 0))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffrs\t%0.b, %1/z"
+  {@ [ cons: =0, 1   ]
+ [ Upa , Upa ] rdffrs\t%0.b, %1/z
+  }
 )
 
 ;; Read the FFR with zero predication and test the result.
 (define_insn "*aarch64_rdffr_z_cc"
   [(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
- [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ [(match_operand:VNx16BI 1 "register_operand")
   (match_dup 1)
   (match_operand:SI 2 "aarch64_sve_ptrue_flag")
   (and:VNx16BI
 (reg:VNx16BI FFRT_REGNUM)
 (match_dup 1))]
  UNSPEC_PTEST))
-   (set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+   (set (match_operand:VNx16BI 0 "register_operand")
(and:VNx16BI
  (reg:VNx16BI FFRT_REGNUM)
  (match_dup 1)))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffrs\t%0.b, %1/z"
+  {@ [ cons: =0, 1  , 2 ]
+ [ Upa , Upa,   ] rdffrs\t%0.b, %1/z
+  }
 )
 
 ;; Same for unpredicated RDFFR when tested with a known PTRUE.
 (define_insn "*aarch64_rdffr_cc"
   [(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
- [(match_operand:VNx16BI 1 "register_operand" "Upa")
+ [(match_operand:VNx16BI 1 "register_operand")
   (match_dup 1)
   (const_int SVE_KNOWN_PTRUE)
   (reg:VNx16BI FFRT_REGNUM)]
  UNSPEC_PTEST))
-   (set (match_operand:VNx16BI 0 "register_operand" "=Upa")
+   (set (match_operand:VNx16BI 0 "register_operand")
(reg:VNx16BI FFRT_REGNUM))]
   "TARGET_SVE && TARGET_NON_STREAMING"
-  "rdffrs\t%0.b, %1/z"
+  {@ [ cons: =0, 1  , 2 ]
+ [ Upa , Upa,   ] rdffrs\t%0.b, %1/z
+  }
 )
 
 ;; [R3 in the block comment above about FFR handling]
@@ -6637,11 +6647,13 @@ (define_insn 

[PATCH 2/4]AArch64: add new tuning param and attribute for enabling conditional early clobber

2024-05-15 Thread Tamar Christina
Hi All,

This adds a new tuning parameter EARLY_CLOBBER_SVE_PRED_DEST for AArch64 to
allow us to conditionally enable the early clobber alternatives based on the
tuning models.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-tuning-flags.def
(EARLY_CLOBBER_SVE_PRED_DEST): New.
* config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New.
* config/aarch64/aarch64.md (pred_clobber): New.
(arch_enabled): Use it.

---
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index 
d5bcaebce770f0b217aac783063d39135f754c77..49fbad3ff28bc82b25c61ac501ccf533ec4b4c3f
 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", 
AVOID_CROSS_LOOP_FMA)
 
 AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA)
 
+/* Enable if the target prefers to use a fresh register for predicate outputs
+   rather than re-use an input predicate register.  */
+AARCH64_EXTRA_TUNING_OPTION ("early_clobber_sve_pred_dest", 
EARLY_CLOBBER_SVE_PRED_DEST)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d56b46c74084ba7c3c
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = 
AARCH64_FL_SM_OFF;
 enabled through +gcs.  */
 #define TARGET_GCS (AARCH64_ISA_GCS)
 
+/*  Prefer different predicate registers for the output of a predicated 
operation over
+re-using an existing input predicate.  */
+#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \
+&& (aarch64_tune_params.extra_tuning_flags \
+& 
AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST))
 
 /* Standard register usage.  */
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a53473b478c5ddba82
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string "any"))
 ;; target-independent code.
 (define_attr "is_call" "no,yes" (const_string "no"))
 
+;; Indicates whether we want to enable the pattern with an optional early
+;; clobber for SVE predicates.
+(define_attr "pred_clobber" "no,yes" (const_string "no"))
+
 ;; [For compatibility with Arm in pipeline models]
 ;; Attribute that specifies whether or not the instruction touches fp
 ;; registers.
@@ -461,7 +465,8 @@ (define_attr "fp" "no,yes"
 (define_attr "arch_enabled" "no,yes"
   (if_then_else
 (ior
-   (eq_attr "arch" "any")
+   (and (eq_attr "arch" "any")
+(eq_attr "pred_clobber" "no"))
 
(and (eq_attr "arch" "rcpc8_4")
 (match_test "AARCH64_ISA_RCPC8_4"))
@@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes"
 (match_test "TARGET_SVE"))
 
(and (eq_attr "arch" "sme")
-(match_test "TARGET_SME")))
+(match_test "TARGET_SME"))
+
+   (and (eq_attr "pred_clobber" "yes")
+(match_test "TARGET_SVE_PRED_CLOBBER")))
 (const_string "yes")
 (const_string "no")))
 




-- 

[PATCH 0/4]AArch64: support conditional early clobbers on certain operations.

2024-05-15 Thread Tamar Christina
Hi All,

Some Neoverse Software Optimization Guides (SWoG) have a clause that states
that for predicated operations that also produce a predicate it is preferred
that the codegen should use a different register for the destination than that
of the input predicate in order to avoid a performance overhead.

This of course has the problem that it increases register pressure and so should
be done with care.  Additionally not all micro-architectures have this
consideration and so it shouldn't be done as a default thing.

The patch series adds support for doing conditional early clobbers through a
combination of new alternatives and attributes to control their availability.

On high register pressure we also use LRA's costing to prefer not to use the
alternative and instead just use the tie as this is preferable to a reload.

Concretely this patch series does:

> aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2

foo:
mov z31.h, w0
ptrue   p3.b, all
cmplo   p0.h, p3/z, z0.h, z31.h
b   use

> aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n1+sve

foo:
mov z31.h, w0
ptrue   p0.b, all
cmplo   p0.h, p0/z, z0.h, z31.h
b   use

> aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 
> -ffixed-p[1-15]

foo:
mov z31.h, w0
ptrue   p0.b, all
cmplo   p0.h, p0/z, z0.h, z31.h
b   use

Testcases for the changes are in the last patch of the series.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Thanks,
Tamar

---

-- 


RE: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

2024-05-15 Thread Tamar Christina
Hi Pan,

Thanks!

> -Original Message-
> From: pan2...@intel.com 
> Sent: Wednesday, May 15, 2024 3:14 AM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> ; richard.guent...@gmail.com;
> hongtao@intel.com; Pan Li 
> Subject: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned 
> scalar
> int
> 
> From: Pan Li 
> 
> This patch would like to add the middle-end presentation for the
> saturation add.  Aka set the result of add to the max when overflow.
> It will take the pattern similar as below.
> 
> SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> 
> Take uint8_t as example, we will have:
> 
> * SAT_ADD (1, 254)   => 255.
> * SAT_ADD (1, 255)   => 255.
> * SAT_ADD (2, 255)   => 255.
> * SAT_ADD (255, 255) => 255.
> 
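
As a quick standalone check of the uint8_t values listed above (not part of
the original patch; the helper name sat_add_u8 is made up here purely for
illustration):

#include <stdint.h>
#include <stdio.h>

/* Branchless unsigned saturating add, following the pattern above:
   (x + y) | (-(TYPE)((TYPE)(x + y) < x)).  */
static uint8_t sat_add_u8 (uint8_t x, uint8_t y)
{
  return (uint8_t) (x + y) | (uint8_t) -(uint8_t) ((uint8_t) (x + y) < x);
}

int main (void)
{
  /* Prints 255 255 255 255, matching the four cases above.  */
  printf ("%u %u %u %u\n",
          (unsigned) sat_add_u8 (1, 254), (unsigned) sat_add_u8 (1, 255),
          (unsigned) sat_add_u8 (2, 255), (unsigned) sat_add_u8 (255, 255));
  return 0;
}
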
> Given below example for the unsigned scalar integer uint64_t:
> 
> uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> {
>   return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> }
> 
> Before this patch:
> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> {
>   long unsigned int _1;
>   _Bool _2;
>   long unsigned int _3;
>   long unsigned int _4;
>   uint64_t _7;
>   long unsigned int _10;
>   __complex__ long unsigned int _11;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
>   _1 = REALPART_EXPR <_11>;
>   _10 = IMAGPART_EXPR <_11>;
>   _2 = _10 != 0;
>   _3 = (long unsigned int) _2;
>   _4 = -_3;
>   _7 = _1 | _4;
>   return _7;
> ;;succ:   EXIT
> 
> }
> 
> After this patch:
> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> {
>   uint64_t _7;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
>   return _7;
> ;;succ:   EXIT
> }
> 
> The below tests are passed for this patch:
> 1. The riscv fully regression tests.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
> 
>   PR target/51492
>   PR target/112600
> 
> gcc/ChangeLog:
> 
>   * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD
>   to the return true switch case(s).
>   * internal-fn.def (SAT_ADD):  Add new signed optab SAT_ADD.
>   * match.pd: Add unsigned SAT_ADD match(es).
>   * optabs.def (OPTAB_NL): Remove fixed-point limitation for
>   us/ssadd.
>   * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New
>   extern func decl generated in match.pd match.
>   (match_saturation_arith): New func impl to match the saturation arith.
>   (math_opts_dom_walker::after_dom_children): Try match saturation
>   arith when IOR expr.
> 

LGTM, but you'll need an OK from Richard.

Thanks for working on this!

Tamar

> Signed-off-by: Pan Li 
> ---
>  gcc/internal-fn.cc|  1 +
>  gcc/internal-fn.def   |  2 ++
>  gcc/match.pd  | 51 +++
>  gcc/optabs.def|  4 +--
>  gcc/tree-ssa-math-opts.cc | 32 
>  5 files changed, 88 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 0a7053c2286..73045ca8c8c 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn)
>  case IFN_UBSAN_CHECK_MUL:
>  case IFN_ADD_OVERFLOW:
>  case IFN_MUL_OVERFLOW:
> +case IFN_SAT_ADD:
>  case IFN_VEC_WIDEN_PLUS:
>  case IFN_VEC_WIDEN_PLUS_LO:
>  case IFN_VEC_WIDEN_PLUS_HI:
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 848bb9dbff3..25badbb86e5 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST
> | ECF_NOTHROW, first,
>  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> first,
> smulhrs, umulhrs, binary)
> 
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd,
> binary)
> +
>  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
>  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
>  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 07e743ae464..0f9c34fa897 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3043,6 +3043,57 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> || POINTER_TYPE_P (itype))
>&& wi::eq_p (wi::to_wide (int_cst), wi::max_value (itype))
> 
> +/* Unsigned Saturation Add */
> +(match (usadd_left_part_1 @0 @1)
> + (plus:c @0 @1)
> + (if (INTEG

RE: [PATCH v1 1/3] Vect: Support loop len in vectorizable early exit

2024-05-13 Thread Tamar Christina
> -Original Message-
> From: pan2...@intel.com 
> Sent: Monday, May 13, 2024 3:54 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com;
> Tamar Christina ; Richard Sandiford
> ; Pan Li 
> Subject: [PATCH v1 1/3] Vect: Support loop len in vectorizable early exit
> 
> From: Pan Li 
> 
> This patch adds early break auto-vectorization support for target which
> use length on partial vectorization.  Consider this following example:
> 
> unsigned vect_a[802];
> unsigned vect_b[802];
> 
> void test (unsigned x, int n)
> {
>   for (int i = 0; i < n; i++)
>   {
>     vect_b[i] = x + i;
> 
>     if (vect_a[i] > x)
>       break;
> 
>     vect_a[i] = x;
>   }
> }
> 
> We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias).
> And then the IR of RVV looks like below:
> 
>   ...
>   _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]);
>   _55 = (int) _87;
>   ...
>   mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67;
>   vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \
> {0, ... }, _87, 0);
>   if (vec_len_mask_72 != { 0, ... })
>     goto ; [5.50%]
>   else
>     goto ; [94.50%]
> 
> The below tests are passed for this patch:
> 1. The riscv fully regression tests.
> 2. The aarch64 fully regression tests.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
> 
> gcc/ChangeLog:
> 
>   * tree-vect-stmts.cc (vectorizable_early_exit): Add loop len
>   handling for one or multiple stmt.
> 
> Signed-off-by: Pan Li 
> ---
>  gcc/tree-vect-stmts.cc | 47 -
> -
>  1 file changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 21e8fe98e44..bfd9d66568f 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -12896,7 +12896,9 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>  ncopies = vect_get_num_copies (loop_vinfo, vectype);
> 
>   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
> +  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
>bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> +  bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo);
> 
>/* Now build the new conditional.  Pattern gimple_conds get dropped during
>   codegen so we must replace the original insn.  */
> @@ -12960,12 +12962,11 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
>   {
> if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype,
> OPTIMIZE_FOR_SPEED))
> - return false;
> + vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1);
> else
>   vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL);
>   }
> 
> -
>return true;
>  }
> 
> @@ -13018,6 +13019,25 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
> stmts[i], _gsi);
>   workset.quick_push (stmt_mask);
> }
> +  else if (len_loop_p)
> + for (unsigned i = 0; i < stmts.length (); i++)
> +   {
> + tree all_ones_mask = build_all_ones_cst (vectype);
> + tree all_zero_mask = build_zero_cst (vectype);
> + tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies,
> +   vectype, i, 1);
> + signed char cst = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS
> (loop_vinfo);
> + tree bias = build_int_cst (intQI_type_node, cst);
> + tree len_mask
> +   = make_temp_ssa_name (TREE_TYPE (stmts[i]), NULL,
> "vec_len_mask");
> + gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5,
> +   stmts[i], all_ones_mask,
> +   all_zero_mask, len, bias);
> + gimple_call_set_lhs (call, len_mask);
> + gsi_insert_before (_gsi, call, GSI_SAME_STMT);
> +
> + workset.quick_push (len_mask);
> +   }
>else
>   workset.splice (stmts);
> 
> @@ -13042,6 +13062,29 @@ vectorizable_early_exit (vec_info *vinfo,
> stmt_vec_info stmt_info,
> new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
>  new_temp, _gsi);
>   }
> +  else if (len_loop_p)
> + {
> +   /* len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
> +
> +  which is equivalent to:
> +
> +  len_mask

RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

2024-05-13 Thread Tamar Christina
> 
> Thanks Tamar for the comments.
> 
> > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when
> optimizing for size.
> 
> Sure thing, let me update it in v5.
> 
> > Hmm why do you iterate independently over the statements? The block below
> already visits
> > Every statement doesn't it?
> 
> Because it will hit .ADD_OVERFLOW first, then it will never hit SAT_ADD as the
> shape changed, or shall we put it to the previous pass ?
> 

That's just a matter of matching the overflow as an additional case, no?
i.e. you can add an overload for unsigned_integer_sat_add matching
IFN_ADD_OVERFLOW and using the realpart and imagpart helpers.

I think that would be better as it avoids visiting all the statements twice but 
also
extends the matching to some __builtin_add_overflow uses and should be fairly
simple.

> > The root of your match is a BIT_IOR_EXPR expression, so I think you just 
> > need to
> change the entry below to:
> >
> > case BIT_IOR_EXPR:
> >   match_saturation_arith (, stmt, m_cfg_changed_p);
> >   /* fall-through */
> > case BIT_XOR_EXPR:
> >   match_uaddc_usubc (, stmt, code);
> >   break;
> 
> There are other shapes (not covered in this patch) of SAT_ADD like below 
> branch
> version, the IOR should be one of the ROOT. Thus doesn't
> add case here.  Then, shall we take case for each shape here ? Both works for 
> me.
> 

Yeah, I think that's better than iterating over the statements twice.  It also 
fits better
in the existing code.

Tamar.

> #define SAT_ADD_U_1(T) \
> T sat_add_u_1_##T(T x, T y) \
> { \
>   return (T)(x + y) >= x ? (x + y) : -1; \
> }
> 
> SAT_ADD_U_1(uint32_t)
> 
> Pan
> 
> 
> -Original Message-
> From: Tamar Christina 
> Sent: Monday, May 13, 2024 5:10 PM
> To: Li, Pan2 ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com;
> Liu, Hongtao 
> Subject: RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned
> scalar int
> 
> Hi Pan,
> 
> > -Original Message-
> > From: pan2...@intel.com 
> > Sent: Monday, May 6, 2024 3:48 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> > ; richard.guent...@gmail.com;
> > hongtao@intel.com; Pan Li 
> > Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned
> scalar
> > int
> >
> > From: Pan Li 
> >
> > This patch would like to add the middle-end presentation for the
> > saturation add.  Aka set the result of add to the max when overflow.
> > It will take the pattern similar as below.
> >
> > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> >
> > Take uint8_t as example, we will have:
> >
> > * SAT_ADD (1, 254)   => 255.
> > * SAT_ADD (1, 255)   => 255.
> > * SAT_ADD (2, 255)   => 255.
> > * SAT_ADD (255, 255) => 255.
> >
> > Given below example for the unsigned scalar integer uint64_t:
> >
> > uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> > {
> >   return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> > }
> >
> > Before this patch:
> > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> > {
> >   long unsigned int _1;
> >   _Bool _2;
> >   long unsigned int _3;
> >   long unsigned int _4;
> >   uint64_t _7;
> >   long unsigned int _10;
> >   __complex__ long unsigned int _11;
> >
> > ;;   basic block 2, loop depth 0
> > ;;pred:   ENTRY
> >   _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
> >   _1 = REALPART_EXPR <_11>;
> >   _10 = IMAGPART_EXPR <_11>;
> >   _2 = _10 != 0;
> >   _3 = (long unsigned int) _2;
> >   _4 = -_3;
> >   _7 = _1 | _4;
> >   return _7;
> > ;;succ:   EXIT
> >
> > }
> >
> > After this patch:
> > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> > {
> >   uint64_t _7;
> >
> > ;;   basic block 2, loop depth 0
> > ;;pred:   ENTRY
> >   _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
> >   return _7;
> > ;;succ:   EXIT
> > }
> >
> > We perform the tranform during widen_mult because that the sub-expr of
> > SAT_ADD will be optimized to .ADD_OVERFLOW.  We need to try the .SAT_ADD
> > pattern first and then .ADD_OVERFLOW,  or we may never catch the pattern
> > .SAT_ADD.  Meanwhile, the isel pass is after widen_mult and then we
> > cannot perform the .SAT_ADD pattern match as the

RE: [PATCH] Allow patterns in SLP reductions

2024-05-13 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Friday, May 10, 2024 2:07 PM
> To: Richard Biener 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] Allow patterns in SLP reductions
> 
> On Fri, Mar 1, 2024 at 10:21 AM Richard Biener  wrote:
> >
> > The following removes the over-broad rejection of patterns for SLP
> > reductions which is done by removing them from LOOP_VINFO_REDUCTIONS
> > during pattern detection.  That's also insufficient in case the
> > pattern only appears on the reduction path.  Instead this implements
> > the proper correctness check in vectorizable_reduction and guides
> > SLP discovery to heuristically avoid forming later invalid groups.
> >
> > I also couldn't find any testcase that FAILs when allowing the SLP
> > reductions to form so I've added one.
> >
> > I came across this for single-lane SLP reductions with the all-SLP
> > work where we rely on patterns to properly vectorize COND_EXPR
> > reductions.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, queued for stage1.
> 
> Re-bootstrapped/tested, r15-361-g52d4691294c847

Awesome!

Does this now allow us to write new reductions using patterns? i.e. widening 
reductions?

Cheers,
Tamar
> 
> Richard.
> 
> > Richard.
> >
> > * tree-vect-patterns.cc (vect_pattern_recog_1): Do not
> > remove reductions involving patterns.
> > * tree-vect-loop.cc (vectorizable_reduction): Reject SLP
> > reduction groups with multiple lane-reducing reductions.
> > * tree-vect-slp.cc (vect_analyze_slp_instance): When discovering
> > SLP reduction groups avoid including lane-reducing ones.
> >
> > * gcc.dg/vect/vect-reduc-sad-9.c: New testcase.
> > ---
> >  gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c | 68 
> >  gcc/tree-vect-loop.cc| 15 +
> >  gcc/tree-vect-patterns.cc| 13 
> >  gcc/tree-vect-slp.cc | 26 +---
> >  4 files changed, 101 insertions(+), 21 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c
> >
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c
> b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c
> > new file mode 100644
> > index 000..3c6af4510f4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c
> > @@ -0,0 +1,68 @@
> > +/* Disabling epilogues until we find a better way to deal with scans.  */
> > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
> > +/* { dg-additional-options "-msse4.2" { target { x86_64-*-* i?86-*-* } } } 
> > */
> > +/* { dg-require-effective-target vect_usad_char } */
> > +
> > +#include 
> > +#include "tree-vect.h"
> > +
> > +#define N 64
> > +
> > +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
> > +unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
> > +int abs (int);
> > +
> > +/* Sum of absolute differences between arrays of unsigned char types.
> > +   Detected as a sad pattern.
> > +   Vectorized on targets that support sad for unsigned chars.  */
> > +
> > +__attribute__ ((noinline)) int
> > +foo (int len, int *res2)
> > +{
> > +  int i;
> > +  int result = 0;
> > +  int result2 = 0;
> > +
> > +  for (i = 0; i < len; i++)
> > +{
> > +  /* Make sure we are not using an SLP reduction for this.  */
> > +  result += abs (X[2*i] - Y[2*i]);
> > +  result2 += abs (X[2*i + 1] - Y[2*i + 1]);
> > +}
> > +
> > +  *res2 = result2;
> > +  return result;
> > +}
> > +
> > +
> > +int
> > +main (void)
> > +{
> > +  int i;
> > +  int sad;
> > +
> > +  check_vect ();
> > +
> > +  for (i = 0; i < N/2; i++)
> > +{
> > +  X[2*i] = i;
> > +  Y[2*i] = N/2 - i;
> > +  X[2*i+1] = i;
> > +  Y[2*i+1] = 0;
> > +  __asm__ volatile ("");
> > +}
> > +
> > +
> > +  int sad2;
> > +  sad = foo (N/2, &sad2);
> > +  if (sad != (N/2)*(N/4))
> > +abort ();
> > +  if (sad2 != (N/2-1)*(N/2)/2)
> > +abort ();
> > +
> > +  return 0;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } 
> > } */
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> > +
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index 35f1f8c7d42..13dcdba403a 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -7703,6 +7703,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
> >return false;
> >  }
> >
> > +  /* Lane-reducing ops also never can be used in a SLP reduction group
> > + since we'll mix lanes belonging to different reductions.  But it's
> > + OK to use them in a reduction chain or when the reduction group
> > + has just one element.  */
> > +  if (lane_reduc_code_p
> > +  && slp_node
> > +  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
> > +  && SLP_TREE_LANES (slp_node) > 1)
> > +{
> > +  if (dump_enabled_p ())
> > +   dump_printf_loc 

RE: [PATCH v4 2/3] VECT: Support new IFN SAT_ADD for unsigned vector int

2024-05-13 Thread Tamar Christina
Hi Pan,

> -Original Message-
> From: pan2...@intel.com 
> Sent: Monday, May 6, 2024 3:49 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> ; richard.guent...@gmail.com;
> hongtao@intel.com; Pan Li 
> Subject: [PATCH v4 2/3] VECT: Support new IFN SAT_ADD for unsigned vector int
> 
> From: Pan Li 
> 
> This patch depends on below scalar enabling patch:
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2024-May/650822.html
> 
> For vectorize, we leverage the existing vect pattern recog to find
> the pattern similar to scalar and let the vectorizer to perform
> the rest part for standard name usadd3 in vector mode.
> The riscv vector backend have insn "Vector Single-Width Saturating
> Add and Subtract" which can be leveraged when expand the usadd3
> in vector mode.  For example:
> 
> void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
> {
>   unsigned i;
> 
>   for (i = 0; i < n; i++)
> out[i] = (x[i] + y[i]) | (- (uint64_t)((uint64_t)(x[i] + y[i]) < x[i]));
> }
> 
> Before this patch:
> void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
> {
>   ...
>   _80 = .SELECT_VL (ivtmp_78, POLY_INT_CST [2, 2]);
>   ivtmp_58 = _80 * 8;
>   vect__4.7_61 = .MASK_LEN_LOAD (vectp_x.5_59, 64B, { -1, ... }, _80, 0);
>   vect__6.10_65 = .MASK_LEN_LOAD (vectp_y.8_63, 64B, { -1, ... }, _80, 0);
>   vect__7.11_66 = vect__4.7_61 + vect__6.10_65;
>   mask__8.12_67 = vect__4.7_61 > vect__7.11_66;
>   vect__12.15_72 = .VCOND_MASK (mask__8.12_67, { 18446744073709551615,
> ... }, vect__7.11_66);
>   .MASK_LEN_STORE (vectp_out.16_74, 64B, { -1, ... }, _80, 0, vect__12.15_72);
>   vectp_x.5_60 = vectp_x.5_59 + ivtmp_58;
>   vectp_y.8_64 = vectp_y.8_63 + ivtmp_58;
>   vectp_out.16_75 = vectp_out.16_74 + ivtmp_58;
>   ivtmp_79 = ivtmp_78 - _80;
>   ...
> }
> 
> After this patch:
> void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
> {
>   ...
>   _62 = .SELECT_VL (ivtmp_60, POLY_INT_CST [2, 2]);
>   ivtmp_46 = _62 * 8;
>   vect__4.7_49 = .MASK_LEN_LOAD (vectp_x.5_47, 64B, { -1, ... }, _62, 0);
>   vect__6.10_53 = .MASK_LEN_LOAD (vectp_y.8_51, 64B, { -1, ... }, _62, 0);
>   vect__12.11_54 = .SAT_ADD (vect__4.7_49, vect__6.10_53);
>   .MASK_LEN_STORE (vectp_out.12_56, 64B, { -1, ... }, _62, 0, vect__12.11_54);
>   ...
> }
> 
> The below test suites are passed for this patch.
> * The riscv fully regression tests.
> * The aarch64 fully regression tests.
> * The x86 bootstrap tests.
> * The x86 fully regression tests.
> 
>   PR target/51492
>   PR target/112600
> 
> gcc/ChangeLog:
> 
>   * tree-vect-patterns.cc (gimple_unsigned_integer_sat_add): New func
>   decl generated by match.pd match.
>   (vect_recog_sat_add_pattern): New func impl to recog the pattern
>   for unsigned SAT_ADD.
> 
> Signed-off-by: Pan Li 

Patch looks good to me, but I cannot approve so I'll pass it on to Richi.

Cheers,
Tamar

> ---
>  gcc/tree-vect-patterns.cc | 51 +++
>  1 file changed, 51 insertions(+)
> 
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 87c2acff386..8ffcaf71d5c 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -4487,6 +4487,56 @@ vect_recog_mult_pattern (vec_info *vinfo,
>return pattern_stmt;
>  }
> 
> +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree));
> +
> +/*
> + * Try to detect saturation add pattern (SAT_ADD), aka below gimple:
> + *   _7 = _4 + _6;
> + *   _8 = _4 > _7;
> + *   _9 = (long unsigned int) _8;
> + *   _10 = -_9;
> + *   _12 = _7 | _10;
> + *
> + * And then simplied to
> + *   _12 = .SAT_ADD (_4, _6);
> + */
> +
> +static gimple *
> +vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
> + tree *type_out)
> +{
> +  gimple *last_stmt = STMT_VINFO_STMT (stmt_vinfo);
> +
> +  if (!is_gimple_assign (last_stmt))
> +return NULL;
> +
> +  tree res_ops[2];
> +  tree lhs = gimple_assign_lhs (last_stmt);
> +
> +  if (gimple_unsigned_integer_sat_add (lhs, res_ops, NULL))
> +{
> +  tree itype = TREE_TYPE (res_ops[0]);
> +  tree vtype = get_vectype_for_scalar_type (vinfo, itype);
> +
> +  if (vtype != NULL_TREE && direct_internal_fn_supported_p (
> + IFN_SAT_ADD, vtype, OPTIMIZE_FOR_SPEED))
> + {
> +   *type_out = vtype;
> +   gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, res_ops[0],
> + res_ops[1]);
> +

RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int

2024-05-13 Thread Tamar Christina
Hi Pan,

> -Original Message-
> From: pan2...@intel.com 
> Sent: Monday, May 6, 2024 3:48 PM
> To: gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina
> ; richard.guent...@gmail.com;
> hongtao@intel.com; Pan Li 
> Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned 
> scalar
> int
> 
> From: Pan Li 
> 
> This patch would like to add the middle-end presentation for the
> saturation add.  Aka set the result of add to the max when overflow.
> It will take the pattern similar as below.
> 
> SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> 
> Take uint8_t as example, we will have:
> 
> * SAT_ADD (1, 254)   => 255.
> * SAT_ADD (1, 255)   => 255.
> * SAT_ADD (2, 255)   => 255.
> * SAT_ADD (255, 255) => 255.
> 
> Given below example for the unsigned scalar integer uint64_t:
> 
> uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> {
>   return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> }
> 
> Before this patch:
> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> {
>   long unsigned int _1;
>   _Bool _2;
>   long unsigned int _3;
>   long unsigned int _4;
>   uint64_t _7;
>   long unsigned int _10;
>   __complex__ long unsigned int _11;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
>   _1 = REALPART_EXPR <_11>;
>   _10 = IMAGPART_EXPR <_11>;
>   _2 = _10 != 0;
>   _3 = (long unsigned int) _2;
>   _4 = -_3;
>   _7 = _1 | _4;
>   return _7;
> ;;succ:   EXIT
> 
> }
> 
> After this patch:
> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> {
>   uint64_t _7;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
>   return _7;
> ;;succ:   EXIT
> }
> 
> We perform the tranform during widen_mult because that the sub-expr of
> SAT_ADD will be optimized to .ADD_OVERFLOW.  We need to try the .SAT_ADD
> pattern first and then .ADD_OVERFLOW,  or we may never catch the pattern
> .SAT_ADD.  Meanwhile, the isel pass is after widen_mult and then we
> cannot perform the .SAT_ADD pattern match as the sub-expr will be
> optmized to .ADD_OVERFLOW first.
> 
> The below tests are passed for this patch:
> 1. The riscv fully regression tests.
> 2. The aarch64 fully regression tests.
> 3. The x86 bootstrap tests.
> 4. The x86 fully regression tests.
> 
>   PR target/51492
>   PR target/112600
> 
> gcc/ChangeLog:
> 
>   * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD
>   to the return true switch case(s).
>   * internal-fn.def (SAT_ADD):  Add new signed optab SAT_ADD.
>   * match.pd: Add unsigned SAT_ADD match.
>   * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd.
>   * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern
>   func decl generated in match.pd match.
>   (match_saturation_arith): New func impl to match the saturation arith.
>   (math_opts_dom_walker::after_dom_children): Try match saturation
>   arith.
> 
> Signed-off-by: Pan Li 
> ---
>  gcc/internal-fn.cc|  1 +
>  gcc/internal-fn.def   |  2 ++
>  gcc/match.pd  | 28 
>  gcc/optabs.def|  4 ++--
>  gcc/tree-ssa-math-opts.cc | 46
> +++
>  5 files changed, 79 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 0a7053c2286..73045ca8c8c 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn)
>  case IFN_UBSAN_CHECK_MUL:
>  case IFN_ADD_OVERFLOW:
>  case IFN_MUL_OVERFLOW:
> +case IFN_SAT_ADD:
>  case IFN_VEC_WIDEN_PLUS:
>  case IFN_VEC_WIDEN_PLUS_LO:
>  case IFN_VEC_WIDEN_PLUS_HI:
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 848bb9dbff3..25badbb86e5 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST
> | ECF_NOTHROW, first,
>  DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW,
> first,
> smulhrs, umulhrs, binary)
> 
> +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd,
> binary)
> +
>  DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
>  DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
>  DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary)
> diff --git a/gcc/match.pd b/gcc/match.pd
> index d401e7503e6..7058e4cbe29 100644
>

RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD

2024-05-02 Thread Tamar Christina
> > So he was responding about how to do it for the vectorizer and scalar parts.
> > Remember that the goal is not to introduce new gimple IL that can block
> > other optimizations.
> > The vectorizer already introduces new IL (various IFNs) but this is fine as
> > we don't track things like ranges for vector instructions.  So we don't
> > lose any information here.
> 
> > Now for the scalar, if we do an early replacement like in match.pd we
> > prevent a lot of other optimizations because they don't know what
> > IFN_SAT_ADD does.  gimple-isel runs pretty late, and so at this point we
> > don't expect many more optimizations to happen, so it's a safe spot to
> > insert more IL with "unknown semantics".
> 
> > Was that your intention Richi?
> 
> Thanks Tamar for the clear explanation.  Does that mean both the scalar and
> vector cases will go through the isel approach?  If so, I may have
> misunderstood previously that it was only for the vectorizer.

No, the isel would only be for the scalar case; the vectorizer will still use
the vect_pattern.  It needs to, so that we can cost the operation correctly,
and in some cases, depending on how the saturation is described, you are
unable to vectorize.  The pattern allows us to catch these cases and still
vectorize.

But you should be able to use the same match.pd predicate for both the 
vectorizer pattern
and isel.
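
As a rough sketch of what that sharing could look like (hypothetical and
simplified; lhs and the surrounding context are assumed to come from the
caller, this is not the actual patch code):

  /* The gimple_* predicate is generated from the (match ...) in match.pd and
     fills ops[] with the two addends when the saturating-add shape matches.  */
  tree ops[2];
  if (gimple_unsigned_integer_sat_add (lhs, ops, NULL)
      && direct_internal_fn_supported_p (IFN_SAT_ADD, TREE_TYPE (lhs),
                                         OPTIMIZE_FOR_BOTH))
    {
      gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, ops[0], ops[1]);
      gimple_call_set_lhs (call, lhs);
      /* ... then replace the original statement with the call (scalar isel /
         widening_mul case) or record it as the pattern stmt (vectorizer).  */
    }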

> 
> I understand the point that we would like to do the pattern match late, but I
> have a question here.
> Given that the SAT_ADD pattern is fairly complicated, it is possible that a
> sub-expression of SAT_ADD is optimized by another early pass and we can
> hardly catch the shape later.
> 
> For example, there is a plus expression in SAT_ADD, and in an early pass it
> may be optimized to .ADD_OVERFLOW; the pattern then looks quite different and
> is hard to recognize in a later pass.
> 

Yeah, it looks like this transformation is done in widening_mul, which is the
other place where Richi suggested recognizing SAT_ADD.  widening_mul already
runs quite late as well, so it's also OK.

If you put it there, before the code that transforms the sequence to overflow,
it should work.

Eventually we do need to recognize this variant since:

uint64_t
add_sat (uint64_t x, uint64_t y) noexcept
{
  uint64_t z;
  if (!__builtin_add_overflow (x, y, &z))
    return z;
  return (uint64_t)-1;
}

Is a valid and common way to do saturation too.

But for now, it's fine.
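
For reference, a minimal sketch (not part of any of the patches) checking that
the branchless form and the __builtin_add_overflow variant agree:

#include <stdint.h>
#include <assert.h>

static uint64_t sat_add_branchless (uint64_t x, uint64_t y)
{
  return (x + y) | (-(uint64_t)((uint64_t)(x + y) < x));
}

static uint64_t sat_add_builtin (uint64_t x, uint64_t y)
{
  uint64_t z;
  if (!__builtin_add_overflow (x, y, &z))
    return z;
  return (uint64_t)-1;
}

int main (void)
{
  assert (sat_add_branchless (1, 254) == 255);
  assert (sat_add_builtin ((uint64_t)-2, 5) == (uint64_t)-1);
  assert (sat_add_branchless ((uint64_t)-2, 5)
          == sat_add_builtin ((uint64_t)-2, 5));
  return 0;
}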

Cheers,
Tamar

> Sorry not sure if my understanding is correct, feel free to correct me.
> 
> Pan
> 
> -Original Message-
> From: Tamar Christina 
> Sent: Thursday, May 2, 2024 11:26 AM
> To: Li, Pan2 ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com;
> Liu, Hongtao 
> Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD
> 
> > -Original Message-
> > From: Li, Pan2 
> > Sent: Thursday, May 2, 2024 4:11 AM
> > To: Tamar Christina ; gcc-patches@gcc.gnu.org
> > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com;
> > Liu, Hongtao 
> > Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD
> >
> > Thanks Tamar
> >
> > > Could you also split off the vectorizer change from scalar recog one? 
> > > Typically I
> > would structure a change like this as:
> >
> > > 1. create types/structures + scalar recogn
> > > 2. Vector recog code
> > > 3. Backend changes
> >
> > Sure thing, will rearrange the patch like this.
> >
> > > Is ECF_NOTHROW correct here? At least on most targets I believe the scalar
> > version
> > > can set flags/throw exceptions if the saturation happens?
> >
> > I see, will remove that.
> >
> > > Hmm I believe Richi mentioned that he wanted the recognition done in isel?
> >
> > > The problem with doing it in match.pd is that it replaces the operations
> > > quite early in the pipeline.  Did I miss an email perhaps?  The early
> > > replacement means we lose optimizations and things such as range
> > > calculations etc., since e.g. ranger doesn't know these internal
> > > functions.
> >
> > > I think Richi will want this in isel or mult widening, but I'll continue
> > > with the match.pd review just in case.
> >
> > If I understand correctly, Richard suggested trying vectorizer patterns
> > first and then possibly isel.
> > Thus, I haven't tried SAT_ADD in ISEL, as the vectorizer patterns work well
> > for SAT_ADD.
> > Let's wait 

RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD

2024-05-01 Thread Tamar Christina
> -Original Message-
> From: Li, Pan2 
> Sent: Thursday, May 2, 2024 4:11 AM
> To: Tamar Christina ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com;
> Liu, Hongtao 
> Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD
> 
> Thanks Tamar
> 
> > Could you also split off the vectorizer change from scalar recog one? 
> > Typically I
> would structure a change like this as:
> 
> > 1. create types/structures + scalar recogn
> > 2. Vector recog code
> > 3. Backend changes
> 
> Sure thing, will rearrange the patch like this.
> 
> > Is ECF_NOTHROW correct here? At least on most targets I believe the scalar
> version
> > can set flags/throw exceptions if the saturation happens?
> 
> I see, will remove that.
> 
> > Hmm I believe Richi mentioned that he wanted the recognition done in isel?
> 
> > The problem with doing it in match.pd is that it replaces the operations
> > quite early in the pipeline.  Did I miss an email perhaps?  The early
> > replacement means we lose optimizations and things such as range
> > calculations etc., since e.g. ranger doesn't know these internal functions.
> 
> > I think Richi will want this in isel or mult widening, but I'll continue
> > with the match.pd review just in case.
> 
> If I understand correctly, Richard suggested trying vectorizer patterns first
> and then possibly isel.
> Thus, I haven't tried SAT_ADD in ISEL, as the vectorizer patterns work well
> for SAT_ADD.
> Let's wait for confirmation from Richard.  Below are his original words from
> the previous mail, for reference.
> 

I think the comment he made was this

> > Given we have saturating integer ALU instructions like below, could you
> > help coach me on the most reasonable way to represent it in the scalar as
> > well as the vectorized part?  Sorry, I'm not familiar with this part and am
> > still digging into how it works...
> 
> As in your v2, .SAT_ADD for both sat_uadd and sat_sadd, similar for
> the other cases.
>
> As I said, use vectorizer patterns and possibly do instruction
> selection at ISEL/widen_mult time.

So he was responding about how to do it for the vectorizer and scalar parts.
Remember that the goal is not to introduce new gimple IL that can block other
optimizations.  The vectorizer already introduces new IL (various IFNs) but
this is fine as we don't track things like ranges for vector instructions.  So
we don't lose any information here.

Now for the scalar, if we do an early replacement like in match.pd we prevent
a lot of other optimizations because they don't know what IFN_SAT_ADD does.
gimple-isel runs pretty late, and so at this point we don't expect many more
optimizations to happen, so it's a safe spot to insert more IL with
"unknown semantics".

Was that your intention Richi?

Thanks,
Tamar

> >> As I said, use vectorizer patterns and possibly do instruction
> >> selection at ISEL/widen_mult time.
> 
> > The optimize checks in the match.pd file are weird as it seems to check if 
> > we have
> > optimizations enabled?
> 
> > We don't typically need to do this.
> 
> Sure, will remove this.
> 
> > The function has only one caller, you should just inline it into the 
> > pattern.
> 
> Sure thing.
> 
> > Once you inline vect_sat_add_build_call you can do the check for
> > vtype here, which is the cheaper check so perform it early.
> 
> Sure thing.
> 
> Thanks again and will send the v4 with all comments addressed, as well as the 
> test
> results.
> 
> Pan
> 
> -Original Message-
> From: Tamar Christina 
> Sent: Thursday, May 2, 2024 1:06 AM
> To: Li, Pan2 ; gcc-patches@gcc.gnu.org
> Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com;
> Liu, Hongtao 
> Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD
> 
> Hi,
> 
> > From: Pan Li 
> >
> > Update in v3:
> > * Rebase upstream for conflict.
> >
> > Update in v2:
> > * Fix one failure for x86 bootstrap.
> >
> > Original log:
> >
> > This patch would like to add the middle-end presentation for the
> > saturation add.  Aka set the result of add to the max when overflow.
> > It will take the pattern similar as below.
> >
> > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> >
> > Take uint8_t as example, we will have:
> >
> > * SAT_ADD (1, 254)   => 255.
> > * SAT_ADD (1, 255)   => 255.
> > * SAT_ADD (2, 255)   => 255.
> > * SAT_ADD (255, 255) => 255.
> >
> > The p

RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD

2024-05-01 Thread Tamar Christina
Hi,

> From: Pan Li 
> 
> Update in v3:
> * Rebase upstream for conflict.
> 
> Update in v2:
> * Fix one failure for x86 bootstrap.
> 
> Original log:
> 
> This patch would like to add the middle-end representation for the
> saturating add, i.e. set the result of the add to the max value when it
> overflows.  It will match a pattern similar to the one below.
> 
> SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> 
> Take uint8_t as example, we will have:
> 
> * SAT_ADD (1, 254)   => 255.
> * SAT_ADD (1, 255)   => 255.
> * SAT_ADD (2, 255)   => 255.
> * SAT_ADD (255, 255) => 255.
> 
> The patch also implements SAT_ADD in the riscv backend as
> a sample for both the scalar and vector cases.  Given the example below:
> 
> uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> {
>   return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> }
> 
> Before this patch:
> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> {
>   long unsigned int _1;
>   _Bool _2;
>   long unsigned int _3;
>   long unsigned int _4;
>   uint64_t _7;
>   long unsigned int _10;
>   __complex__ long unsigned int _11;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _11 = .ADD_OVERFLOW (x_5(D), y_6(D));
>   _1 = REALPART_EXPR <_11>;
>   _10 = IMAGPART_EXPR <_11>;
>   _2 = _10 != 0;
>   _3 = (long unsigned int) _2;
>   _4 = -_3;
>   _7 = _1 | _4;
>   return _7;
> ;;succ:   EXIT
> 
> }
> 
> After this patch:
> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> {
>   uint64_t _7;
> 
> ;;   basic block 2, loop depth 0
> ;;pred:   ENTRY
>   _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call]
>   return _7;
> ;;succ:   EXIT
> }
> 
> For vectorization, we leverage the existing vect pattern recog to find
> a pattern similar to the scalar one and let the vectorizer perform
> the rest for the standard name usadd<mode>3 in vector mode.
> The riscv vector backend has the insn "Vector Single-Width Saturating
> Add and Subtract", which can be leveraged when expanding usadd<mode>3
> in vector mode.  For example:
> 
> void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
> {
>   unsigned i;
> 
>   for (i = 0; i < n; i++)
> out[i] = (x[i] + y[i]) | (- (uint64_t)((uint64_t)(x[i] + y[i]) < x[i]));
> }
> 
> Before this patch:
> void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
> {
>   ...
>   _80 = .SELECT_VL (ivtmp_78, POLY_INT_CST [2, 2]);
>   ivtmp_58 = _80 * 8;
>   vect__4.7_61 = .MASK_LEN_LOAD (vectp_x.5_59, 64B, { -1, ... }, _80, 0);
>   vect__6.10_65 = .MASK_LEN_LOAD (vectp_y.8_63, 64B, { -1, ... }, _80, 0);
>   vect__7.11_66 = vect__4.7_61 + vect__6.10_65;
>   mask__8.12_67 = vect__4.7_61 > vect__7.11_66;
>   vect__12.15_72 = .VCOND_MASK (mask__8.12_67, { 18446744073709551615,
> ... }, vect__7.11_66);
>   .MASK_LEN_STORE (vectp_out.16_74, 64B, { -1, ... }, _80, 0, vect__12.15_72);
>   vectp_x.5_60 = vectp_x.5_59 + ivtmp_58;
>   vectp_y.8_64 = vectp_y.8_63 + ivtmp_58;
>   vectp_out.16_75 = vectp_out.16_74 + ivtmp_58;
>   ivtmp_79 = ivtmp_78 - _80;
>   ...
> }
> 
> vec_sat_add_u64:
>   ...
>   vsetvli a5,a3,e64,m1,ta,ma
>   vle64.v v0,0(a1)
>   vle64.v v1,0(a2)
>   sllia4,a5,3
>   sub a3,a3,a5
>   add a1,a1,a4
>   add a2,a2,a4
>   vadd.vv v1,v0,v1
>   vmsgtu.vv   v0,v0,v1
>   vmerge.vim  v1,v1,-1,v0
>   vse64.v v1,0(a0)
>   ...
> 
> After this patch:
> void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n)
> {
>   ...
>   _62 = .SELECT_VL (ivtmp_60, POLY_INT_CST [2, 2]);
>   ivtmp_46 = _62 * 8;
>   vect__4.7_49 = .MASK_LEN_LOAD (vectp_x.5_47, 64B, { -1, ... }, _62, 0);
>   vect__6.10_53 = .MASK_LEN_LOAD (vectp_y.8_51, 64B, { -1, ... }, _62, 0);
>   vect__12.11_54 = .SAT_ADD (vect__4.7_49, vect__6.10_53);
>   .MASK_LEN_STORE (vectp_out.12_56, 64B, { -1, ... }, _62, 0, vect__12.11_54);
>   ...
> }
> 
> vec_sat_add_u64:
>   ...
>   vsetvli a5,a3,e64,m1,ta,ma
>   vle64.v v1,0(a1)
>   vle64.v v2,0(a2)
>   sllia4,a5,3
>   sub a3,a3,a5
>   add a1,a1,a4
>   add a2,a2,a4
>   vsaddu.vv   v1,v1,v2
>   vse64.v v1,0(a0)
>   ...
> 
> To limit the patch size for review, only the unsigned version of
> usadd<mode>3 is involved here.  The signed version will be covered
> in subsequent patch(es).
> 
> The below test suites are passed for this patch.
> * The riscv fully regression tests.
> * The aarch64 fully regression tests.
> * The x86 bootstrap tests.
> * The x86 fully regression tests.
> 
>   PR target/51492
>   PR target/112600
> 
> gcc/ChangeLog:
> 
>   * config/riscv/autovec.md (usadd3): New pattern expand
>   for unsigned SAT_ADD vector.
>   * config/riscv/riscv-protos.h (riscv_expand_usadd): New func
>   decl to expand usadd3 pattern.
>   (expand_vec_usadd): Ditto but for vector.
>   * config/riscv/riscv-v.cc (emit_vec_saddu): New func impl to
>   emit the vsadd insn.
>   (expand_vec_usadd): New func impl to expand usadd3 for
>   vector.
>   * config/riscv/riscv.cc (riscv_expand_usadd): New func impl
>   to 

[gcc r14-10040] middle-end: refactor vect_recog_absolute_difference to simplify flow [PR114769]

2024-04-19 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:1216460e7023cd8ec49933866107417c70e933c9

commit r14-10040-g1216460e7023cd8ec49933866107417c70e933c9
Author: Tamar Christina 
Date:   Fri Apr 19 15:22:13 2024 +0100

middle-end: refactor vect_recog_absolute_difference to simplify flow 
[PR114769]

Hi All,

As the reporter in PR114769 points out the control flow for the abd 
detection
is hard to follow.  This is because vect_recog_absolute_difference has two
different ways it can return true.

1. It can return true when the widening operation is matched, in which case
   unprom is set, half_type is not NULL and diff_stmt is not set.

2. It can return true when the widening operation is not matched, but the stmt
   being checked is a minus.  In this case unprom is not set, half_type is set
   to NULL and diff_stmt is set.  This is because to get to diff_stmt you have
   to dig through the abs statement and any possible promotions.

This however leads to complicated uses of the function at the call sites as 
the
exact semantic needs to be known to use it safely.

vect_recog_absolute_difference has two callers:

1. vect_recog_sad_pattern, where if you return true with unprom not set, then
   *half_type will be NULL.  The call to vect_supportable_direct_optab_p will
   always reject it since there's no vector mode for NULL.  Note that, looking
   at the dump files, the convention has always been that we first indicate
   that a pattern could possibly be recognized and then check that it's
   supported.

   This change somewhat incorrectly makes the diagnostic message get printed
   for "invalid" patterns.

2. vect_recog_abd_pattern, where if half_type is NULL, it then uses 
diff_stmt to
   set them.

This refactors the code: it now has only one success condition, and diff_stmt
is always set to the minus statement in the abs if there is one.

The function now only returns success if the widening minus is found, in which
case unprom and half_type are set.

This then leaves it up to the caller to decide if they want to do anything
with diff_stmt.

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/114769
* tree-vect-patterns.cc:
(vect_recog_absolute_difference): Have only one success condition.
(vect_recog_abd_pattern): Handle further checks if
vect_recog_absolute_difference fails.

Diff:
---
 gcc/tree-vect-patterns.cc | 43 ---
 1 file changed, 16 insertions(+), 27 deletions(-)

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 4f491c6b833..87c2acff386 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -797,8 +797,7 @@ vect_split_statement (vec_info *vinfo, stmt_vec_info 
stmt2_info, tree new_rhs,
HALF_TYPE and UNPROM will be set should the statement be found to
be a widened operation.
DIFF_STMT will be set to the MINUS_EXPR
-   statement that precedes the ABS_STMT unless vect_widened_op_tree
-   succeeds.
+   statement that precedes the ABS_STMT if it is a MINUS_EXPR..
  */
 static bool
 vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt,
@@ -843,6 +842,12 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign 
*abs_stmt,
   if (!diff_stmt_vinfo)
 return false;
 
+  gassign *diff = dyn_cast  (STMT_VINFO_STMT (diff_stmt_vinfo));
+  if (diff_stmt && diff
+  && gimple_assign_rhs_code (diff) == MINUS_EXPR
+  && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd)))
+*diff_stmt = diff;
+
   /* FORNOW.  Can continue analyzing the def-use chain when this stmt in a phi
  inside the loop (in case we are analyzing an outer-loop).  */
   if (vect_widened_op_tree (vinfo, diff_stmt_vinfo,
@@ -850,17 +855,6 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign 
*abs_stmt,
false, 2, unprom, half_type))
 return true;
 
-  /* Failed to find a widen operation so we check for a regular MINUS_EXPR.  */
-  gassign *diff = dyn_cast  (STMT_VINFO_STMT (diff_stmt_vinfo));
-  if (diff_stmt && diff
-  && gimple_assign_rhs_code (diff) == MINUS_EXPR
-  && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd)))
-{
-  *diff_stmt = diff;
-  *half_type = NULL_TREE;
-  return true;
-}
-
   return false;
 }
 
@@ -1499,27 +1493,22 @@ vect_recog_abd_pattern (vec_info *vinfo,
   tree out_type = TREE_TYPE (gimple_assign_lhs (last_stmt));
 
   vect_unpromoted_value unprom[2];
-  gassign *diff_stmt;
-  tree half_type;
-  if (!vect_recog_absolute_difference (vinfo, last_stmt, _type,
+  gassign *diff_stmt = NULL;
+  tree abd_in_type;
+  if (!vect_recog_absolute_difference (vinfo, last_stmt, _in_type,
   unprom, _st

[PATCH]middle-end: refactor vect_recog_absolute_difference to simplify flow [PR114769]

2024-04-19 Thread Tamar Christina
Hi All,

As the reporter in PR114769 points out the control flow for the abd detection
is hard to follow.  This is because vect_recog_absolute_difference has two
different ways it can return true.

1. It can return true when the widening operation is matched, in which case
   unprom is set, half_type is not NULL and diff_stmt is not set.

2. It can return true when the widening operation is not matched, but the stmt
   being checked is a minus.  In this case unprom is not set, half_type is set
   to NULL and diff_stmt is set.  This is because to get to diff_stmt you have
   to dig through the abs statement and any possible promotions.

This however leads to complicated uses of the function at the call sites as the
exact semantic needs to be known to use it safely.

vect_recog_absolute_difference has two callers:

1. vect_recog_sad_pattern, where if you return true with unprom not set, then
   *half_type will be NULL.  The call to vect_supportable_direct_optab_p will
   always reject it since there's no vector mode for NULL.  Note that, looking
   at the dump files, the convention has always been that we first indicate
   that a pattern could possibly be recognized and then check that it's
   supported.

   This change somewhat incorrectly makes the diagnostic message get printed
   for "invalid" patterns.

2. vect_recog_abd_pattern, where if half_type is NULL, it then uses diff_stmt to
   set them.

So while the note in the dump file is misleading, the code is safe.

This refactors the code: it now has only one success condition, and diff_stmt
is always set to the minus statement in the abs if there is one.

The function now only returns success if the widening minus is found, in which
case unprom and half_type are set.

This then leaves it up to the caller to decide if they want to do anything
with diff_stmt.
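
For illustration (not from the patch), this is the kind of source loop the ABD
recognizer is aimed at, where the absolute difference is computed through a
widened minus followed by an abs:

#include <stdint.h>

void abd_u8 (uint8_t *restrict out, const uint8_t *a, const uint8_t *b, int n)
{
  for (int i = 0; i < n; i++)
    {
      int diff = a[i] - b[i];            /* widened MINUS_EXPR */
      out[i] = diff < 0 ? -diff : diff;  /* ABS_EXPR of the difference */
    }
}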

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/114769
* tree-vect-patterns.cc:
(vect_recog_absolute_difference): Have only one success condition.
(vect_recog_abd_pattern): Handle further checks if
vect_recog_absolute_difference fails.

---
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
4f491c6b8336f8710c3519dec1fa7e0f49387d2b..87c2acff386d91d22a3b2d6e6443d1f2f2326ea6
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -797,8 +797,7 @@ vect_split_statement (vec_info *vinfo, stmt_vec_info 
stmt2_info, tree new_rhs,
HALF_TYPE and UNPROM will be set should the statement be found to
be a widened operation.
DIFF_STMT will be set to the MINUS_EXPR
-   statement that precedes the ABS_STMT unless vect_widened_op_tree
-   succeeds.
+   statement that precedes the ABS_STMT if it is a MINUS_EXPR..
  */
 static bool
 vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt,
@@ -843,6 +842,12 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign 
*abs_stmt,
   if (!diff_stmt_vinfo)
 return false;
 
+  gassign *diff = dyn_cast  (STMT_VINFO_STMT (diff_stmt_vinfo));
+  if (diff_stmt && diff
+  && gimple_assign_rhs_code (diff) == MINUS_EXPR
+  && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd)))
+*diff_stmt = diff;
+
   /* FORNOW.  Can continue analyzing the def-use chain when this stmt in a phi
  inside the loop (in case we are analyzing an outer-loop).  */
   if (vect_widened_op_tree (vinfo, diff_stmt_vinfo,
@@ -850,17 +855,6 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign 
*abs_stmt,
false, 2, unprom, half_type))
 return true;
 
-  /* Failed to find a widen operation so we check for a regular MINUS_EXPR.  */
-  gassign *diff = dyn_cast  (STMT_VINFO_STMT (diff_stmt_vinfo));
-  if (diff_stmt && diff
-  && gimple_assign_rhs_code (diff) == MINUS_EXPR
-  && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd)))
-{
-  *diff_stmt = diff;
-  *half_type = NULL_TREE;
-  return true;
-}
-
   return false;
 }
 
@@ -1499,27 +1493,22 @@ vect_recog_abd_pattern (vec_info *vinfo,
   tree out_type = TREE_TYPE (gimple_assign_lhs (last_stmt));
 
   vect_unpromoted_value unprom[2];
-  gassign *diff_stmt;
-  tree half_type;
-  if (!vect_recog_absolute_difference (vinfo, last_stmt, _type,
+  gassign *diff_stmt = NULL;
+  tree abd_in_type;
+  if (!vect_recog_absolute_difference (vinfo, last_stmt, _in_type,
   unprom, _stmt))
-return NULL;
-
-  tree abd_in_type, abd_out_type;
-
-  if (half_type)
-{
-  abd_in_type = half_type;
-  abd_out_type = abd_in_type;
-}
-  else
 {
+  /* We cannot try further without having a non-widening MINUS.  */
+  if (!diff_stmt)
+   return NULL;
+
   unprom[0].op = gimple_assign_rhs1 (diff_stmt);
   unprom[1].op = gimple_assign_rhs2 (diff_stmt);
   abd_in_type = signed_type_for (out_type);
-  abd_out_type = abd_in_type;
 }
 
+ 

[gcc r14-10014] AArch64: remove reliance on register allocator for simd/gpreg costing. [PR114741]

2024-04-18 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:a2f4be3dae04fa8606d1cc8451f0b9d450f7e6e6

commit r14-10014-ga2f4be3dae04fa8606d1cc8451f0b9d450f7e6e6
Author: Tamar Christina 
Date:   Thu Apr 18 11:47:42 2024 +0100

AArch64: remove reliance on register allocator for simd/gpreg costing. 
[PR114741]

In PR114741 we see that we have a regression in codegen when SVE is enable 
where
the simple testcase:

void foo(unsigned v, unsigned *p)
{
*p = v & 1;
}

generates

foo:
fmovs31, w0
and z31.s, z31.s, #1
str s31, [x1]
ret

instead of:

foo:
and w0, w0, 1
str w0, [x1]
ret

This has an impact on not just code size but also performance.  This is caused
by the use of the ^ constraint modifier in the <optab><mode>3 pattern.

The documentation states that this modifier should only have an effect on the
alternative costing, in that a particular alternative is to be preferred unless
a non-pseudo reload is needed.

The pattern was trying to convey that whenever both r and w are required, 
that
it should prefer r unless a reload is needed.  This is because if a reload 
is
needed then we can construct the constants more flexibly on the SIMD side.

We were using this to simplify the implementation and to get generic cases
such as:

double negabs (double x)
{
   unsigned long long y;
   memcpy (&y, &x, sizeof(double));
   y = y | (1UL << 63);
   memcpy (&x, &y, sizeof(double));
   return x;
}

which don't go through an expander.
However the implementation of ^ in the register allocator is not according 
to
the documentation in that it also has an effect during coloring.  During 
initial
register class selection it applies a penalty to a class, similar to how ? 
does.

In this example the penalty makes the use of GP regs expensive enough that 
it no
longer considers them:

r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS
;;3--> b  0: i   9 r106=r105&0x1
:cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0)
 PR_HI_REGS+0(0):model 4

which is not the expected behavior.  For GCC 14 this is a conservative fix.

1. we remove the ^ modifier from the logical optabs.

2. In order not to regress copysign we then move the copysign expansion to
   directly use the SIMD variant.  Since copysign only supports floating 
point
   modes this is fine and no longer relies on the register allocator to 
select
   the right alternative.

It once again regresses the general case, but this case wasn't optimized in
earlier GCCs either, so it's not a regression in GCC 14.  This change gives
strictly better codegen than earlier GCCs and still optimizes the important
cases.

gcc/ChangeLog:

PR target/114741
* config/aarch64/aarch64.md (<optab><mode>3): Remove ^ from alt 2.
(copysign<mode>3): Use SIMD version of IOR directly.

gcc/testsuite/ChangeLog:

PR target/114741
* gcc.target/aarch64/fneg-abs_2.c: Update codegen.
* gcc.target/aarch64/fneg-abs_4.c: xfail for now.
* gcc.target/aarch64/pr114741.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64.md | 23 +
 gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c |  5 ++---
 gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c |  4 ++--
 gcc/testsuite/gcc.target/aarch64/pr114741.c   | 29 +++
 4 files changed, 48 insertions(+), 13 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 385a669b9b3..dbde066f747 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4811,7 +4811,7 @@
   ""
   {@ [ cons: =0 , 1  , 2; attrs: type , arch  ]
  [ r, %r , r; logic_reg   , * ] \t%0, 
%1, %2
- [ rk   , ^r ,  ; logic_imm   , * ] \t%0, 
%1, %2
+ [ rk   , r  ,  ; logic_imm   , * ] \t%0, 
%1, %2
  [ w, 0  ,  ; *   , sve   ] \t%Z0., 
%Z0., #%2
  [ w, w  , w; neon_logic  , simd  ] 
\t%0., %1., %2.
   }
@@ -7192,22 +7192,29 @@
(match_operand:GPF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  machine_mode int_mode = mode;
-  rtx bitmask = gen_reg_rtx (int_mode);
-  emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
-   << (GET_MODE_BITSIZE (mode) - 1)));
+  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
+  << (GET_MODE_BITSIZE (mode) - 1));
   /* copysign (x, -1) should instead be expanded as orr with the sign
  bit.  */
   rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
   if (GET_CODE (op2_elt) == CONST_DO

[PATCH]AArch64: remove reliance on register allocator for simd/gpreg costing. [PR114741]

2024-04-18 Thread Tamar Christina
Hi All,

In PR114741 we see that we have a regression in codegen when SVE is enable where
the simple testcase:

void foo(unsigned v, unsigned *p)
{
*p = v & 1;
}

generates

foo:
fmovs31, w0
and z31.s, z31.s, #1
str s31, [x1]
ret

instead of:

foo:
and w0, w0, 1
str w0, [x1]
ret

This has an impact on not just code size but also performance.  This is caused
by the use of the ^ constraint modifier in the <optab><mode>3 pattern.

The documentation states that this modifier should only have an effect on the
alternative costing in that a particular alternative is to be preferred unless
a non-pseudo reload is needed.

The pattern was trying to convey that whenever both r and w are required, that
it should prefer r unless a reload is needed.  This is because if a reload is
needed then we can construct the constants more flexibly on the SIMD side.

We were using this to simplify the implementation and to get generic cases such
as:

double negabs (double x)
{
   unsigned long long y;
   memcpy (&y, &x, sizeof(double));
   y = y | (1UL << 63);
   memcpy (&x, &y, sizeof(double));
   return x;
}

which don't go through an expander.
However the implementation of ^ in the register allocator is not according to
the documentation in that it also has an effect during coloring.  During initial
register class selection it applies a penalty to a class, similar to how ? does.

In this example the penalty makes the use of GP regs expensive enough that it no
longer considers them:

r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS
;;3--> b  0: i   9 r106=r105&0x1
:cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0)
 PR_HI_REGS+0(0):model 4

which is not the expected behavior.  For GCC 14 this is a conservative fix.

1. we remove the ^ modifier from the logical optabs.

2. In order not to regress copysign we then move the copysign expansion to
   directly use the SIMD variant.  Since copysign only supports floating point
   modes this is fine and no longer relies on the register allocator to select
   the right alternative.

It once again regresses the general case, but this case wasn't optimized in
earlier GCCs either, so it's not a regression in GCC 14.  This change gives
strictly better codegen than earlier GCCs and still optimizes the important cases.
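
For illustration (not part of the patch), the copysign-with-negative-constant
case that now goes straight to the vector IOR with the sign bit looks like:

double negabs_via_copysign (double x)
{
  /* copysign (x, -1.0) == -fabs (x); only the sign bit of the second operand
     matters, so this can be done as an OR with the sign bit.  */
  return __builtin_copysign (x, -1.0);
}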

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:


PR target/114741
* config/aarch64/aarch64.md (<optab><mode>3): Remove ^ from alt 2.
(copysign<mode>3): Use SIMD version of IOR directly.

gcc/testsuite/ChangeLog:

PR target/114741
* gcc.target/aarch64/fneg-abs_2.c: Update codegen.
* gcc.target/aarch64/fneg-abs_4.c: xfail for now.
* gcc.target/aarch64/pr114741.c: New test.

---
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 
385a669b9b3c31cc9108a660e881b9091c71fc7c..dbde066f7478bec51a8703b017ea553aa98be309
 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4811,7 +4811,7 @@ (define_insn "3"
   ""
   {@ [ cons: =0 , 1  , 2; attrs: type , arch  ]
  [ r, %r , r; logic_reg   , * ] \t%0, 
%1, %2
- [ rk   , ^r ,  ; logic_imm   , * ] \t%0, 
%1, %2
+ [ rk   , r  ,  ; logic_imm   , * ] \t%0, 
%1, %2
  [ w, 0  ,  ; *   , sve   ] \t%Z0., 
%Z0., #%2
  [ w, w  , w; neon_logic  , simd  ] 
\t%0., %1., %2.
   }
@@ -7192,22 +7192,29 @@ (define_expand "copysign3"
(match_operand:GPF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  machine_mode int_mode = mode;
-  rtx bitmask = gen_reg_rtx (int_mode);
-  emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U
-   << (GET_MODE_BITSIZE (mode) - 1)));
+  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
+  << (GET_MODE_BITSIZE (mode) - 1));
   /* copysign (x, -1) should instead be expanded as orr with the sign
  bit.  */
   rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
   if (GET_CODE (op2_elt) == CONST_DOUBLE
   && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
 {
-  emit_insn (gen_ior3 (
-   lowpart_subreg (int_mode, operands[0], mode),
-   lowpart_subreg (int_mode, operands[1], mode), bitmask));
+  rtx v_bitmask
+   = force_reg (V2mode,
+gen_const_vec_duplicate (V2mode,
+ signbit_const));
+
+  emit_insn (gen_iorv23 (
+   lowpart_subreg (V2mode, operands[0], mode),
+   lowpart_subreg (V2mode, operands[1], mode),
+   v_bitmask));
   DONE;
 }
 
+  machine_mode int_mode = mode;
+  rtx bitmask = gen_reg_rtx (int_mode);
+  emit_move_insn (bitmask, signbit_const);
   operands[2] = force_reg (mode, operands[2]);
   emit_insn (gen_copysign3_insn (operands[0], operands[1], operands[2],
 

gcc-wwwdocs branch master updated. 3530b8d820658fb3add4b06def91672a0053f2b2

2024-04-16 Thread Tamar Christina via Gcc-cvs-wwwdocs
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "gcc-wwwdocs".

The branch, master has been updated
   via  3530b8d820658fb3add4b06def91672a0053f2b2 (commit)
  from  794555052d5c1d9a92298aba1fc4b645042946dd (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -
commit 3530b8d820658fb3add4b06def91672a0053f2b2
Author: Tamar Christina 
Date:   Mon Apr 15 16:00:21 2024 +0100

gcc-14/docs: document early break support and pragma novector

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 6035ae37..c98ebe5a 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -124,6 +124,34 @@ a work-in-progress.
 for indicating parameters that are expected to be null-terminated
 strings.
   
+  
+   The vectorizer now supports vectorizing loops which contain any number 
of early breaks.
+   This means loops such as:
+   
+   int z[100], y[100], x[100];
+   int foo (int n)
+   {
+ int res = 0;
+ for (int i = 0; i < n; i++)
+   {
+  y[i] = x[i] * 2;
+  res += x[i] + y[i];
+
+  if (x[i] > 5)
+break;
+
+  if (z[i] > 5)
+break;
+
+   }
+ return res;
+   }
+   
+   can now be vectorized on a number of targets.  In this first version any
+   input data sources must either have a statically known size at compile 
time
+   or the vectorizer must be able to determine based on auxillary 
information
+   that the accesses are aligned.
+  
 
 
 New Languages and Language specific improvements
@@ -234,6 +262,9 @@ a work-in-progress.
   previous options -std=c2x, -std=gnu2x
   and -Wc11-c2x-compat, which are deprecated but remain
   supported.
+  GCC supports a new pragma pragma GCC novector to
+  indicate to the vectorizer not to vectorize the loop annotated with the
+  pragma.
 
 
 C++
@@ -403,6 +434,9 @@ a work-in-progress.
   warnings are enabled for C++ as well
   The DR 2237 code no longer gives an error, it emits
   a -Wtemplate-id-cdtor warning instead
+  GCC supports a new pragma pragma GCC novector to
+  indicate to the vectorizer not to vectorize the loop annotated with the
+  pragma.
 
 
 Runtime Library (libstdc++)

---

Summary of changes:
 htdocs/gcc-14/changes.html | 34 ++
 1 file changed, 34 insertions(+)
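
A minimal usage sketch of the pragma documented above (not part of the commit):

void scale (int *a, int n)
{
  /* Ask GCC not to vectorize this particular loop.  */
  #pragma GCC novector
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}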


hooks/post-receive
-- 
gcc-wwwdocs


[gcc r14-9997] testsuite: Fix data check loop on vect-early-break_124-pr114403.c

2024-04-16 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:f438acf7ce2e6cb862cf62f2543c36639e2af233

commit r14-9997-gf438acf7ce2e6cb862cf62f2543c36639e2af233
Author: Tamar Christina 
Date:   Tue Apr 16 20:56:26 2024 +0100

testsuite: Fix data check loop on vect-early-break_124-pr114403.c

The testcase had the wrong indices in the buffer check loop.

gcc/testsuite/ChangeLog:

PR tree-optimization/114403
* gcc.dg/vect/vect-early-break_124-pr114403.c: Fix check loop.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
index 1751296ab81..51abf245ccb 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
@@ -68,8 +68,8 @@ int main ()
 
   int store_size = sizeof(PV);
 #pragma GCC novector
-  for (int i = 0; i < NUM - 1; i+=store_size)
-if (0 != __builtin_memcmp (buffer+i, (char*)[i].Val, store_size))
+  for (int i = 0; i < NUM - 1; i++)
+if (0 != __builtin_memcmp (buffer+(i*store_size), (char*)[i].Val, 
store_size))
   __builtin_abort ();
 
   return 0;


RE: [PATCH]middle-end: skip vectorization check on ilp32 on vect-early-break_124-pr114403.c

2024-04-16 Thread Tamar Christina
> On Tue, Apr 16, 2024 at 09:00:53AM +0200, Richard Biener wrote:
> > >   PR tree-optimization/114403
> > >   * gcc.dg/vect/vect-early-break_124-pr114403.c: Skip in ilp32.
> > >
> > > ---
> > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
> > > index
> 1751296ab813fe85eaab1f58dc674bac10f6eb7a..db8e00556f116ca81c5a6558e
> c6ecd3b222ec93d 100644
> > > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
> > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
> > > @@ -2,11 +2,11 @@
> > >  /* { dg-require-effective-target vect_early_break_hw } */
> > >  /* { dg-require-effective-target vect_long_long } */
> > >
> > > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! 
> > > ilp32 } } }
> } */
> > >
> > >  #include "tree-vect.h"
> > >
> > > -typedef unsigned long PV;
> > > +typedef unsigned long long PV;
> > >  typedef struct _buff_t {
> > >  int foo;
> > >  PV Val;
> 
> As discussed on IRC, I think we want
> --- gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c.jj  2024-04-
> 16 08:43:36.001729192 +0200
> +++ gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c 2024-04-
> 16 08:55:11.296214959 +0200
> @@ -64,8 +64,8 @@ int main ()
> 
>int store_size = sizeof(PV);
>  #pragma GCC novector
> -  for (int i = 0; i < NUM - 1; i+=store_size)
> -if (0 != __builtin_memcmp (buffer+i, (char*)[i].Val, store_size))
> +  for (int i = 0; i < NUM - 1; i++)
> +if (0 != __builtin_memcmp (buffer+i*store_size, (char*)[i].Val, 
> store_size))
>__builtin_abort ();
> 
>return 0;
> 
> instead (and then I think there is no need to switch PV from unsigned long
> to unsigned long long, nor disabling on ilp32.
> 

Regtested on x86_64-pc-linux-gnu with -m32,-m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR tree-optimization/114403
* gcc.dg/vect/vect-early-break_124-pr114403.c: Fix check loop.

-- inline copy of patch --

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
index 
1751296ab813fe85eaab1f58dc674bac10f6eb7a..51abf245ccb51b85f06916a8a0238698911ab551
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
@@ -68,8 +68,8 @@ int main ()
 
   int store_size = sizeof(PV);
 #pragma GCC novector
-  for (int i = 0; i < NUM - 1; i+=store_size)
-if (0 != __builtin_memcmp (buffer+i, (char*)[i].Val, store_size))
+  for (int i = 0; i < NUM - 1; i++)
+if (0 != __builtin_memcmp (buffer+(i*store_size), (char*)[i].Val, 
store_size))
   __builtin_abort ();
 
   return 0;





[PATCH]middle-end: skip vectorization check on ilp32 on vect-early-break_124-pr114403.c

2024-04-15 Thread Tamar Christina
Hi all,

The testcase seems to fail vectorization on -m32 since the access pattern is
determined as too complex.  This skips the vectorization check on ilp32 systems
as I couldn't find a better proxy for being able to do strided 64-bit loads and
I suspect it would fail on all 32-bit targets.

Regtested on x86_64-pc-linux-gnu with -m32 and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR tree-optimization/114403
* gcc.dg/vect/vect-early-break_124-pr114403.c: Skip in ilp32.

---
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
index 
1751296ab813fe85eaab1f58dc674bac10f6eb7a..db8e00556f116ca81c5a6558ec6ecd3b222ec93d
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
@@ -2,11 +2,11 @@
 /* { dg-require-effective-target vect_early_break_hw } */
 /* { dg-require-effective-target vect_long_long } */
 
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! ilp32 } } 
} } */
 
 #include "tree-vect.h"
 
-typedef unsigned long PV;
+typedef unsigned long long PV;
 typedef struct _buff_t {
 int foo;
 PV Val;




-- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
index 1751296ab813fe85eaab1f58dc674bac10f6eb7a..db8e00556f116ca81c5a6558ec6ecd3b222ec93d 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c
@@ -2,11 +2,11 @@
 /* { dg-require-effective-target vect_early_break_hw } */
 /* { dg-require-effective-target vect_long_long } */
 
-/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! ilp32 } } } } */
 
 #include "tree-vect.h"
 
-typedef unsigned long PV;
+typedef unsigned long long PV;
 typedef struct _buff_t {
 int foo;
 PV Val;





docs: document early break support and pragma novector

2024-04-15 Thread Tamar Christina
docs: document early break support and pragma novector

---
diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 
b4c602a523717c1d64333e44aefb60ba0ed02e7a..aceecb86f17443cfae637e90987427b98c42f6eb
 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -200,6 +200,34 @@ a work-in-progress.
 for indicating parameters that are expected to be null-terminated
 strings.
   
+  
+The vectorizer now supports vectorizing loops which contain any number of 
early breaks.
+This means loops such as:
+
+   int z[100], y[100], x[100];
+   int foo (int n)
+   {
+ int res = 0;
+ for (int i = 0; i < n; i++)
+   {
+  y[i] = x[i] * 2;
+  res += x[i] + y[i];
+
+  if (x[i] > 5)
+break;
+
+  if (z[i] > 5)
+break;
+
+   }
+ return res;
+   }
+
+can now be vectorized on a number of targets.  In this first version any
+input data sources must either have a statically known size at compile time
+or the vectorizer must be able to determine based on auxiliary information
+that the accesses are aligned.
+  
 
 
 New Languages and Language specific improvements
@@ -231,6 +259,9 @@ a work-in-progress.
   previous options -std=c2x, -std=gnu2x
   and -Wc11-c2x-compat, which are deprecated but remain
   supported.
+  GCC supports a new pragma pragma GCC novector to
+  indicate to the vectorizer not to vectorize the loop annotated with the
+  pragma.
 
 
 C++
@@ -400,6 +431,9 @@ a work-in-progress.
   warnings are enabled for C++ as well
   The DR 2237 code no longer gives an error, it emits
   a -Wtemplate-id-cdtor warning instead
+  GCC supports a new pragma pragma GCC novector to
+  indicate to the vectorizer not to vectorize the loop annotated with the
+  pragma.
 
 
 Runtime Library (libstdc++)




-- 
diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index b4c602a523717c1d64333e44aefb60ba0ed02e7a..aceecb86f17443cfae637e90987427b98c42f6eb 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -200,6 +200,34 @@ a work-in-progress.
 for indicating parameters that are expected to be null-terminated
 strings.
   
+  
+The vectorizer now supports vectorizing loops which contain any number of early breaks.
+This means loops such as:
+
+	int z[100], y[100], x[100];
+	int foo (int n)
+	{
+	  int res = 0;
+	  for (int i = 0; i < n; i++)
+	{
+	   y[i] = x[i] * 2;
+	   res += x[i] + y[i];
+
+	   if (x[i] > 5)
+		 break;
+
+	   if (z[i] > 5)
+		 break;
+
+	}
+	  return res;
+	}
+
+can now be vectorized on a number of targets.  In this first version any
+input data sources must either have a statically known size at compile time
+or the vectorizer must be able to determine based on auxiliary information
+that the accesses are aligned.
+  
 
 
 New Languages and Language specific improvements
@@ -231,6 +259,9 @@ a work-in-progress.
   previous options -std=c2x, -std=gnu2x
   and -Wc11-c2x-compat, which are deprecated but remain
   supported.
+  GCC supports a new pragma pragma GCC novector to
+  indicate to the vectorizer not to vectorize the loop annotated with the
+  pragma.
 
 
 C++
@@ -400,6 +431,9 @@ a work-in-progress.
   warnings are enabled for C++ as well
   The DR 2237 code no longer gives an error, it emits
   a -Wtemplate-id-cdtor warning instead
+  GCC supports a new pragma pragma GCC novector to
+  indicate to the vectorizer not to vectorize the loop annotated with the
+  pragma.
 
 
 Runtime Library (libstdc++)





[gcc r11-11323] [AArch64]: Do not allow SIMD clones with simdlen 1 [PR113552]

2024-04-15 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:0c2fcf3ddfe93d1f403962c4bacbb5d55ab7d19d

commit r11-11323-g0c2fcf3ddfe93d1f403962c4bacbb5d55ab7d19d
Author: Tamar Christina 
Date:   Mon Apr 15 12:32:24 2024 +0100

[AArch64]: Do not allow SIMD clones with simdlen 1 [PR113552]

This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07.

The AArch64 vector PCS does not allow simd calls with simdlen 1,
however due to a bug we currently do allow it for num == 0.

This causes us to emit a symbol that doesn't exist and we fail to link.

gcc/ChangeLog:

PR tree-optimization/113552
* config/aarch64/aarch64.c
(aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1.

gcc/testsuite/ChangeLog:

PR tree-optimization/113552
* gcc.target/aarch64/pr113552.c: New test.
* gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check.

Diff:
---
 gcc/config/aarch64/aarch64.c   | 18 ++
 gcc/testsuite/gcc.target/aarch64/pr113552.c| 17 +
 .../gcc.target/aarch64/simd_pcs_attribute-3.c  |  4 ++--
 3 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 9bbbc5043af..4df72339952 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -25556,7 +25556,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
tree base_type, int num)
 {
   tree t, ret_type;
-  unsigned int elt_bits, count;
+  unsigned int elt_bits, count = 0;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
 
@@ -25624,11 +25624,20 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
   if (known_eq (clonei->simdlen, 0U))
 {
-  count = 2;
-  vec_bits = (num == 0 ? 64 : 128);
+  /* We don't support simdlen == 1.  */
+  if (known_eq (elt_bits, 64))
+   {
+ count = 1;
+ vec_bits = 128;
+   }
+  else
+   {
+ count = 2;
+ vec_bits = (num == 0 ? 64 : 128);
+   }
   clonei->simdlen = exact_div (vec_bits, elt_bits);
 }
-  else
+  else if (maybe_ne (clonei->simdlen, 1U))
 {
   count = 1;
   vec_bits = clonei->simdlen * elt_bits;
@@ -25643,6 +25652,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
  return 0;
}
 }
+
   clonei->vecsize_int = vec_bits;
   clonei->vecsize_float = vec_bits;
   return count;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c 
b/gcc/testsuite/gcc.target/aarch64/pr113552.c
new file mode 100644
index 000..9c96b061ed2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=armv8-a" } */
+
+__attribute__ ((__simd__ ("notinbranch"), const))
+double cos (double);
+
+void foo (float *a, double *b)
+{
+for (int i = 0; i < 12; i+=3)
+  {
+b[i] = cos (5.0 * a[i]);
+b[i+1] = cos (5.0 * a[i+1]);
+b[i+2] = cos (5.0 * a[i+2]);
+  }
+}
+
+/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c 
b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
index 95f6a6803e8..c6dac6b104c 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
@@ -18,7 +18,7 @@ double foo(double x)
 }
 
 /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */


[gcc r12-10329] AArch64: Do not allow SIMD clones with simdlen 1 [PR113552]

2024-04-15 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:642cfd049780f03335da9fe0a51415f130232334

commit r12-10329-g642cfd049780f03335da9fe0a51415f130232334
Author: Tamar Christina 
Date:   Mon Apr 15 12:16:53 2024 +0100

AArch64: Do not allow SIMD clones with simdlen 1 [PR113552]

This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07.

The AArch64 vector PCS does not allow simd calls with simdlen 1,
however due to a bug we currently do allow it for num == 0.

This causes us to emit a symbol that doesn't exist and we fail to link.

gcc/ChangeLog:

PR tree-optimization/113552
* config/aarch64/aarch64.cc
(aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1.

gcc/testsuite/ChangeLog:

PR tree-optimization/113552
* gcc.target/aarch64/pr113552.c: New test.
* gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check.

Diff:
---
 gcc/config/aarch64/aarch64.cc   | 16 +---
 gcc/testsuite/gcc.target/aarch64/pr113552.c | 17 +
 gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c |  4 ++--
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 2bbba323770..96976abdbf4 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26898,7 +26898,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
tree base_type, int num)
 {
   tree t, ret_type;
-  unsigned int elt_bits, count;
+  unsigned int elt_bits, count = 0;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
 
@@ -26966,8 +26966,17 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
   if (known_eq (clonei->simdlen, 0U))
 {
-  count = 2;
-  vec_bits = (num == 0 ? 64 : 128);
+  /* We don't support simdlen == 1.  */
+  if (known_eq (elt_bits, 64))
+   {
+ count = 1;
+ vec_bits = 128;
+   }
+  else
+   {
+ count = 2;
+ vec_bits = (num == 0 ? 64 : 128);
+   }
   clonei->simdlen = exact_div (vec_bits, elt_bits);
 }
   else
@@ -26985,6 +26994,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
  return 0;
}
 }
+
   clonei->vecsize_int = vec_bits;
   clonei->vecsize_float = vec_bits;
   return count;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c 
b/gcc/testsuite/gcc.target/aarch64/pr113552.c
new file mode 100644
index 000..9c96b061ed2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=armv8-a" } */
+
+__attribute__ ((__simd__ ("notinbranch"), const))
+double cos (double);
+
+void foo (float *a, double *b)
+{
+for (int i = 0; i < 12; i+=3)
+  {
+b[i] = cos (5.0 * a[i]);
+b[i+1] = cos (5.0 * a[i+1]);
+b[i+2] = cos (5.0 * a[i+2]);
+  }
+}
+
+/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c 
b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
index 95f6a6803e8..c6dac6b104c 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
@@ -18,7 +18,7 @@ double foo(double x)
 }
 
 /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */


[gcc r13-8604] AArch64: Do not allow SIMD clones with simdlen 1 [PR113552]

2024-04-15 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:1e08e39c743692afdd5d3546b2223474beac1dbc

commit r13-8604-g1e08e39c743692afdd5d3546b2223474beac1dbc
Author: Tamar Christina 
Date:   Mon Apr 15 12:11:48 2024 +0100

AArch64: Do not allow SIMD clones with simdlen 1 [PR113552]

This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07.

The AArch64 vector PCS does not allow simd calls with simdlen 1,
however due to a bug we currently do allow it for num == 0.

This causes us to emit a symbol that doesn't exist and we fail to link.

gcc/ChangeLog:

PR tree-optimization/113552
* config/aarch64/aarch64.cc
(aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1.

gcc/testsuite/ChangeLog:

PR tree-optimization/113552
* gcc.target/aarch64/pr113552.c: New test.
* gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check.

Diff:
---
 gcc/config/aarch64/aarch64.cc   | 16 +---
 gcc/testsuite/gcc.target/aarch64/pr113552.c | 17 +
 gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c |  4 ++--
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f6d14cd791a..b8a4ab1b980 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27029,7 +27029,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int elt_bits, count;
+  unsigned int elt_bits, count = 0;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
 
@@ -27102,8 +27102,17 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
   if (known_eq (clonei->simdlen, 0U))
 {
-  count = 2;
-  vec_bits = (num == 0 ? 64 : 128);
+  /* We don't support simdlen == 1.  */
+  if (known_eq (elt_bits, 64))
+   {
+ count = 1;
+ vec_bits = 128;
+   }
+  else
+   {
+ count = 2;
+ vec_bits = (num == 0 ? 64 : 128);
+   }
   clonei->simdlen = exact_div (vec_bits, elt_bits);
 }
   else
@@ -27123,6 +27132,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
  return 0;
}
 }
+
   clonei->vecsize_int = vec_bits;
   clonei->vecsize_float = vec_bits;
   return count;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c 
b/gcc/testsuite/gcc.target/aarch64/pr113552.c
new file mode 100644
index 000..9c96b061ed2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=armv8-a" } */
+
+__attribute__ ((__simd__ ("notinbranch"), const))
+double cos (double);
+
+void foo (float *a, double *b)
+{
+for (int i = 0; i < 12; i+=3)
+  {
+b[i] = cos (5.0 * a[i]);
+b[i+1] = cos (5.0 * a[i+1]);
+b[i+2] = cos (5.0 * a[i+2]);
+  }
+}
+
+/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c 
b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
index 95f6a6803e8..c6dac6b104c 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
@@ -18,7 +18,7 @@ double foo(double x)
 }
 
 /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */


[gcc r14-9969] middle-end: adjust loop upper bounds when peeling for gaps and early break [PR114403].

2024-04-15 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:85002f8085c25bb3e74ab013581a74e7c7ae006b

commit r14-9969-g85002f8085c25bb3e74ab013581a74e7c7ae006b
Author: Tamar Christina 
Date:   Mon Apr 15 12:06:21 2024 +0100

middle-end: adjust loop upper bounds when peeling for gaps and early break 
[PR114403].

This fixes a bug with the interaction between peeling for gaps and early 
break.

Before I go further, I'll first explain how I understand this to work for 
loops
with a single exit.

When peeling for gaps we peel N < VF iterations to scalar.
This happens by removing N iterations from the calculation of niters such 
that
vect_iters * VF == niters is always false.

In other words, when we exit the vector loop we always fall to the scalar 
loop.
The loop bounds adjustment guarantees this. Because of this we potentially
execute one fewer vector loop iteration.  That is, if you're at the boundary
condition where niters % VF == 0, peeling one or more scalar iterations makes
the vector loop execute one iteration less.

This is accounted for by the adjustments in vect_transform_loop.  This
adjustment happens differently based on whether the vector loop can be
partial or not:

Peeling for gaps sets the bias to 0 and then:

when not partial:  we take the floor of (scalar_upper_bound / VF) - 1 to get the
   vector latch iteration count.

when loop is partial:  For a single exit this means the loop is masked, we take
   the ceil to account for the fact that the loop can handle
   the final partial iteration using masking.

Note that there's no difference between ceil and floor on the boundary
condition.  There is a difference however when you're slightly above it,
i.e. if the scalar loop iterates 14 times and VF = 4 and we peel 1 iteration
for gaps.

The partial loop does ((13 + 0) / 4) - 1 == 2 vector iterations, and in effect
the partial iteration is ignored and it's done as scalar.

This is fine because the niters modification has capped the vector 
iteration at
2.  So that when we reduce the induction values you end up entering the 
scalar
code with ind_var.2 = ind_var.1 + 2 * VF.
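
As a back-of-the-envelope restatement of the arithmetic above (a standalone C
sketch, not vectorizer code; the variable names are made up for the example):

#include <stdio.h>
int main (void)
{
  /* 14 scalar iterations -> latch count 13, VF = 4, bias = 0 when
     peeling for gaps.  */
  unsigned scalar_upper_bound = 13, vf = 4, bias = 0;
  unsigned latch_bound = (scalar_upper_bound + bias) / vf - 1;
  printf ("vector latch bound = %u\n", latch_bound);  /* prints 2 */
  /* The vector loop is therefore capped at 2 iterations and the scalar
     code is entered with ind_var.2 = ind_var.1 + 2 * VF.  */
  return 0;
}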

Now let's look at early breaks.  To make it easier I'll focus on the specific
testcase:

char buffer[64];

__attribute__ ((noipa))
buff_t *copy (buff_t *first, buff_t *last)
{
  char *buffer_ptr = buffer;
  char *const buffer_end = &buffer[SZ-1];
  int store_size = sizeof(first->Val);
  while (first != last && (buffer_ptr + store_size) <= buffer_end)
{
  const char *value_data = (const char *)(&first->Val);
  __builtin_memcpy(buffer_ptr, value_data, store_size);
  buffer_ptr += store_size;
  ++first;
}

  if (first == last)
return 0;

  return first;
}

Here the first, early exit is on the condition:

  (buffer_ptr + store_size) <= buffer_end

and the main exit is on condition:

  first != last

This is important, as this bug only manifests itself when the first exit 
has a
known constant iteration count that's lower than the latch exit count.

Because buffer holds 64 bytes, and VF = 4, unroll = 2, we end up processing 16
bytes per iteration.  So the exit has a known bound of 8 + 1.

The vectorizer correctly analyzes this:

Statement (exit)if (ivtmp_21 != 0)
 is executed at most 8 (bounded by 8) + 1 times in loop 1.

and as a consequence the IV is bound by 9:

  # vect_vec_iv_.14_117 = PHI <_118(9), { 9, 8, 7, 6 }(20)>
  ...
  vect_ivtmp_21.16_124 = vect_vec_iv_.14_117 + { 18446744073709551615, 
18446744073709551615, 18446744073709551615, 18446744073709551615 };
  mask_patt_22.17_126 = vect_ivtmp_21.16_124 != { 0, 0, 0, 0 };
  if (mask_patt_22.17_126 == { -1, -1, -1, -1 })
goto ; [88.89%]
  else
goto ; [11.11%]

The important bits are these:

In this example the value of last - first = 416.

the calculated vector iteration count, is:

x = (((ptr2 - ptr1) - 16) / 16) + 1 = 27

the bounds generated, adjusting for gaps:

   x == (((x - 1) >> 2) << 2)

which means we'll always fall through to the scalar code, as intended.
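
A quick standalone check of that bound (illustrative only; the shift-based
form mirrors the expression above for VF = 4):

#include <stdio.h>
int main (void)
{
  unsigned x = 27;                         /* calculated vector iteration count */
  unsigned adjusted = ((x - 1) >> 2) << 2; /* gap-adjusted bound: 24 */
  printf ("%u vs %u\n", x, adjusted);      /* prints "27 vs 24" */
  /* The two never compare equal here, so the vector loop never covers all
     iterations and we always fall through to the scalar code.  */
  return 0;
}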

Here are two key things to note:

1. In this loop, the early exit will always be the one taken.  When it's taken
   we enter the scalar loop with the correct induction value to apply the gap
   peeling.

2. If the main exit is taken, the induction values assume you've finished all
   vector iterations.  i.e. it assumes you have completed 24 iterations, as we
   treat the main exit the same for normal loop vect and early break when not
   PEELED.
   This means the induction value is adjusted to ind_var.2 = ind_var.1 + 24 * VF;

[PATCH]middle-end: adjust loop upper bounds when peeling for gaps and early break [PR114403].

2024-04-12 Thread Tamar Christina
Hi All,

This is a story all about how the peeling for gaps introduces a bug in the upper
bounds.

Before I go further, I'll first explain how I understand this to work for loops
with a single exit.

When peeling for gaps we peel N < VF iterations to scalar.
This happens by removing N iterations from the calculation of niters such that
vect_iters * VF == niters is always false.

In other words, when we exit the vector loop we always fall to the scalar loop.
The loop bounds adjustment guarantees this. Because of this we potentially
execute one fewer vector loop iteration.  That is, if you're at the boundary
condition where niters % VF == 0, peeling one or more scalar iterations makes
the vector loop execute one iteration less.

This is accounted for by the adjustments in vect_transform_loop.  This
adjustment happens differently based on whether the vector loop can be
partial or not:

Peeling for gaps sets the bias to 0 and then:

when not partial:  we take the floor of (scalar_upper_bound / VF) - 1 to get the
   vector latch iteration count.

when loop is partial:  For a single exit this means the loop is masked, we take
   the ceil to account for the fact that the loop can handle
   the final partial iteration using masking.

Note that there's no difference between ceil and floor on the boundary
condition.  There is a difference however when you're slightly above it,
i.e. if the scalar loop iterates 14 times and VF = 4 and we peel 1 iteration
for gaps.

The partial loop does ((13 + 0) / 4) - 1 == 2 vector iterations, and in effect
the partial iteration is ignored and it's done as scalar.

This is fine because the niters modification has capped the vector iteration at
2.  So that when we reduce the induction values you end up entering the scalar
code with ind_var.2 = ind_var.1 + 2 * VF.

Now let's look at early breaks.  To make it easier I'll focus on the specific
testcase:

char buffer[64];

__attribute__ ((noipa))
buff_t *copy (buff_t *first, buff_t *last)
{
  char *buffer_ptr = buffer;
  char *const buffer_end = &buffer[SZ-1];
  int store_size = sizeof(first->Val);
  while (first != last && (buffer_ptr + store_size) <= buffer_end)
{
  const char *value_data = (const char *)(&first->Val);
  __builtin_memcpy(buffer_ptr, value_data, store_size);
  buffer_ptr += store_size;
  ++first;
}

  if (first == last)
return 0;

  return first;
}

Here the first, early exit is on the condition:

  (buffer_ptr + store_size) <= buffer_end

and the main exit is on condition:

  first != last

This is important, as this bug only manifests itself when the first exit has a
known constant iteration count that's lower than the latch exit count.

Because buffer holds 64 bytes, and VF = 4, unroll = 2, we end up processing 16
bytes per iteration.  So the exit has a known bound of 8 + 1.

The vectorizer correctly analyzes this:

Statement (exit)if (ivtmp_21 != 0)
 is executed at most 8 (bounded by 8) + 1 times in loop 1.

and as a consequence the IV is bound by 9:

  # vect_vec_iv_.14_117 = PHI <_118(9), { 9, 8, 7, 6 }(20)>
  ...
  vect_ivtmp_21.16_124 = vect_vec_iv_.14_117 + { 18446744073709551615, 
18446744073709551615, 18446744073709551615, 18446744073709551615 };
  mask_patt_22.17_126 = vect_ivtmp_21.16_124 != { 0, 0, 0, 0 };
  if (mask_patt_22.17_126 == { -1, -1, -1, -1 })
goto ; [88.89%]
  else
goto ; [11.11%]

The important bits are these:

In this example the value of last - first = 416.

the calculated vector iteration count, is:

x = (((ptr2 - ptr1) - 16) / 16) + 1 = 27

the bounds generated, adjusting for gaps:

   x == (((x - 1) >> 2) << 2)

which means we'll always fall through to the scalar code, as intended.

Here are two key things to note:

1. In this loop, the early exit will always be the one taken.  When it's taken
   we enter the scalar loop with the correct induction value to apply the gap
   peeling.

2. If the main exit is taken, the induction values assume you've finished all
   vector iterations.  i.e. it assumes you have completed 24 iterations, as we
   treat the main exit the same for normal loop vect and early break when not
   PEELED.
   This means the induction value is adjusted to ind_var.2 = ind_var.1 + 24 * 
VF;

So what's going wrong?  The vectorizer's codegen is correct and efficient;
however, when we adjust the upper bounds, that code believes the loop's upper
bound is based on the early exit, i.e. 8 latch iterations.  In other words,
it thinks the loop iterates once.

This is incorrect, as the vector loop iterates twice: it has set up the
induction value such that it exits at the early exit.  So in effect it
iterates 2.5 times.

Because the upper bound is incorrect, when we unroll it now exits from the main
exit, which uses the incorrect induction value.

So there are three ways to fix this:

1.  If we take the position that the main exit should support both premature
exits and final exits then vect_update_ivs_after_vectorizer 

[PATCH]middle-end vect: adjust loop upper bounds when peeling for gaps and early break [PR114403]

2024-04-04 Thread Tamar Christina
Hi All,

The report shows that we end up in a situation where the code has been peeled
for gaps and we have an early break.

The code for peeling for gaps assumes that a scalar loop needs to perform at
least one iteration.  However this doesn't take into account early breaks,
where the scalar loop may not need to be executed.

That the early break loop can be partial is not accounted for in this scenario.
Loop partiality is normally handled by setting bias_for_lowest to 1, but when
peeling for gaps we end up with 0, which, when the loop upper bounds are
calculated, means that a partial loop loses the final partial iteration:

Analyzing # of iterations of loop 1
  exit condition [8, + , 18446744073709551615] != 0
  bounds on difference of bases: -8 ... -8
  result:
# of iterations 8, bounded by 8

and a VF=4 calculating:

Loop 1 iterates at most 1 times.
Loop 1 likely iterates at most 1 times.
Analyzing # of iterations of loop 1
  exit condition [1, + , 1](no_overflow) < bnd.5505_39
  bounds on difference of bases: 0 ... 4611686018427387902
Matching expression match.pd:2011, generic-match-8.cc:27
Applying pattern match.pd:2067, generic-match-1.cc:4813
  result:
# of iterations bnd.5505_39 + 18446744073709551615, bounded by 
4611686018427387902
Estimating sizes for loop 1
...
   Induction variable computation will be folded away.
  size:   2 if (ivtmp_312 < bnd.5505_39)
   Exit condition will be eliminated in last copy.
size: 24-3, last_iteration: 24-5
  Loop size: 24
  Estimated size after unrolling: 26
;; Guessed iterations of loop 1 is 0.858446. New upper bound 1.

upper bound should be 2 not 1.
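
A back-of-the-envelope restatement of that calculation (a sketch assuming the
ceil-based partial-loop bound described earlier in this thread; not vectorizer
code):

#include <stdio.h>
int main (void)
{
  unsigned latch = 8, vf = 4;
  for (unsigned bias = 0; bias <= 1; bias++)
    /* ceil ((latch + bias) / vf) - 1  */
    printf ("bias_for_lowest %u -> upper bound %u\n",
            bias, (latch + bias + vf - 1) / vf - 1);
  return 0;
}
/* Prints 1 for bias 0 (the buggy value) and 2 for bias 1 (the required one).  */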

This patch forces bias_for_lowest to be 1 even when peeling for gaps.

I have however not been able to write a standalone reproducer for this, so I
have no tests, but bootstrap and an LLVM build are fine now.

The testcase:

#define COUNT 9
#define SIZE COUNT * 4
#define TYPE unsigned long

TYPE x[SIZE], y[SIZE];

void __attribute__((noipa))
loop (TYPE val)
{
  for (int i = 0; i < COUNT; ++i)
{
  if (x[i * 4] > val || x[i * 4 + 1] > val)
return;
  x[i * 4] = y[i * 2] + 1;
  x[i * 4 + 1] = y[i * 2] + 2;
  x[i * 4 + 2] = y[i * 2 + 1] + 3;
  x[i * 4 + 3] = y[i * 2 + 1] + 4;
}
}

does perform the peeling for gaps and early break, however it creates a hybrid
loop which works fine.  Adjusting the indices to be non-linear also works.  So
I'd like to submit the fix and work on a testcase separately if needed.

Bootstrapped Regtested on x86_64-pc-linux-gnu no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/114403
* tree-vect-loop.cc (vect_transform_loop): Adjust upper bounds for when
peeling for gaps and early break.

---
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
4375ebdcb493a90fd0501cbb4b07466077b525c3..bf1bb9b005c68fbb13ee1b1279424865b237245a
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -12139,7 +12139,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
*loop_vectorized_call)
   /* The minimum number of iterations performed by the epilogue.  This
  is 1 when peeling for gaps because we always need a final scalar
  iteration.  */
-  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
+  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+  && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo) ? 1 : 0;
   /* +1 to convert latch counts to loop iteration counts,
  -min_epilogue_iters to remove iterations that cannot be performed
by the vector code.  */




-- 
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 4375ebdcb493a90fd0501cbb4b07466077b525c3..bf1bb9b005c68fbb13ee1b1279424865b237245a 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -12139,7 +12139,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   /* The minimum number of iterations performed by the epilogue.  This
  is 1 when peeling for gaps because we always need a final scalar
  iteration.  */
-  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
+  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+			   && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo) ? 1 : 0;
   /* +1 to convert latch counts to loop iteration counts,
  -min_epilogue_iters to remove iterations that cannot be performed
by the vector code.  */





[gcc r14-9493] match.pd: Only merge truncation with conversion for -fno-signed-zeros

2024-03-15 Thread Tamar Christina via Gcc-cvs
https://gcc.gnu.org/g:7dd3b2b09cbeb6712ec680a0445cb0ad41070423

commit r14-9493-g7dd3b2b09cbeb6712ec680a0445cb0ad41070423
Author: Joe Ramsay 
Date:   Fri Mar 15 09:20:45 2024 +

match.pd: Only merge truncation with conversion for -fno-signed-zeros

This optimisation does not honour signed zeros, so should not be
enabled except with -fno-signed-zeros.
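
For reference, a small standalone example (not part of the patch) showing the
signed-zero difference this guard protects against; link with -lm if trunc is
not folded:

#include <math.h>
#include <stdio.h>

int main (void)
{
  double x = -0.25;
  double via_int = (double) (long) x;  /* (long) -0.25 == 0, so +0.0  */
  double via_trunc = trunc (x);        /* trunc (-0.25) == -0.0       */
  printf ("signbit: %d vs %d\n",
          signbit (via_int) != 0, signbit (via_trunc) != 0);
  /* Prints "signbit: 0 vs 1": merging the truncation into the conversion
     changes the sign of zero, so it is only valid with -fno-signed-zeros.  */
  return 0;
}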

gcc/ChangeLog:

* match.pd: Fix truncation pattern for -fno-signed-zeros.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/no_merge_trunc_signed_zero.c: New test.

Diff:
---
 gcc/match.pd   |  1 +
 .../aarch64/no_merge_trunc_signed_zero.c   | 24 ++
 2 files changed, 25 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index 9ce313323a3..15a1e7350d4 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4858,6 +4858,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (simplify
(float (fix_trunc @0))
(if (!flag_trapping_math
+   && !HONOR_SIGNED_ZEROS (type)
&& types_match (type, TREE_TYPE (@0))
&& direct_internal_fn_supported_p (IFN_TRUNC, type,
  OPTIMIZE_FOR_BOTH))
diff --git a/gcc/testsuite/gcc.target/aarch64/no_merge_trunc_signed_zero.c 
b/gcc/testsuite/gcc.target/aarch64/no_merge_trunc_signed_zero.c
new file mode 100644
index 000..b2c93e55567
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/no_merge_trunc_signed_zero.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-trapping-math -fsigned-zeros" } */
+
+#include 
+
+float
+f1 (float x)
+{
+  return (int) rintf(x);
+}
+
+double
+f2 (double x)
+{
+  return (long) rint(x);
+}
+
+/* { dg-final { scan-assembler "frintx\\ts\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "cvtzs\\ts\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "scvtf\\ts\[0-9\]+, s\[0-9\]+" } } */
+/* { dg-final { scan-assembler "frintx\\td\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "cvtzs\\td\[0-9\]+, d\[0-9\]+" } } */
+/* { dg-final { scan-assembler "scvtf\\td\[0-9\]+, d\[0-9\]+" } } */
+


Summary: [PATCH][committed]AArch64: Do not allow SIMD clones with simdlen 1 [PR113552][GCC 13/12/11 backport]

2024-03-12 Thread Tamar Christina
Hi All,

This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07.

The AArch64 vector PCS does not allow simd calls with simdlen 1,
however due to a bug we currently do allow it for num == 0.

This causes us to emit a symbol that doesn't exist and we fail to link.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Committed to GCC 13,12,11 branches as previously approved.

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113552
* config/aarch64/aarch64.cc
(aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1.

gcc/testsuite/ChangeLog:

PR tree-optimization/113552
* gcc.target/aarch64/pr113552.c: New test.
* gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check.

---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
f546c48ae2d2bad2e34c6b72e5e3e30aba3c3bd6..d19a9c16cc97ae75afd4e29f4339d65d39cfb73a
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27027,7 +27027,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int elt_bits, count;
+  unsigned int elt_bits, count = 0;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
 
@@ -27104,7 +27104,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
   vec_bits = (num == 0 ? 64 : 128);
   clonei->simdlen = exact_div (vec_bits, elt_bits);
 }
-  else
+  else if (maybe_ne (clonei->simdlen, 1U))
 {
   count = 1;
   vec_bits = clonei->simdlen * elt_bits;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c 
b/gcc/testsuite/gcc.target/aarch64/pr113552.c
new file mode 100644
index 
..9c96b061ed2b4fcc57e58925277f74d14f79c51f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=armv8-a" } */
+
+__attribute__ ((__simd__ ("notinbranch"), const))
+double cos (double);
+
+void foo (float *a, double *b)
+{
+for (int i = 0; i < 12; i+=3)
+  {
+b[i] = cos (5.0 * a[i]);
+b[i+1] = cos (5.0 * a[i+1]);
+b[i+2] = cos (5.0 * a[i+2]);
+  }
+}
+
+/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c 
b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
index 
95f6a6803e889c02177ef10972962ed62d2095eb..c6dac6b104c94c9de89ed88dc5a73e185d2be125
 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
@@ -18,7 +18,7 @@ double foo(double x)
 }
 
 /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */
-/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */
+/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */
 /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */




-- 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f546c48ae2d2bad2e34c6b72e5e3e30aba3c3bd6..d19a9c16cc97ae75afd4e29f4339d65d39cfb73a 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27027,7 +27027,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
 	bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int elt_bits, count;
+  unsigned int elt_bits, count = 0;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
 
@@ -27104,7 +27104,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
   vec_bits = (num == 0 ? 64 : 128);
   clonei->simdlen = exact_div (vec_bits, elt_bits);
 }
-  else
+  else if (maybe_ne (clonei->simdlen, 1U))
 {
   count = 1;
   vec_bits = clonei->simdlen * elt_bits;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c b/gcc/testsuite/gcc.target/aarch64/pr113552.c
new file mode 100644
index ..9c96b061ed2b4fcc57e58925277f74d14f79c51f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=armv8-a" } */
+
+__attribute__ ((__simd__ ("notinbranch"), const))
+double cos (double);
+
+void foo (float *a, double *b)
+{
+for (int i = 0; i < 12; i+=3)
+  {
+b[i] = cos (5.0 * a[i]);
+b[i+1] = cos (5.0 * a[i+1]);
+b[i+2] = cos (5.0 * a[i+2]);
+  }
+}
+
+/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c
index 

RE: [PATCH] vect: Do not peel epilogue for partial vectors [PR114196].

2024-03-07 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Thursday, March 7, 2024 8:47 AM
> To: Robin Dapp 
> Cc: gcc-patches ; Tamar Christina
> 
> Subject: Re: [PATCH] vect: Do not peel epilogue for partial vectors 
> [PR114196].
> 
> On Wed, Mar 6, 2024 at 9:21 PM Robin Dapp  wrote:
> >
> > Hi,
> >
> > r14-7036-gcbf569486b2dec added an epilogue vectorization guard for early
> > break but PR114196 shows that we also run into the problem without early
> > break.  Therefore remove early break from the conditions.
> >
> > gcc/ChangeLog:
> >
> > PR middle-end/114196
> >
> > * tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Remove
> > early break check from guards.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/pr114196.c: New test.
> > * gcc.target/riscv/rvv/autovec/pr114196.c: New test.
> > ---
> >  gcc/testsuite/gcc.target/aarch64/pr114196.c   | 19 +++
> >  .../gcc.target/riscv/rvv/autovec/pr114196.c   | 19 +++
> >  gcc/tree-vect-loop-manip.cc   |  6 +++---
> >  3 files changed, 41 insertions(+), 3 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr114196.c
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c
> >
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr114196.c
> b/gcc/testsuite/gcc.target/aarch64/pr114196.c
> > new file mode 100644
> > index 000..15e4b0e31b8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr114196.c
> > @@ -0,0 +1,19 @@
> > +/* { dg-do compile } */
> > +/* { dg-options { -O3 -fno-vect-cost-model -march=armv9-a -msve-vector-
> bits=256 } } */
> > +
> > +unsigned a;
> > +int b;
> > +long *c;
> > +
> > +int
> > +main ()
> > +{
> > +  for (int d = 0; d < 22; d += 4) {
> > +  b = ({
> > +   int e = c[d];
> > +   e;
> > +   })
> > +  ? 0 : -c[d];
> > +  a *= 3;
> > +  }
> > +}
> > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c
> > new file mode 100644
> > index 000..7ba9cbbed70
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c
> > @@ -0,0 +1,19 @@
> > +/* { dg-do compile } */
> > +/* { dg-options { -O3 -fno-vect-cost-model -march=rv64gcv_zvl256b -
> mabi=lp64d -mrvv-vector-bits=zvl } } */
> > +
> > +unsigned a;
> > +int b;
> > +long *c;
> > +
> > +int
> > +main ()
> > +{
> > +  for (int d = 0; d < 22; d += 4) {
> > +  b = ({
> > +   int e = c[d];
> > +   e;
> > +   })
> > +  ? 0 : -c[d];
> > +  a *= 3;
> > +  }
> > +}
> > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> > index f72da915103..c3cd20eef70 100644
> > --- a/gcc/tree-vect-loop-manip.cc
> > +++ b/gcc/tree-vect-loop-manip.cc
> > @@ -2183,9 +2183,9 @@ vect_can_peel_nonlinear_iv_p (loop_vec_info
> loop_vinfo,
> >   perform the peeling.  The below condition mirrors that of
> >   vect_gen_vector_loop_niters  where niters_vector_mult_vf_var then sets
> >   step_vector to VF rather than 1.  This is what creates the nonlinear
> > - IV.  PR113163.  */
> > -  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > -  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
> > + IV.  PR113163.
> > + This also happens without early breaks, see PR114196.  */
> 
> Can you instead reword to not mention early breaks, maybe instead
> say PR113163 (with early breaks), PR114196 (without)?
> 
> The dump message also needs adjustments, it mentions early breaks as
> well.
> 
> The comment says it matches a condition in vect_gen_vector_loop_niters
> but I can't see what that means ... Tamar?
> 

The comment was trying to say that this case is when you manage to get here:
https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2847

because that makes you fall into 
https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L3528
 which creates the nonlinear IV variable.

The vect_step_op_neg exception is because vect_update_ivs_after_vectorizer can 
deal with that case specifically
https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2398

which is what the previous check is also explaining 
https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2133

If this also happens for non-early breaks it's just better to merge the check 
into the earlier one at 
github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2133

Tamar

> > +  if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()
> >&& LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
> >&& induction_type != vect_step_op_neg)
> >  {
> > --
> > 2.43.2


RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

2024-02-27 Thread Tamar Christina
> Thanks Tamar.
> 
> > Those two cases also *completely* stop vectorization because of either the
> > control flow or the fact the vectorizer can't handle complex types.
> 
> Yes, we eventually would like to vectorize the SAT ALU but we start with 
> scalar part
> first.
> I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> well with some additions as below.
> Feel free to correct me if any misunderstandings.
> 
> 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> usadd$a3(as well as gen_int_libfunc) for int.
> 2. We need to implement a default implementation of SAT_ADD if
> direct_binary_optab_supported_p is false.
> It looks like the default implementation is difficult to make every 
> backend happy.
> That is why you suggest just normal
> DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> 
> Thanks Richard.
> 
> > But what I'd like to see is that we do more instruction selection on GIMPLE
> > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > passes doing what I'd call instruction selection).  But that means not 
> > adding
> > match.pd patterns for that or at least have a separate isel-match.pd
> > machinery for that.
> 
> > So as a start I would go for a direct optab and see to recognize it during
> > ISEL?
> 
> Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> to know isel and I am happy to
> try that once we have conclusion.
> 

So after a lively discussion on IRC, the conclusion is that before we proceed 
Richi would
like to see some examples of various operations.  The problem is that unsigned 
saturating
addition is the simplest example and it may lead to an implementation strategy 
that doesn't
scale.

So I'd suggest writing some examples of both signed and unsigned saturating add
and multiply, because signed addition will likely require a branch and signed
multiplication would require a larger type.

This would allow us to better understand what kind of gimple we would have to
deal with in ISEL and VECT if we decide not to lower early.
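
For concreteness, a few standalone scalar sketches of the kind of examples
meant here (assuming 32-bit types; these are illustrations only, not the
proposed canonical GIMPLE forms):

#include <stdint.h>

/* Unsigned saturating add: branchless, (x + y) | -(x + y < x).  */
uint32_t sat_addu (uint32_t x, uint32_t y)
{
  uint32_t sum = x + y;
  return sum | -(uint32_t) (sum < x);
}

/* Signed saturating add: needs overflow detection and a select/branch.  */
int32_t sat_adds (int32_t x, int32_t y)
{
  int32_t sum;
  if (__builtin_add_overflow (x, y, &sum))
    return x < 0 ? INT32_MIN : INT32_MAX;
  return sum;
}

/* Signed saturating multiply: widen to a larger type, then clamp.  */
int32_t sat_muls (int32_t x, int32_t y)
{
  int64_t prod = (int64_t) x * y;
  if (prod > INT32_MAX)
    return INT32_MAX;
  if (prod < INT32_MIN)
    return INT32_MIN;
  return (int32_t) prod;
}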

Thanks,
Tamar

> Pan
> 
> -Original Message-
> From: Tamar Christina 
> Sent: Tuesday, February 27, 2024 5:57 PM
> To: Richard Biener 
> Cc: Li, Pan2 ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai;
> Wang, Yanzhang ; kito.ch...@gmail.com;
> richard.sandiford@arm.com2; jeffreya...@gmail.com
> Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> US_PLUS
> 
> > -Original Message-
> > From: Richard Biener 
> > Sent: Tuesday, February 27, 2024 9:44 AM
> > To: Tamar Christina 
> > Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai;
> > yanzhang.w...@intel.com; kito.ch...@gmail.com;
> > richard.sandiford@arm.com2; jeffreya...@gmail.com
> > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > US_PLUS
> >
> > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> >  wrote:
> > >
> > > Hi Pan,
> > >
> > > > From: Pan Li 
> > > >
> > > > Hi Richard & Tamar,
> > > >
> > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > >
> > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > >
> > >
> > > I think a couple of things are being confused here.  So lets break it 
> > > down:
> > >
> > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > >
> > > Now the downside of this is that this is a direct internal optab.  This 
> > > means
> > > that for the representation to be used the target *must* have the optab
> > > implemented.   This is a bit annoying because it doesn't allow us to 
> > > generically
> > > assume that all targets use SAT_ADD for saturating add and thus only have 
> > > to
> > > write optimization for this representation.
> > >
> > > This is why Richi said we may need to use a new tree_code because we c

RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU

2024-02-27 Thread Tamar Christina
> Am 19.02.24 um 08:36 schrieb Richard Biener:
> > On Sat, Feb 17, 2024 at 11:30 AM  wrote:
> >>
> >> From: Pan Li 
> >>
> >> This patch would like to add the middle-end presentation for the
> >> unsigned saturation add.  Aka set the result of add to the max
> >> when overflow.  It will take the pattern similar as below.
> >>
> >> SAT_ADDU (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> 
> Does this even try to work out the costs?
> 
> For example, with the following example
> 
> 
> #define T __UINT16_TYPE__
> 
> T sat_add1 (T x, T y)
> {
>return (x + y) | (- (T)((T)(x + y) < x));
> }
> 
> T sat_add2 (T x, T y)
> {
>  T z = x + y;
>  if (z < x)
>  z = (T) -1;
>  return z;
> }
> 
> And then "avr-gcc -S -Os -dp" the code is
> 
> 
> sat_add1:
>   add r22,r24  ;  7   [c=8 l=2]  *addhi3/0
>   adc r23,r25
>   ldi r18,lo8(1)   ;  8   [c=4 l=2]  *movhi/4
>   ldi r19,0
>   cp r22,r24   ;  9   [c=8 l=2]  cmphi3/2
>   cpc r23,r25
>   brlo .L2 ;  10  [c=16 l=1]  branch
>   ldi r19,0;  31  [c=4 l=1]  movqi_insn/0
>   ldi r18,0;  32  [c=4 l=1]  movqi_insn/0
> .L2:
>   clr r24  ;  13  [c=12 l=4]  neghi2/1
>   clr r25
>   sub r24,r18
>   sbc r25,r19
>   or r24,r22   ;  29  [c=4 l=1]  iorqi3/0
>   or r25,r23   ;  30  [c=4 l=1]  iorqi3/0
>   ret  ;  35  [c=0 l=1]  return
> 
> sat_add2:
>   add r22,r24  ;  8   [c=8 l=2]  *addhi3/0
>   adc r23,r25
>   cp r22,r24   ;  9   [c=8 l=2]  cmphi3/2
>   cpc r23,r25
>   brsh .L3 ;  10  [c=16 l=1]  branch
>   ldi r22,lo8(-1)  ;  5   [c=4 l=2]  *movhi/4
>   ldi r23,lo8(-1)
> .L3:
>   mov r25,r23  ;  21  [c=4 l=1]  movqi_insn/0
>   mov r24,r22  ;  22  [c=4 l=1]  movqi_insn/0
>   ret  ;  25  [c=0 l=1]  return
> 
> i.e. the conditional jump is better than overly smart arithmetic
> (smaller and faster code with less register pressure).
> With larger types the difference is even more pronounced.
> 

*on AVR. https://godbolt.org/z/7jaExbTa8  shows the branchless code is better.
And the branchy code will vectorize worse if at all 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51492

But looking at that output it just seems like it's your expansion that's 
inefficient.

But fair point, perhaps it should be just a normal DEF_INTERNAL_SIGNED_OPTAB_FN 
so that we
provide the additional optimization only for targets that want it.

Tamar

> >> Take uint8_t as example, we will have:
> >>
> >> * SAT_ADDU (1, 254)   => 255.
> >> * SAT_ADDU (1, 255)   => 255.
> >> * SAT_ADDU (2, 255)   => 255.
> >> * SAT_ADDU (255, 255) => 255.
> >>
> >> The patch also implement the SAT_ADDU in the riscv backend as
> >> the sample.  Given below example:
> >>
> >> uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> >> {
> >>return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> >> }
> >>
> >> Before this patch:
> >>
> >> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> >> {
> >>long unsigned int _1;
> >>_Bool _2;
> >>long unsigned int _3;
> >>long unsigned int _4;
> >>uint64_t _7;
> >>long unsigned int _10;
> >>__complex__ long unsigned int _11;
> >>
> >> ;;   basic block 2, loop depth 0
> >> ;;pred:   ENTRY
> >>_11 = .ADD_OVERFLOW (x_5(D), y_6(D));
> >>_1 = REALPART_EXPR <_11>;
> >>_10 = IMAGPART_EXPR <_11>;
> >>_2 = _10 != 0;
> >>_3 = (long unsigned int) _2;
> >>_4 = -_3;
> >>_7 = _1 | _4;
> >>return _7;
> >> ;;succ:   EXIT
> >>
> >> }
> >>
> >> After this patch:
> >>
> >> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> >> {
> >>uint64_t _7;
> >>
> >> ;;   basic block 2, loop depth 0
> >> ;;pred:   ENTRY
> >>_7 = .SAT_ADDU (x_5(D), y_6(D)); [tail call]
> >>return _7;
> >> ;;succ:   EXIT
> >>
> >> }
> >>
> >> Then we will have the middle-end representation like .SAT_ADDU after
> >> this patch.
> >
> > I'll note that on RTL we already have SS_PLUS/US_PLUS and friends and
> > the corresponding ssadd/usadd optabs.  There's not much documentation
> > unfortunately besides the use of gen_*_fixed_libfunc usage where the comment
> > suggests this is used for fixed-point operations.  It looks like arm uses
> > fractional/accumulator modes for this but for example bfin has ssaddsi3.
> >
> > So the question is whether the fixed-point case can be distinguished from
> > the integer case based on mode.
> >
> > There's also FIXED_POINT_TYPE on the GENERIC/GIMPLE side and
> > no special tree operator codes for them.  So compared to what appears
> > to be the case on RTL we'd need a way to represent saturating integer
> > operations on GIMPLE.
> >
> > The natural thing is to use direct optab internal functions (that's what you
> > basically did, but you added a new optab, IMO without good reason).
> > More 

RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

2024-02-27 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, February 27, 2024 9:44 AM
> To: Tamar Christina 
> Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai;
> yanzhang.w...@intel.com; kito.ch...@gmail.com;
> richard.sandiford@arm.com2; jeffreya...@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> US_PLUS
> 
> On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
>  wrote:
> >
> > Hi Pan,
> >
> > > From: Pan Li 
> > >
> > > Hi Richard & Tamar,
> > >
> > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > >
> > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > >
> >
> > I think a couple of things are being confused here.  So lets break it down:
> >
> > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > we only want one internal function for both signed and unsigned SAT_ADD.
> > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> >
> > Now the downside of this is that this is a direct internal optab.  This 
> > means
> > that for the representation to be used the target *must* have the optab
> > implemented.   This is a bit annoying because it doesn't allow us to 
> > generically
> > assume that all targets use SAT_ADD for saturating add and thus only have to
> > write optimization for this representation.
> >
> > This is why Richi said we may need to use a new tree_code because we can
> > override tree code expansions.  However the same can be done with the 
> > _EXT_FN
> > internal functions.
> >
> > So what I meant was that we want to have a combination of the two. i.e. a
> > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> 
> Whether we want/need _EXT or only direct depends mainly on how we want to
> leverage support.  If it's only during vectorization and possibly instruction
> selection a direct optab is IMO the way to go.  Generic optimization only
> marginally improves when you explode the number of basic operations you
> expose - in fact it gets quite unwieldly to support all of them in
> simplifications
> and/or canonicalization and you possibly need to translate them back to what
> the target CPU supports.
> 
> We already do have too many (IMO) "special" operations exposed "early"
> in the GIMPLE pipeline.
> 
> But what I'd like to see is that we do more instruction selection on GIMPLE
> but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> passes doing what I'd call instruction selection).  But that means not adding
> match.pd patterns for that or at least have a separate isel-match.pd
> machinery for that.
> 
> So as a start I would go for a direct optab and see to recognize it during
> ISEL?
> 

The problem with ISEL and the reason I suggested an indirect IFN is that there
are benefits to be had from recognizing it early.  Saturating arithmetic can be
optimized differently from non-saturating arithmetic.

But additionally a common way of specifying them decomposes to branches
and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic).

These two representations can be detected in PHI-opts and it's beneficial to all
targets to canonicalize them to the branchless code.

Those two cases also *completely* stop vectorization because of either the
control flow or the fact the vectorizer can't handle complex types.

So really, gimple ISEL would fix just 1 of the 3 very common cases, and then
we'd still need to hack the vectorizer cost models for targets with saturating
vector instructions.

I of course defer to you, but it seems quite suboptimal to do it this way, and
it doesn't get us first-class saturation support.

Additionally there have been discussions about whether both clang and gcc should
provide __builtin_saturate_* methods, which the non-direct IFN would help
support.

Tamar.

> > If Richi agrees, the below is what I meant. It creates the infrastructure 
> > for this
> > and for now only allows a default fallback for unsigned saturating add and 
> > makes
> > it easier for us to add the rest later
> >
> > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus 
> > are
> the
> > RTL expressi

RE: [PATCH]middle-end: delay updating of dominators until later during vectorization. [PR114081]

2024-02-26 Thread Tamar Christina
> > The testcase shows an interesting case where we have multiple loops sharing 
> > a
> > live value and have an early exit that go to the same location.  The 
> > additional
> > complication is that on x86_64 with -mavx we seem to also do prologue 
> > peeling
> > on the loops.
> >
> > We correctly identify which BB we need their dominators updated for, but we 
> > do
> > so too early.
> >
> > Instead of adding more dominator update we can solve this by for the cases 
> > with
> > multiple exits not to verify dominators at the end of peeling if peeling for
> > vectorization.
> >
> > We can then perform the final dominator updates just before vectorization 
> > when
> > all loop transformations are done.
> 
> What's the actual CFG transform that happens between the old and the new
> place?  I see a possible edge splitting but where is the one that makes
> this patch work?

It's not one but two.
1. loop 1 is prologue peeled. This ICEs because the dominator update is only 
happening
for epilogue peeling.  Note that loop 1 here dominates 21 and the ICE is:

ice.c: In function 'void php_zval_filter(int, int)':
ice.c:7:6: error: dominator of 14 should be 21, not 3
7 | void php_zval_filter(int filter, int id1) {
  |  ^~~
ice.c:7:6: error: dominator of 10 should be 21, not 3
during GIMPLE pass: vect
dump file: a-ice.c.179t.vect

This can be simply fixed by just moving the dom update code down:

diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index a5202f32e27..e88948370c6 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1845,13 +1845,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop 
*loop, edge loop_exit,
 to the original function exit we recorded.  Other exits are already
 correct.  */
   if (multiple_exits_p)
-   {
- update_loop = new_loop;
- doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
- for (unsigned i = 0; i < doms.length (); ++i)
-   if (flow_bb_inside_loop_p (loop, doms[i]))
- doms.unordered_remove (i);
-   }
+   update_loop = new_loop;
 }
   else /* Add the copy at entry.  */
 {
@@ -1906,6 +1900,11 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop 
*loop, edge loop_exit,

   if (multiple_exits_p)
 {
+  doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
+  for (unsigned i = 0; i < doms.length (); ++i)
+   if (flow_bb_inside_loop_p (loop, doms[i]))
+ doms.unordered_remove (i);
+
   for (edge e : get_loop_exit_edges (update_loop))
{
  edge ex;

With that done, the next ICE comes along.  Loop 1 is peeled again, but this
time for the epilogue.  However, loop 1 no longer dominates the exits; the
prologue-peeled loop does.

So we don't find anything to update, and we ICE with the second error:

ice.c: In function 'void php_zval_filter(int, int)':
ice.c:7:6: error: dominator of 14 should be 2, not 21
7 | void php_zval_filter(int filter, int id1) {
  |  ^~~
ice.c:7:6: error: dominator of 10 should be 2, not 21
during GIMPLE pass: vect
dump file: a-ice.c.179t.vect

because the prologue loop no longer dominates them due to the skip edge.  This
is why delaying works: we know we have to update the dominators of 14 and 10,
but we don't yet know to what.

Tamar

> 
> > This also means we reduce the number of dominator updates needed by at least
> > 50% and fixes the ICE.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and
> > x86_64-pc-linux-gnu no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/114081
> > PR tree-optimization/113290
> > * tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
> > Skip dominator update when multiple exit.
> > (vect_do_peeling): Remove multiple exit dominator update.
> > * tree-vect-loop.cc (vect_transform_loop): Update dominators when
> > multiple exits.
> > * tree-vectorizer.h (LOOP_VINFO_DOMS_NEED_UPDATE,
> >  dominators_needing_update): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR tree-optimization/114081
> > PR tree-optimization/113290
> > * gcc.dg/vect/vect-early-break_120-pr114081.c: New test.
> > * gcc.dg/vect/vect-early-break_121-pr114081.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c
> > new file mode 100644
> > index
> ..2cd4ce1e4ac573ba6e4173
> 0fd2216f0ec8061376
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c
> > @@ -0,0 +1,38 @@
> > +/* { dg-do compile } */
> > +/* { dg-add-options vect_early_break } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +/* { dg-require-effective-target vect_int } */
> > +/* { 

[PATCH]middle-end: delay updating of dominators until later during vectorization. [PR114081]

2024-02-25 Thread Tamar Christina
Hi All,

The testcase shows an interesting case where we have multiple loops sharing a
live value and have an early exit that goes to the same location.  The additional
complication is that on x86_64 with -mavx we seem to also do prologue peeling
on the loops.

We correctly identify which BBs need their dominators updated, but we do
so too early.

Instead of adding more dominator updates we can solve this, for the cases with
multiple exits, by not verifying dominators at the end of peeling when peeling
for vectorization.

We can then perform the final dominator updates just before vectorization when
all loop transformations are done.

This also means we reduce the number of dominator updates needed by at least
50%, and it fixes the ICE.

Bootstrapped Regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/114081
PR tree-optimization/113290
* tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg):
Skip dominator update when multiple exit.
(vect_do_peeling): Remove multiple exit dominator update.
* tree-vect-loop.cc (vect_transform_loop): Update dominators when
multiple exits.
* tree-vectorizer.h (LOOP_VINFO_DOMS_NEED_UPDATE,
 dominators_needing_update): New.

gcc/testsuite/ChangeLog:

PR tree-optimization/114081
PR tree-optimization/113290
* gcc.dg/vect/vect-early-break_120-pr114081.c: New test.
* gcc.dg/vect/vect-early-break_121-pr114081.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c
new file mode 100644
index 
..2cd4ce1e4ac573ba6e41730fd2216f0ec8061376
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+typedef struct filter_list_entry {
+  const char *name;
+  int id;
+  void (*function)();
+} filter_list_entry;
+
+static const filter_list_entry filter_list[9] = {0};
+
+void php_zval_filter(int filter, int id1) {
+  filter_list_entry filter_func;
+
+  int size = 9;
+  for (int i = 0; i < size; ++i) {
+if (filter_list[i].id == filter) {
+  filter_func = filter_list[i];
+  goto done;
+}
+  }
+
+#pragma GCC novector
+  for (int i = 0; i < size; ++i) {
+if (filter_list[i].id == 0x0204) {
+  filter_func = filter_list[i];
+  goto done;
+}
+  }
+done:
+  if (!filter_func.id)
+filter_func.function();
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c
new file mode 100644
index 
..feebdb7a6c9b8981d7be31dd1c741f9e36738515
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+typedef struct filter_list_entry {
+  const char *name;
+  int id;
+  void (*function)();
+} filter_list_entry;
+
+static const filter_list_entry filter_list[9] = {0};
+
+void php_zval_filter(int filter, int id1) {
+  filter_list_entry filter_func;
+
+  int size = 9;
+  for (int i = 0; i < size; ++i) {
+if (filter_list[i].id == filter) {
+  filter_func = filter_list[i];
+  goto done;
+}
+  }
+
+  for (int i = 0; i < size; ++i) {
+if (filter_list[i].id == 0x0204) {
+  filter_func = filter_list[i];
+  goto done;
+}
+  }
+done:
+  if (!filter_func.id)
+filter_func.function();
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 
3f974d6d839e32516ae316f28ca25316e43d7d86..b5e158bc5cfb5107d5ff461e489d306f81e090d0
 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1917,7 +1917,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, 
edge loop_exit,
  doms.safe_push (e->dest);
}
 
-  iterate_fix_dominators (CDI_DOMINATORS, doms, false);
   if (updated_doms)
updated_doms->safe_splice (doms);
 }
@@ -1925,7 +1924,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, 
edge loop_exit,
   free (new_bbs);
   free (bbs);
 
-  checking_verify_dominators (CDI_DOMINATORS);
+  /* If we're peeling for vectorization then delay verifying dominators.  */
+  if (!flow_loops || !multiple_exits_p)
+checking_verify_dominators (CDI_DOMINATORS);
 
   return 

RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

2024-02-25 Thread Tamar Christina
Hi Pan,

> From: Pan Li 
> 
> Hi Richard & Tamar,
> 
> Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> 
> I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> 

I think a couple of things are being confused here.  So let's break it down:

The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
we only want one internal function for both signed and unsigned SAT_ADD.
with this definition we don't need SAT_UADD and SAT_SADD but instead
we will only have SAT_ADD, which will expand to us_plus or ss_plus.

Now the downside of this is that this is a direct internal optab.  This means
that for the representation to be used the target *must* have the optab
implemented.   This is a bit annoying because it doesn't allow us to generically
assume that all targets use SAT_ADD for saturating add and thus only have to
write optimization for this representation.

This is why Richi said we may need to use a new tree_code because we can
override tree code expansions.  However the same can be done with the _EXT_FN
internal functions.

So what I meant was that we want to have a combination of the two. i.e. a
DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.

If Richi agrees, the below is what I meant. It creates the infrastructure for 
this
and for now only allows a default fallback for unsigned saturating add and makes
it easier for us to add the rest later

Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are
the RTL expressions, but the optabs for saturation are ssadd and usadd.  So you
don't need to make new us_plus and ss_plus ones.

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index a07f25f3aee..aaf9f8991b3 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn, 
tree_pair types,
return direct_##TYPE##_optab_supported_p (which_optab, types,   \
  opt_type);\
   }
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
+UNSIGNED_OPTAB, TYPE)  \
+case IFN_##CODE:   \
+  {
\
+   optab which_optab = (TYPE_UNSIGNED (types.SELECTOR) \
+? UNSIGNED_OPTAB ## _optab \
+: SIGNED_OPTAB ## _optab); \
+   return direct_##TYPE##_optab_supported_p (which_optab, types,   \
+ opt_type) \
+  || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \
+  }
 #include "internal-fn.def"
 
 case IFN_LAST:
@@ -4303,6 +4314,8 @@ set_edom_supported_p (void)
 optab which_optab = direct_internal_fn_optab (fn, types);  \
 expand_##TYPE##_optab_fn (fn, stmt, which_optab);  \
   }
+#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \
+UNSIGNED_OPTAB, TYPE)
 #include "internal-fn.def"
 
 /* Routines to expand each internal function, indexed by function number.
@@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt)
   emit_move_insn (plhs, cmp);
 }
 }
+
+void
+expand_SAT_ADD (internal_fn fn, gcall *stmt)
+{
+  /* Check if the target supports the expansion through an IFN.  */
+  tree_pair types = direct_internal_fn_types (fn, stmt);
+  optab which_optab = direct_internal_fn_optab (fn, types);
+  if (direct_binary_optab_supported_p (which_optab, types,
+  insn_optimization_type ()))
+{
+  expand_binary_optab_fn (fn, stmt, which_optab);
+  return;
+}
+
+  /* Target does not support the optab, but we can de-compose it.  */
+  /*
+  ... decompose to a canonical representation ...
+  if (TYPE_UNSIGNED (types.SELECTOR))
+{
+  ...
+  decompose back to (X + Y) | - ((X + Y) < X)
+}
+  else
+{
+  ...
+}
+  */
+}
+
+bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /* optype 
*/)
+{
+  /* For now, don't support decomposing vector ops.  */
+  if (VECTOR_TYPE_P (type))
+return false;
+
+  /* Signed saturating arithmetic is harder to do, so for now let's
+ ignore it.  */
+  if (!TYPE_UNSIGNED (type))
+return false;
+
+  return TREE_CODE (type) == INTEGER_TYPE;
+}
\ No newline at end of file
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c14d30365c1..5a2491228d5 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -92,6 +92,10 @@ along with GCC; see the file 

[PATCH]middle-end: update vuses out of loop which use a vdef that's moved [PR114068]

2024-02-23 Thread Tamar Christina
Hi All,

In certain cases we can have a situation where the merge block has a vUSE
virtual PHI and the exits do not.  In this case for instance the exits lead
to an abort so they have no virtual PHIs.  If we have a store before the first
exit and we move it to a later block during vectorization we update the vUSE
chain.

However the merge block is not an exit and is not visited by the update code.

This patch fixes it by checking during moving if there are any out of loop uses
of the vDEF that is the last_seen_vuse.  Normally there wouldn't be any and
things are skipped, but if there are then we update them to the last vDEF in the
exit block.

Bootstrapped Regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/114068
* tree-vect-loop.cc (move_early_exit_stmts): Update vUSE chain in merge
block.

gcc/testsuite/ChangeLog:

PR tree-optimization/114068
* gcc.dg/vect/vect-early-break_118-pr114068.c: New test.
* gcc.dg/vect/vect-early-break_119-pr114068.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c
new file mode 100644
index 
..b462a464b6603e718c5a283513ea586fc13e37ce
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+struct h {
+  int b;
+  int f;
+} k;
+
+void n(int m) {
+  struct h a = k;
+  for (int o = m; o; ++o) {
+if (a.f)
+  __builtin_unreachable();
+if (o > 1)
+  __builtin_unreachable();
+*( + o) = 1;
+  }
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_119-pr114068.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_119-pr114068.c
new file mode 100644
index 
..a65ef7b8c4901b2ada585f38fda436dc07d1e1de
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_119-pr114068.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+struct h {
+  int b;
+  int c;
+  int f;
+} k;
+
+void n(int m) {
+  struct h a = k;
+  for (int o = m; o; ++o) {
+if (a.f)
+  __builtin_unreachable();
+if (o > 1)
+  __builtin_unreachable();
+*( + o) = 1;
+*( + o*m) = 2;
+  }
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
35f1f8c7d4245135ace740ff9be548919587..44bd8032b55b1ef84fdf4fa9d6117304b7709d6f
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -11837,6 +11837,27 @@ move_early_exit_stmts (loop_vec_info loop_vinfo)
   update_stmt (p);
 }
 
+  /* last_seen_vuse should now be the PHI in the loop header.  Check for
+     any out of loop uses and update them to the vUSE on the loop latch.  */
+  auto vuse_stmt = loop_vinfo->lookup_def (last_seen_vuse);
+  gphi *vuse_def;
+  if (vuse_stmt
+      && (vuse_def = dyn_cast <gphi *> (STMT_VINFO_STMT (vuse_stmt))))
+    {
+      imm_use_iterator iter;
+      use_operand_p use_p;
+      gimple *use_stmt;
+      auto loop = LOOP_VINFO_LOOP (loop_vinfo);
+      tree vuse = PHI_ARG_DEF_FROM_EDGE (vuse_def, loop_latch_edge (loop));
+      FOR_EACH_IMM_USE_STMT (use_stmt, iter, last_seen_vuse)
+	{
+	  if (flow_bb_inside_loop_p (loop, use_stmt->bb))
+	    continue;
+	  FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
+	    SET_USE (use_p, vuse);
+	}
+    }
+
   /* And update the LC PHIs on exits.  */
   for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP  (loop_vinfo)))
 if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))




-- 

RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU

2024-02-19 Thread Tamar Christina
> -Original Message-
> From: Li, Pan2 
> Sent: Monday, February 19, 2024 12:59 PM
> To: Tamar Christina ; Richard Biener
> 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang
> ; kito.ch...@gmail.com
> Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
> 
> Thanks Tamar for comments and explanations.
> 
> > I think we should actually do an indirect optab here, because the IFN can 
> > be used
> > to replace the general representation of saturating arithmetic.
> 
> > e.g. the __builtin_add_overflow case in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
> > is inefficient on all targets and so the IFN can always expand to something 
> > that's
> more
> > efficient like the branchless version add_sat2.
> 
> > I think this is why you suggested a new tree code below, but we don't 
> > really need
> > tree-codes for this. It can be done cleaner using the same way as
> DEF_INTERNAL_INT_EXT_FN
> 
> Yes, the backend could choose a branchless(of course we always hate branch for
> performance) code-gen or even better there is one saturation insn.
> Good to learn DEF_INTERNAL_INT_EXT_FN, and will have a try for it.
> 
> > Richard means that there shouldn't be .SAT_ADDU and .SAT_ADDS  and that the
> sign
> > should be determined by the types at expansion time.  i.e. there should 
> > only be
> > .SAT_ADD.
> 
> Got it, my initial idea comes from that we may have two insns for saturation 
> add,
> mostly these insns need to be signed or unsigned.
> For example, slt/sltu in riscv scalar. But I am not very clear about a 
> scenario like this.
> During define_expand in backend, we hit the standard name
> sat_add_3 but can we tell it is signed or not here? AFAIK, we only have 
> QI, HI,
> SI and DI.

Yeah, the way DEF_INTERNAL_SIGNED_OPTAB_FN works is that you give it two optabs,
one for when it's signed and one for when it's unsigned, and the right one is 
picked
automatically during expansion.  But in GIMPLE you'd only have one IFN.
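
For illustration only (a sketch, not real compiler output), both signednesses
would use the same call in the IL and only the expansion differs:

  /* uint8_t operands: expanded through the unsigned saturating-add optab.  */
  _3 = .SAT_ADD (x_1(D), y_2(D));

  /* int8_t operands: same IFN, expanded through the signed optab.  */
  _6 = .SAT_ADD (a_4(D), b_5(D));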

> Maybe I will have the answer after try DEF_INTERNAL_SIGNED_OPTAB_FN, will
> keep you posted.

Awesome, Thanks!

Tamar
> 
> Pan
> 
> -Original Message-
> From: Tamar Christina 
> Sent: Monday, February 19, 2024 4:55 PM
> To: Li, Pan2 ; Richard Biener 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang
> ; kito.ch...@gmail.com
> Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
> 
> Thanks for doing this!
> 
> > -Original Message-
> > From: Li, Pan2 
> > Sent: Monday, February 19, 2024 8:42 AM
> > To: Richard Biener 
> > Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang
> > ; kito.ch...@gmail.com; Tamar Christina
> > 
> > Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
> >
> > Thanks Richard for comments.
> >
> > > I'll note that on RTL we already have SS_PLUS/US_PLUS and friends and
> > > the corresponding ssadd/usadd optabs.  There's not much documentation
> > > unfortunately besides the use of gen_*_fixed_libfunc usage where the
> comment
> > > suggests this is used for fixed-point operations.  It looks like arm uses
> > > fractional/accumulator modes for this but for example bfin has ssaddsi3.
> >
> > I find the related description about plus family in GCC internals doc but 
> > it doesn't
> > mention
> > anything about mode m here.
> >
> > (plus:m x y)
> > (ss_plus:m x y)
> > (us_plus:m x y)
> > These three expressions all represent the sum of the values represented by x
> > and y carried out in machine mode m. They diff er in their behavior on 
> > overflow
> > of integer modes. plus wraps round modulo the width of m; ss_plus saturates
> > at the maximum signed value representable in m; us_plus saturates at the
> > maximum unsigned value.
> >
> > > The natural thing is to use direct optab internal functions (that's what 
> > > you
> > > basically did, but you added a new optab, IMO without good reason).
> 
> I think we should actually do an indirect optab here, because the IFN can be 
> used
> to replace the general representation of saturating arithmetic.
> 
> e.g. the __builtin_add_overflow case in
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
> is inefficient on all targets and so the IFN can always expand to something 
> that's
> more
> efficient like the branchless version add_sat2.
> 
> I think this is why you suggested a new tree code below, but we don't really 
> need
> tree-codes for this. It can be done cleaner using the same way as DEF_INTERNAL_INT_EXT_FN.

RE: [PATCH]AArch64: xfail modes_1.f90 [PR107071]

2024-02-19 Thread Tamar Christina
> -Original Message-
> From: Tamar Christina
> Sent: Thursday, February 15, 2024 11:05 AM
> To: Richard Earnshaw (lists) ; gcc-
> patc...@gcc.gnu.org
> Cc: nd ; Marcus Shawcroft ; Kyrylo
> Tkachov ; Richard Sandiford
> 
> Subject: RE: [PATCH]AArch64: xfail modes_1.f90 [PR107071]
> 
> > -Original Message-
> > From: Richard Earnshaw (lists) 
> > Sent: Thursday, February 15, 2024 11:01 AM
> > To: Tamar Christina ; gcc-patches@gcc.gnu.org
> > Cc: nd ; Marcus Shawcroft ;
> Kyrylo
> > Tkachov ; Richard Sandiford
> > 
> > Subject: Re: [PATCH]AArch64: xfail modes_1.f90 [PR107071]
> >
> > On 15/02/2024 10:57, Tamar Christina wrote:
> > > Hi All,
> > >
> > > This test has never worked on AArch64 since the day it was committed.  It 
> > > has
> > > a number of issues that prevent it from working on AArch64:
> > >
> > > 1.  IEEE does not require that FP operations raise a SIGFPE for FP 
> > > operations,
> > >     only that an exception is raised somehow.
> > >
> > > 2. Most Arm designed cores don't raise SIGFPE and instead set a status 
> > > register
> > >    and some partner cores raise a SIGILL instead.
> > >
> > > 3. The way it checks for feenableexcept doesn't really work for AArch64.
> > >
> > > As such this test doesn't seem to really provide much value on AArch64 so 
> > > we
> > > should just xfail it.
> > >
> > > Regtested on aarch64-none-linux-gnu and no issues.
> > >
> > > Ok for master?
> >
> > Wouldn't it be better to just skip the test.  XFAIL just adds clutter to 
> > verbose
> output
> > and suggests that someday the tools might be fixed for this case.
> >
> > Better still would be a new dg-requires fp_exceptions_raise_sigfpe as a 
> > guard for
> > the test.
> 

It looks like this is similar to 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78314 so
I'll just similarly skip it.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 
b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
index 
205c47f38007d06116289c19d6b23cf3bf83bd48..e29d8c678e6e51c3f2e5dac53c7703bb18a99ac4
 100644
--- a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
+++ b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
@@ -1,5 +1,5 @@
 ! { dg-do run }
-!
+! { dg-skip-if "PR libfortran/78314" { aarch64*-*-gnu* arm*-*-gnueabi arm*-*-gnueabihf } }
 ! Test IEEE_MODES_TYPE, IEEE_GET_MODES and IEEE_SET_MODES
 
Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR fortran/107071
* gfortran.dg/ieee/modes_1.f90: skip aarch64, arm.


rb18274.patch
Description: rb18274.patch


RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU

2024-02-19 Thread Tamar Christina
Thanks for doing this!

> -Original Message-
> From: Li, Pan2 
> Sent: Monday, February 19, 2024 8:42 AM
> To: Richard Biener 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang
> ; kito.ch...@gmail.com; Tamar Christina
> 
> Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
> 
> Thanks Richard for comments.
> 
> > I'll note that on RTL we already have SS_PLUS/US_PLUS and friends and
> > the corresponding ssadd/usadd optabs.  There's not much documentation
> > unfortunately besides the use of gen_*_fixed_libfunc usage where the comment
> > suggests this is used for fixed-point operations.  It looks like arm uses
> > fractional/accumulator modes for this but for example bfin has ssaddsi3.
> 
> I find the related description about plus family in GCC internals doc but it 
> doesn't
> mention
> anything about mode m here.
> 
> (plus:m x y)
> (ss_plus:m x y)
> (us_plus:m x y)
> These three expressions all represent the sum of the values represented by x
> and y carried out in machine mode m. They diff er in their behavior on 
> overflow
> of integer modes. plus wraps round modulo the width of m; ss_plus saturates
> at the maximum signed value representable in m; us_plus saturates at the
> maximum unsigned value.
> 
> > The natural thing is to use direct optab internal functions (that's what you
> > basically did, but you added a new optab, IMO without good reason).

I think we should actually do an indirect optab here, because the IFN can be 
used
to replace the general representation of saturating arithmetic.

e.g. the __builtin_add_overflow case in 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
is inefficient on all targets and so the IFN can always expand to something 
that's more
efficient like the branchless version add_sat2. 

I think this is why you suggested a new tree code below, but we don't really 
need
tree-codes for this. It can be done cleaner using the same way as 
DEF_INTERNAL_INT_EXT_FN.

> 
> That makes sense to me, I will try to leverage US_PLUS instead here.
> 
> > More GIMPLE-like would be to let the types involved decide whether
> > it's signed or unsigned saturation.  That's actually what I'd prefer here
> > and if we don't map 1:1 to optabs then instead use tree codes like
> > S_PLUS_EXPR (mimicing RTL here).
> 
> Sorry I don't get the point here for GIMPLE-like way. For the .SAT_ADDU, I 
> add one
> restriction
> like unsigned_p (type) in match.pd. Looks we have a better way here.
> 

Richard means that there shouldn't be .SAT_ADDU and .SAT_ADDS  and that the sign
should be determined by the types at expansion time.  i.e. there should only be
.SAT_ADD. 

i.e. instead of this

+DEF_INTERNAL_OPTAB_FN (SAT_ADDU, ECF_CONST | ECF_NOTHROW, sat_addu, binary)

You should use DEF_INTERNAL_SIGNED_OPTAB_FN.
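
As a rough sketch of what that could look like (illustrative only -- the
SAT_ADD name and the ssadd/usadd optab pairing here are my assumption,
modelled on the existing AVG_FLOOR entry):

  DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST | ECF_NOTHROW, first,
                                ssadd, usadd, binary)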

Regards,
Tamar

> > Any other opinions?  Anyone knows more about fixed-point and RTL/modes?
> 
> AFAIK, the scalar of the riscv backend doesn't have fixed-point but the 
> vector does
> have. They
> share the same mode as vector integer. For example, RVVM1SI in vector-
> iterators.md. Kito
> and Juzhe can help to correct me if any misunderstandings.
> 
> Pan
> 
> -Original Message-
> From: Richard Biener 
> Sent: Monday, February 19, 2024 3:36 PM
> To: Li, Pan2 
> Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang
> ; kito.ch...@gmail.com; tamar.christ...@arm.com
> Subject: Re: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
> 
> On Sat, Feb 17, 2024 at 11:30 AM  wrote:
> >
> > From: Pan Li 
> >
> > This patch would like to add the middle-end presentation for the
> > unsigned saturation add.  Aka set the result of add to the max
> > when overflow.  It will take the pattern similar as below.
> >
> > SAT_ADDU (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> >
> > Take uint8_t as example, we will have:
> >
> > * SAT_ADDU (1, 254)   => 255.
> > * SAT_ADDU (1, 255)   => 255.
> > * SAT_ADDU (2, 255)   => 255.
> > * SAT_ADDU (255, 255) => 255.
> >
> > The patch also implement the SAT_ADDU in the riscv backend as
> > the sample.  Given below example:
> >
> > uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> > {
> >   return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> > }
> >
> > Before this patch:
> >
> > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> > {
> >   long unsigned int _1;
> >   _Bool _2;
> >   long unsigned int _3;
> >   long unsigned int _4;
> >   uint64_t _7;
> >   long unsigned int _10;
> >   __complex__ long unsigned int _11

RE: [PATCH] aarch64: Improve PERM<{0}, a, ...> (64bit) by adding whole vector shift right [PR113872]

2024-02-15 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Thursday, February 15, 2024 2:56 PM
> To: Andrew Pinski 
> Cc: gcc-patches@gcc.gnu.org; Tamar Christina 
> Subject: Re: [PATCH] aarch64: Improve PERM<{0}, a, ...> (64bit) by adding 
> whole
> vector shift right [PR113872]
> 
> Andrew Pinski  writes:
> > The backend currently defines a whole vector shift left for 64bit vectors, 
> > adding
> the
> > shift right can also improve code for some PERMs too. So this adds that 
> > pattern.
> 
> Is this reversed?  It looks like we have the shift right and the patch is
> adding the shift left (at least in GCC internal and little-endian terms).
> 
> But on many Arm cores, EXT has a higher throughput than SHL, so I don't think
> we should do this unconditionally.

Yeah, on most (if not all) Arm cores the EXT has higher throughput than SHL,
and on Cortex-A75 the EXT has both higher throughput and lower latency.

I guess the expected gain here is that we wouldn't need to create the zero
vector.  However, on modern Arm cores the zero vector creation is free using
movi, and EXT being a three-operand instruction also means we only need one
copy if it is e.g. used in a loop.

Kind Regards,
Tamar

> 
> Thanks,
> Richard
> 
> >
> > I added a testcase for the shift left also. I also fixed the instruction 
> > template
> > there which was using a space instead of a tab after the instruction.
> >
> > Built and tested on aarch64-linux-gnu.
> >
> > PR target/113872
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (vec_shr_):
> Use tab instead of space after
> > the instruction in the template.
> > (vec_shl_): New pattern
> > * config/aarch64/iterators.md (unspec): Add UNSPEC_VEC_SHL
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/perm_zero-1.c: New test.
> > * gcc.target/aarch64/perm_zero-2.c: New test.
> >
> > Signed-off-by: Andrew Pinski 
> > ---
> >  gcc/config/aarch64/aarch64-simd.md | 18 --
> >  gcc/config/aarch64/iterators.md|  1 +
> >  gcc/testsuite/gcc.target/aarch64/perm_zero-1.c | 15 +++
> >  gcc/testsuite/gcc.target/aarch64/perm_zero-2.c | 15 +++
> >  4 files changed, 47 insertions(+), 2 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-2.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> > index f8bb973a278..0d2f1ea3902 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -1592,9 +1592,23 @@ (define_insn "vec_shr_"
> >"TARGET_SIMD"
> >{
> >  if (BYTES_BIG_ENDIAN)
> > -  return "shl %d0, %d1, %2";
> > +  return "shl\t%d0, %d1, %2";
> >  else
> > -  return "ushr %d0, %d1, %2";
> > +  return "ushr\t%d0, %d1, %2";
> > +  }
> > +  [(set_attr "type" "neon_shift_imm")]
> > +)
> > +(define_insn "vec_shl_"
> > +  [(set (match_operand:VD 0 "register_operand" "=w")
> > +(unspec:VD [(match_operand:VD 1 "register_operand" "w")
> > +   (match_operand:SI 2 "immediate_operand" "i")]
> > +  UNSPEC_VEC_SHL))]
> > +  "TARGET_SIMD"
> > +  {
> > +if (BYTES_BIG_ENDIAN)
> > +  return "ushr\t%d0, %d1, %2";
> > +else
> > +  return "shl\t%d0, %d1, %2";
> >}
> >[(set_attr "type" "neon_shift_imm")]
> >  )
> > diff --git a/gcc/config/aarch64/iterators.md 
> > b/gcc/config/aarch64/iterators.md
> > index 99cde46f1ba..3aebe9cf18a 100644
> > --- a/gcc/config/aarch64/iterators.md
> > +++ b/gcc/config/aarch64/iterators.md
> > @@ -758,6 +758,7 @@ (define_c_enum "unspec"
> >  UNSPEC_PMULL; Used in aarch64-simd.md.
> >  UNSPEC_PMULL2   ; Used in aarch64-simd.md.
> >  UNSPEC_REV_REGLIST  ; Used in aarch64-simd.md.
> > +UNSPEC_VEC_SHL  ; Used in aarch64-simd.md.
> >  UNSPEC_VEC_SHR  ; Used in aarch64-simd.md.
> >  UNSPEC_SQRDMLAH ; Used in aarch64-simd.md.
> >  UNSPEC_SQRDMLSH ; Used in aarch64-simd.md.
> > diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c
> > new file mode 100644
> > 

RE: [PATCH]AArch64: xfail modes_1.f90 [PR107071]

2024-02-15 Thread Tamar Christina
> -Original Message-
> From: Richard Earnshaw (lists) 
> Sent: Thursday, February 15, 2024 11:01 AM
> To: Tamar Christina ; gcc-patches@gcc.gnu.org
> Cc: nd ; Marcus Shawcroft ; Kyrylo
> Tkachov ; Richard Sandiford
> 
> Subject: Re: [PATCH]AArch64: xfail modes_1.f90 [PR107071]
> 
> On 15/02/2024 10:57, Tamar Christina wrote:
> > Hi All,
> >
> > This test has never worked on AArch64 since the day it was committed.  It 
> > has
> > a number of issues that prevent it from working on AArch64:
> >
> > 1.  IEEE does not require that FP operations raise a SIGFPE for FP 
> > operations,
> >     only that an exception is raised somehow.
> >
> > 2. Most Arm designed cores don't raise SIGFPE and instead set a status 
> > register
> >    and some partner cores raise a SIGILL instead.
> >
> > 3. The way it checks for feenableexcept doesn't really work for AArch64.
> >
> > As such this test doesn't seem to really provide much value on AArch64 so we
> > should just xfail it.
> >
> > Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> 
> Wouldn't it be better to just skip the test.  XFAIL just adds clutter to 
> verbose output
> and suggests that someday the tools might be fixed for this case.
> 
> Better still would be a new dg-requires fp_exceptions_raise_sigfpe as a guard 
> for
> the test.

There seems to be check_effective_target_fenv_exceptions, which tests whether
the target can raise FP exceptions.  I'll see if that works.
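
Something along these lines, i.e. a sketch only (whether fenv_exceptions is
the right guard for this Fortran test still needs to be verified):

  ! { dg-do run }
  ! { dg-require-effective-target fenv_exceptions }

instead of the xfail.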

Thanks,
Tamar

> 
> R.
> 
> >
> > Thanks,
> > Tamar
> >
> > gcc/testsuite/ChangeLog:
> >
> >     PR fortran/107071
> >     * gfortran.dg/ieee/modes_1.f90: xfail aarch64.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
> b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
> > index
> 205c47f38007d06116289c19d6b23cf3bf83bd48..3667571969427ae7b2b9668
> 4ec1af8b3fdd4985f 100644
> > --- a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
> > +++ b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
> > @@ -1,4 +1,4 @@
> > -! { dg-do run }
> > +! { dg-do run { xfail { aarch64*-*-* } } }
> >  !
> >  ! Test IEEE_MODES_TYPE, IEEE_GET_MODES and IEEE_SET_MODES
> >
> >
> >
> >
> >
> > --



[PATCH]AArch64: xfail modes_1.f90 [PR107071]

2024-02-15 Thread Tamar Christina
Hi All,

This test has never worked on AArch64 since the day it was committed.  It has
a number of issues that prevent it from working on AArch64:

1. IEEE does not require that FP operations raise a SIGFPE,
   only that an exception is raised somehow.

2. Most Arm-designed cores don't raise SIGFPE and instead set a status register,
   and some partner cores raise a SIGILL instead.

3. The way it checks for feenableexcept doesn't really work for AArch64.

As such this test doesn't seem to really provide much value on AArch64 so we
should just xfail it.

Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR fortran/107071
* gfortran.dg/ieee/modes_1.f90: xfail aarch64.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 
b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
index 
205c47f38007d06116289c19d6b23cf3bf83bd48..3667571969427ae7b2b96684ec1af8b3fdd4985f
 100644
--- a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
+++ b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90
@@ -1,4 +1,4 @@
-! { dg-do run }
+! { dg-do run { xfail { aarch64*-*-* } } }
 !
 ! Test IEEE_MODES_TYPE, IEEE_GET_MODES and IEEE_SET_MODES
 




-- 





RE: [PATCH]AArch64: remove ls64 from being mandatory on armv8.7-a..

2024-02-15 Thread Tamar Christina
Hi, this is a new version of the patch, updating some additional tests
because some of the LTO tests required a newer binutils than my distro had.

---

The Arm Architecture Reference Manual (Version J.a, section A2.9 on FEAT_LS64)
shows that ls64 is an optional extension and should not be enabled by default
for Armv8.7-a.

This drops it from the mandatory bits for the architecture and brings GCC in
line with LLVM and the architecture.
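
In practice this means user code that uses the LS64 ACLE intrinsics now has to
request the extension explicitly, e.g. (sketch):

  gcc -march=armv8.7-a+ls64 -c foo.c

rather than relying on -march=armv8.7-a implying it, as the testsuite updates
below show.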

Note that we will not be changing binutils to preserve compatibility with older
released compilers.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master? and backport to GCC 13,12,11?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-arches.def (AARCH64_ARCH): Remove LS64 from
Armv8.7-a.

gcc/testsuite/ChangeLog:

* g++.target/aarch64/acle/ls64.C: Add +ls64.
* g++.target/aarch64/acle/ls64_lto.C: Likewise.
* gcc.target/aarch64/acle/ls64_lto.c: Likewise.
* gcc.target/aarch64/acle/pr110100.c: Likewise.
* gcc.target/aarch64/acle/pr110132.c: Likewise.
* gcc.target/aarch64/options_set_28.c: Drop check for nols64.
* gcc.target/aarch64/pragma_cpp_predefs_2.c: Correct header checks.

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-arches.def 
b/gcc/config/aarch64/aarch64-arches.def
index 
b7115ff7c3d4a7ee7abbedcb091ef15a7efacc79..9bec30e9203bac01155281ef3474846c402bb29e
 100644
--- a/gcc/config/aarch64/aarch64-arches.def
+++ b/gcc/config/aarch64/aarch64-arches.def
@@ -37,7 +37,7 @@ AARCH64_ARCH("armv8.3-a", generic_armv8_a,   V8_3A, 
8,  (V8_2A, PAUTH, R
 AARCH64_ARCH("armv8.4-a", generic_armv8_a,   V8_4A, 8,  (V8_3A, 
F16FML, DOTPROD, FLAGM))
 AARCH64_ARCH("armv8.5-a", generic_armv8_a,   V8_5A, 8,  (V8_4A, SB, 
SSBS, PREDRES))
 AARCH64_ARCH("armv8.6-a", generic_armv8_a,   V8_6A, 8,  (V8_5A, I8MM, 
BF16))
-AARCH64_ARCH("armv8.7-a", generic_armv8_a,   V8_7A, 8,  (V8_6A, LS64))
+AARCH64_ARCH("armv8.7-a", generic_armv8_a,   V8_7A, 8,  (V8_6A))
 AARCH64_ARCH("armv8.8-a", generic_armv8_a,   V8_8A, 8,  (V8_7A, MOPS))
 AARCH64_ARCH("armv8.9-a", generic_armv8_a,   V8_9A, 8,  (V8_8A))
 AARCH64_ARCH("armv8-r",   generic_armv8_a,   V8R  , 8,  (V8_4A))
diff --git a/gcc/testsuite/g++.target/aarch64/acle/ls64.C 
b/gcc/testsuite/g++.target/aarch64/acle/ls64.C
index 
d9002785b578741bde1202761f0881dc3d47e608..dcfe6f1af6711a7f3ec2562f6aabf56baecf417d
 100644
--- a/gcc/testsuite/g++.target/aarch64/acle/ls64.C
+++ b/gcc/testsuite/g++.target/aarch64/acle/ls64.C
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-march=armv8.7-a" } */
+/* { dg-additional-options "-march=armv8.7-a+ls64" } */
 #include 
 int main()
 {
diff --git a/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C 
b/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C
index 
274a4771e1c1d13bcb1a7bdc77c2e499726f024c..0198fe2a1b78627b873bf22e3d8416dbdcc77078
 100644
--- a/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C
+++ b/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C
@@ -1,5 +1,5 @@
 /* { dg-do link { target aarch64_asm_ls64_ok } } */
-/* { dg-additional-options "-march=armv8.7-a -flto" } */
+/* { dg-additional-options "-march=armv8.7-a+ls64 -flto" } */
 #include 
 int main()
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c 
b/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c
index 
8b4f24277717675badc39dd145d365f75f5ceb27..0e5ae0b052b50b08d35151f4bc113617c1569bd3
 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c
@@ -1,5 +1,5 @@
 /* { dg-do link { target aarch64_asm_ls64_ok } } */
-/* { dg-additional-options "-march=armv8.7-a -flto" } */
+/* { dg-additional-options "-march=armv8.7-a+ls64 -flto" } */
 #include 
 int main(void)
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c 
b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c
index 
f56d5e619e8ac23cdf720574bd6ee08fbfd36423..62a82b97c56debad092cc8fd1ed48f0219109cd7
 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8.7-a -O2" } */
+/* { dg-options "-march=armv8.7-a+ls64 -O2" } */
 #include 
 void do_st64b(data512_t data) {
   __arm_st64b((void*)0x1000, data);
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c 
b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c
index 
fb88d633dd20772fd96e976a400fe52ae0bc3647..423d91b9a99f269d01d07428414ade7cc518c711
 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-march=armv8.7-a" } */
+/* { dg-additional-options "-march=armv8.7-a+ls64" } */
 
 /* Check that ls64 builtins can be invoked using a preprocesed testcase
without triggering bogus builtin warnings, 

RE: [PATCH]AArch64: update vget_set_lane_1.c test output

2024-02-15 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Thursday, February 1, 2024 4:42 PM
> To: Tamar Christina 
> Cc: Andrew Pinski ; gcc-patches@gcc.gnu.org; nd
> ; Richard Earnshaw ; Marcus
> Shawcroft ; Kyrylo Tkachov
> 
> Subject: Re: [PATCH]AArch64: update vget_set_lane_1.c test output
> 
> Tamar Christina  writes:
> >> -Original Message-
> >> From: Richard Sandiford 
> >> Sent: Thursday, February 1, 2024 2:24 PM
> >> To: Andrew Pinski 
> >> Cc: Tamar Christina ; gcc-patches@gcc.gnu.org; nd
> >> ; Richard Earnshaw ; Marcus
> >> Shawcroft ; Kyrylo Tkachov
> >> 
> >> Subject: Re: [PATCH]AArch64: update vget_set_lane_1.c test output
> >>
> >> Andrew Pinski  writes:
> >> > On Thu, Feb 1, 2024 at 1:26 AM Tamar Christina 
> >> wrote:
> >> >>
> >> >> Hi All,
> >> >>
> >> >> In the vget_set_lane_1.c test the following entries now generate a zip1
> instead
> >> of an INS
> >> >>
> >> >> BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
> >> >> BUILD_TEST (int32x2_t,   int32x2_t,   , , s32, 1, 0)
> >> >> BUILD_TEST (uint32x2_t,  uint32x2_t,  , , u32, 1, 0)
> >> >>
> >> >> This is because the non-Q variant for indices 0 and 1 are just 
> >> >> shuffling values.
> >> >> There is no perf difference between INS SIMD to SIMD and ZIP, as such 
> >> >> just
> >> update the
> >> >> test file.
> >> > Hmm, is this true on all cores? I suspect there is a core out there
> >> > where INS is implemented with a much lower latency than ZIP.
> >> > If we look at config/aarch64/thunderx.md, we can see INS is 2 cycles
> >> > while ZIP is 6 cycles (3/7 for q versions).
> >> > Now I don't have any invested interest in that core any more but I
> >> > just wanted to point out that is not exactly true for all cores.
> >>
> >> Thanks for the pointer.  In that case, perhaps we should prefer
> >> aarch64_evpc_ins over aarch64_evpc_zip in
> aarch64_expand_vec_perm_const_1?
> >> That's enough to fix this failure, but it'll probably require other
> >> tests to be adjusted...
> >
> > I think given that Thundex-X is a 10 year old micro-architecture that is 
> > several
> cases where
> > often used instructions have very high latencies that generic codegen 
> > should not
> be blocked
> > from progressing because of it.
> >
> > we use zips in many things and if thunderx codegen is really of that much
> importance then I
> > think the old codegen should be gated behind -mcpu=thunderx rather than
> preventing generic
> > changes.
> 
> But you said there was no perf difference between INS and ZIP, so it
> sounds like for all known cases, using INS rather than ZIP is either
> neutral or better.
> 
> There's also the possible secondary benefit that the INS patterns use
> standard RTL operations whereas the ZIP patterns use unspecs.
> 
> Keeping ZIP seems OK there's a specific reason to prefer it over INS for
> more modern cores though.

Ok, that's a fair point.  Doing some due diligence, the Neoverse-E1 and
Cortex-A65 SWOGs seem to imply that their ZIPs have better throughput
than INS.  However, the entries are inconsistent and I can't measure the
difference, so I believe this to be a documentation bug.

That said, switching the operands seems to show one issue, in that preferring
INS degrades the code in cases where we are inserting the top bits of the first
parameter into the bottom of the second parameter and returning.

ZIP, being a three-operand instruction, allows us to put the result into the
final destination register with one operation, whereas INS requires an fmov:

foo_uzp1_s32:
ins v0.s[1], v1.s[0]
fmovd0, d0
ret
foo_uzp2_s32:
ins v1.s[0], v0.s[1]
fmovd0, d1
ret

I've posted uzp but zip has the same issue.
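
For reference, a minimal intrinsics-level version of the uzp1 case might look
like this (a sketch; the exact source I used isn't shown here and the function
name is just for illustration):

  #include <arm_neon.h>

  /* Result is { a[0], b[0] }: can be emitted as uzp1/zip1 or as
     ins v0.s[1], v1.s[0].  */
  int32x2_t
  foo_uzp1_s32 (int32x2_t a, int32x2_t b)
  {
    return vuzp1_s32 (a, b);
  }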

So I guess it's not better to flip the order but perhaps I should add a case to
the zip/unzip RTL patterns for when op0 == op1?

Thanks,
Tamar
> 
> Thanks,
> Richard



[PATCH]AArch64: remove ls64 from being mandatory on armv8.7-a..

2024-02-14 Thread Tamar Christina
Hi All,

The Arm Architecture Reference Manual (Version J.a, section A2.9 on FEAT_LS64)
shows that ls64 is an optional extension and should not be enabled by default
for Armv8.7-a.

This drops it from the mandatory bits for the architecture and brings GCC in
line with LLVM and the architecture.

Note that we will not be changing binutils to preserve compatibility with older
released compilers.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master? and backport to GCC 13,12,11?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-arches.def (AARCH64_ARCH): Remove LS64 from
Armv8.7-a.

gcc/testsuite/ChangeLog:

* g++.target/aarch64/acle/ls64.C: Add +ls64.
* gcc.target/aarch64/acle/pr110100.c: Likewise.
* gcc.target/aarch64/acle/pr110132.c: Likewise.
* gcc.target/aarch64/options_set_28.c: Drop check for nols64.
* gcc.target/aarch64/pragma_cpp_predefs_2.c: Correct header checks.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-arches.def 
b/gcc/config/aarch64/aarch64-arches.def
index 
b7115ff7c3d4a7ee7abbedcb091ef15a7efacc79..9bec30e9203bac01155281ef3474846c402bb29e
 100644
--- a/gcc/config/aarch64/aarch64-arches.def
+++ b/gcc/config/aarch64/aarch64-arches.def
@@ -37,7 +37,7 @@ AARCH64_ARCH("armv8.3-a", generic_armv8_a,   V8_3A, 
8,  (V8_2A, PAUTH, R
 AARCH64_ARCH("armv8.4-a", generic_armv8_a,   V8_4A, 8,  (V8_3A, 
F16FML, DOTPROD, FLAGM))
 AARCH64_ARCH("armv8.5-a", generic_armv8_a,   V8_5A, 8,  (V8_4A, SB, 
SSBS, PREDRES))
 AARCH64_ARCH("armv8.6-a", generic_armv8_a,   V8_6A, 8,  (V8_5A, I8MM, 
BF16))
-AARCH64_ARCH("armv8.7-a", generic_armv8_a,   V8_7A, 8,  (V8_6A, LS64))
+AARCH64_ARCH("armv8.7-a", generic_armv8_a,   V8_7A, 8,  (V8_6A))
 AARCH64_ARCH("armv8.8-a", generic_armv8_a,   V8_8A, 8,  (V8_7A, MOPS))
 AARCH64_ARCH("armv8.9-a", generic_armv8_a,   V8_9A, 8,  (V8_8A))
 AARCH64_ARCH("armv8-r",   generic_armv8_a,   V8R  , 8,  (V8_4A))
diff --git a/gcc/testsuite/g++.target/aarch64/acle/ls64.C 
b/gcc/testsuite/g++.target/aarch64/acle/ls64.C
index 
d9002785b578741bde1202761f0881dc3d47e608..dcfe6f1af6711a7f3ec2562f6aabf56baecf417d
 100644
--- a/gcc/testsuite/g++.target/aarch64/acle/ls64.C
+++ b/gcc/testsuite/g++.target/aarch64/acle/ls64.C
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-march=armv8.7-a" } */
+/* { dg-additional-options "-march=armv8.7-a+ls64" } */
 #include 
 int main()
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c 
b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c
index 
f56d5e619e8ac23cdf720574bd6ee08fbfd36423..62a82b97c56debad092cc8fd1ed48f0219109cd7
 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-march=armv8.7-a -O2" } */
+/* { dg-options "-march=armv8.7-a+ls64 -O2" } */
 #include 
 void do_st64b(data512_t data) {
   __arm_st64b((void*)0x1000, data);
diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c 
b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c
index 
fb88d633dd20772fd96e976a400fe52ae0bc3647..423d91b9a99f269d01d07428414ade7cc518c711
 100644
--- a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c
+++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-march=armv8.7-a" } */
+/* { dg-additional-options "-march=armv8.7-a+ls64" } */
 
 /* Check that ls64 builtins can be invoked using a preprocesed testcase
without triggering bogus builtin warnings, see PR110132.
diff --git a/gcc/testsuite/gcc.target/aarch64/options_set_28.c 
b/gcc/testsuite/gcc.target/aarch64/options_set_28.c
index 
9e63768581e9d429e9408863942051b1b04761ac..d5b15f8bc5831de56fe667179d83d9c853529aaf
 100644
--- a/gcc/testsuite/gcc.target/aarch64/options_set_28.c
+++ b/gcc/testsuite/gcc.target/aarch64/options_set_28.c
@@ -1,9 +1,9 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-march=armv9.3-a+nopredres+nols64+nomops" } */
+/* { dg-additional-options "-march=armv9.3-a+nopredres+nomops" } */
 
 int main ()
 {
   return 0;
 }
 
-/* { dg-final { scan-assembler-times {\.arch 
armv9\.3\-a\+crc\+nopredres\+nols64\+nomops\n} 1 } } */
+/* { dg-final { scan-assembler-times {\.arch 
armv9\.3\-a\+crc\+nopredres\+nomops\n} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c 
b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
index 
2d76bfc23dfdcd78a74ec0e4845a3bd8d110b010..d8fc86d1557895f91ffe8be2f65d6581abe51568
 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c
@@ -242,8 +242,8 @@
 
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.7-a")
-#ifndef __ARM_FEATURE_LS64
-#error "__ARM_FEATURE_LS64 is not defined but should be!"
+#ifdef __ARM_FEATURE_LS64
+#error 

RE: [PATCH]middle-end: inspect all exits for additional annotations for loop.

2024-02-14 Thread Tamar Christina
> 
> I think this isn't entirely good.  For simple cases for
> do {} while the condition ends up in the latch while for while () {}
> loops it ends up in the header.  In your case the latch isn't empty
> so it doesn't end up with the conditional.
> 
> I think your patch is OK to the point of looking at all loop exit
> sources but you should elide the special-casing of header and
> latch since it's really only exit conditionals that matter.
> 

That makes sense, since in both cases the edges are in the respective
blocks.  Should have thought about it more.
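
As a concrete reminder of the two shapes (illustrative only):

  void f (int *p, int n)
  {
    int i = 0;
    do                  /* exit conditional ends up in the latch block */
      p[i] = 0;
    while (++i < n);

    while (i < 2 * n)   /* exit conditional ends up in the header block */
      p[i++] = 1;
  }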

So how about this one.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-cfg.cc (replace_loop_annotate): Inspect loop edges for 
annotations.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-novect_gcond.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c 
b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c
new file mode 100644
index 
..01e69cbef9d51b234c08a400c78dc078d53252f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c
@@ -0,0 +1,39 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break_hw } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#include "tree-vect.h"
+
+#define N 306
+#define NEEDLE 136
+
+int table[N];
+
+__attribute__ ((noipa))
+int foo (int i, unsigned short parse_tables_n)
+{
+  parse_tables_n >>= 9;
+  parse_tables_n += 11;
+#pragma GCC novector
+  while (i < N && parse_tables_n--)
+table[i++] = 0;
+
+  return table[NEEDLE];
+}
+
+int main ()
+{
+  check_vect ();
+
+#pragma GCC novector
+  for (int j = 0; j < N; j++)
+table[j] = -1;
+
+  if (foo (0, 0x) != 0)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 
cdd439fe7506e7bc33654ffa027b493f23d278ac..bdffc3b4ed277724e81b7dd67fe7966e8ece0c13
 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -320,12 +320,9 @@ replace_loop_annotate (void)
 
   for (auto loop : loops_list (cfun, 0))
 {
-  /* First look into the header.  */
-  replace_loop_annotate_in_block (loop->header, loop);
-
-  /* Then look into the latch, if any.  */
-  if (loop->latch)
-   replace_loop_annotate_in_block (loop->latch, loop);
+  /* Check all exit source blocks for annotations.  */
+  for (auto e : get_loop_exit_edges (loop))
+   replace_loop_annotate_in_block (e->src, loop);
 
   /* Push the global flag_finite_loops state down to individual loops.  */
   loop->finite_p = flag_finite_loops;


rb18267.patch
Description: rb18267.patch


[PATCH]middle-end: inspect all exits for additional annotations for loop.

2024-02-14 Thread Tamar Christina
Hi All,

Attaching a pragma to a loop which has a complex condition often gets the pragma
dropped. e.g.

#pragma GCC novector
  while (i < N && parse_tables_n--)

before lowering this is represented as:

 if (ANNOTATE_EXPR ) ...

But after lowering the condition is broken apart and the annotation is attached
to the final component of the expression:

  if (parse_tables_n.2_2 != 0) goto ; else goto ;
  :
iftmp.1D.4452 = 1;
goto ;
  :
iftmp.1D.4452 = 0;
  :
D.4451 = .ANNOTATE (iftmp.1D.4452, 2, 0);
if (D.4451 != 0) goto ; else goto ;
  :

and it's never heard from again because during replace_loop_annotate we only
inspect the loop header and latch for annotations.

Since annotations are supposed to apply to the loop as a whole, this fixes it
by also checking the loop exit source blocks for annotations.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-cfg.cc (replace_loop_annotate): Inspect loop edges for 
annotations.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-novect_gcond.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c 
b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c
new file mode 100644
index 
..01e69cbef9d51b234c08a400c78dc078d53252f1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c
@@ -0,0 +1,39 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break_hw } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#include "tree-vect.h"
+
+#define N 306
+#define NEEDLE 136
+
+int table[N];
+
+__attribute__ ((noipa))
+int foo (int i, unsigned short parse_tables_n)
+{
+  parse_tables_n >>= 9;
+  parse_tables_n += 11;
+#pragma GCC novector
+  while (i < N && parse_tables_n--)
+table[i++] = 0;
+
+  return table[NEEDLE];
+}
+
+int main ()
+{
+  check_vect ();
+
+#pragma GCC novector
+  for (int j = 0; j < N; j++)
+table[j] = -1;
+
+  if (foo (0, 0x) != 0)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 
cdd439fe7506e7bc33654ffa027b493f23d278ac..a29681bffb902d2d05e3f18764ab519aacb3c5bc
 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -327,6 +327,10 @@ replace_loop_annotate (void)
   if (loop->latch)
replace_loop_annotate_in_block (loop->latch, loop);
 
+  /* Then also check all other exits.  */
+  for (auto e : get_loop_exit_edges (loop))
+   replace_loop_annotate_in_block (e->src, loop);
+
   /* Push the global flag_finite_loops state down to individual loops.  */
   loop->finite_p = flag_finite_loops;
 }




-- 





[PATCH]middle-end: update vector loop upper bounds when early break vect [PR113734]

2024-02-13 Thread Tamar Christina
Hi All,

When doing early break vectorization we should treat the final iteration as
possibly being partial.  This is so that when we calculate the vector loop upper
bounds we take into account that the final iteration could have done some work.

The attached testcase shows that if we don't, then cunroll may unroll the loop,
and if the upper bound is wrong we lose a vector iteration.

This is similar to how we adjust the scalar loop bounds for the PEELED case.
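
As a worked illustration (numbers assumed, not taken from the PR): with 10
scalar iterations and VF = 4, an early-break loop can still enter a third
vector iteration that does partial work before the break is taken, so the
recorded upper bound needs to be computed as if that iteration counts:

  /* ceil (10 / 4) = 3   bound when the final iteration may be partial
     floor (10 / 4) = 2  bound if we wrongly assume only full iterations  */

If the bound is recorded as 2, cunroll can drop the final vector iteration.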

Bootstrapped Regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113734
* tree-vect-loop.cc (vect_transform_loop): Treat the final iteration of
an early break loop as partial.

gcc/testsuite/ChangeLog:

PR tree-optimization/113734
* gcc.dg/vect/vect-early-break_117-pr113734.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c
new file mode 100644
index 
..36ae09483dfd426f977a3d92cf24a78d76de6961
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c
@@ -0,0 +1,37 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break_hw } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-O3" } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#include "tree-vect.h"
+
+#define N 306
+#define NEEDLE 136
+
+int table[N];
+
+__attribute__ ((noipa))
+int foo (int i, unsigned short parse_tables_n)
+{
+  parse_tables_n >>= 9;
+  parse_tables_n += 11;
+  while (i < N && parse_tables_n--)
+table[i++] = 0;
+
+  return table[NEEDLE];
+}
+
+int main ()
+{
+  check_vect ();
+
+  for (int j = 0; j < N; j++)
+table[j] = -1;
+
+  if (foo (0, 0x) != 0)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
854e9d78bc71721e6559a6bc5dff78c813603a78..0b1656fef2fed83f30295846c382ad9fb318454a
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -12171,7 +12171,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
*loop_vectorized_call)
   /* True if the final iteration might not handle a full vector's
  worth of scalar iterations.  */
   bool final_iter_may_be_partial
-= LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
+= LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+  || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
   /* The minimum number of iterations performed by the epilogue.  This
  is 1 when peeling for gaps because we always need a final scalar
  iteration.  */




-- 





RE: [PATCH]middle-end: add two debug counters for early-break vectorization debugging

2024-02-08 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Thursday, February 8, 2024 2:16 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com
> Subject: Re: [PATCH]middle-end: add two debug counters for early-break
> vectorization debugging
> 
> On Thu, 8 Feb 2024, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This adds two new debug counter to aid in debugging early break code.
> >
> > - vect_force_last_exit: when reached will always force the final loop exit.
> > - vect_skip_exit: when reached will skip selecting the current candidate 
> > exit
> >   as the loop exit.
> >
> > The first counter essentially allows you to turn off the PEELED case and the
> > second counter to pick a different exit, which may mean you pick no exit at
> > all.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * dbgcnt.def (vect_force_last_exit, vect_skip_exit): New.
> > * tree-vect-loop.cc (vec_init_loop_exit_info): Use them.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
> > index
> ed9f062eac2c28c52df76b39d4312dd9fde1c800..8f7bebf93fceabdf6ae86c2df5
> 91eae4848b8a5c 100644
> > --- a/gcc/dbgcnt.def
> > +++ b/gcc/dbgcnt.def
> > @@ -213,5 +213,7 @@ DEBUG_COUNTER (stv_conversion)
> >  DEBUG_COUNTER (tail_call)
> >  DEBUG_COUNTER (tree_sra)
> >  DEBUG_COUNTER (treepre_insert)
> > +DEBUG_COUNTER (vect_force_last_exit)
> >  DEBUG_COUNTER (vect_loop)
> > +DEBUG_COUNTER (vect_skip_exit)
> >  DEBUG_COUNTER (vect_slp)
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index
> eed2268e9bae7e7ad36d13da03e0b54eab26ef6f..854e9d78bc71721e6559a6bc
> 5dff78c813603a78 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -58,6 +58,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "tree-eh.h"
> >  #include "case-cfn-macros.h"
> >  #include "langhooks.h"
> > +#include "dbgcnt.h"
> >
> >  /* Loop Vectorization Pass.
> >
> > @@ -977,6 +978,20 @@ vec_init_loop_exit_info (class loop *loop)
> >if (exits.length () == 1)
> >  return exits[0];
> >
> > +  /* Check to see if we've been asked to force the last exit.  */
> > +  if (!dbg_cnt (vect_force_last_exit))
> > +{
> > +  basic_block bb = ip_normal_pos (loop);
> > +  if (!bb)
> > +   return NULL;
> > +
> > +  edge exit = EDGE_SUCC (bb, 0);
> > +  if (exit->dest == loop->latch)
> > +   return EDGE_SUCC (bb, 1);
> > +
> > +  return exit;
> 
> Err, that's quite odd.  Why not just below do
> 
> > +}
> > +
> >/* If we have multiple exits we only support counting IV at the moment.
> >   Analyze all exits and return the last one we can analyze.  */
> >class tree_niter_desc niter_desc;
> > @@ -998,6 +1013,7 @@ vec_init_loop_exit_info (class loop *loop)
> >&& exit->src == single_pred (loop->latch)
> >&& (integer_nonzerop (may_be_zero)
> >|| COMPARISON_CLASS_P (may_be_zero
> > + && dbg_cnt (vect_skip_exit)
> 
>   && (dbg_cnt (vect_force_last_exit)
>   || exit->src == single_pred (loop->latch))
> 
> (also computed above already)?  It's also oddly named, it's more like
> vect_allow_peeled_exit or so.

Because this isn't deterministic.  If a loop has n exits the above always forces
you to pick the final one regardless of n, rather than just skipping
consideration of an exit.

And in that case is there a point in analyzing all the exits just to throw away 
the information?

Doing it inside the consideration check would only skip one exit, unless I'm
misunderstanding.

> 
> It's also seemingly redundant with vect_skip_exit, no?
> 
> Note the counter gets incremented even if we'd not consider the exit
> because we have a later candidate already.
> 
> I fear it's going to be quite random even with the debug counter.

It is; I think the first counter is more useful.  But in general the reason I
kept the second counter, which kinda does what was suggested in the RFC I sent
before, is that it should in theory at least allow us to test forcing of a
PEELED case, since we generally prefer the non-PEELED case if possible.

At least that was the intention.

Thanks,
Tamar

> 
> Can you see whether it really helps you?
> 
> >   && (!candidate
> >   || dominated_by_p (CDI_DOMINATORS, exit->src,
> >  candidate->src)))
> >
> >
> >
> >
> >
> 
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


[PATCH]middle-end: add two debug counters for early-break vectorization debugging

2024-02-08 Thread Tamar Christina
Hi All,

This adds two new debug counters to aid in debugging early break code.

- vect_force_last_exit: when reached will always force the final loop exit.
- vect_skip_exit: when reached will skip selecting the current candidate exit
  as the loop exit.

The first counter essentially allows you to turn off the PEELED case and the
second counter to pick a different exit, which may mean you pick no exit at
all.
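
For completeness, a sketch of how one of these would be driven from the command
line (the :0 range is my assumption for "never fire"; see the -fdbg-cnt
documentation for the exact syntax):

  gcc -O3 -fdbg-cnt=vect_force_last_exit:0 -c test.c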

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* dbgcnt.def (vect_force_last_exit, vect_skip_exit): New.
* tree-vect-loop.cc (vec_init_loop_exit_info): Use them.

--- inline copy of patch -- 
diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index 
ed9f062eac2c28c52df76b39d4312dd9fde1c800..8f7bebf93fceabdf6ae86c2df591eae4848b8a5c
 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -213,5 +213,7 @@ DEBUG_COUNTER (stv_conversion)
 DEBUG_COUNTER (tail_call)
 DEBUG_COUNTER (tree_sra)
 DEBUG_COUNTER (treepre_insert)
+DEBUG_COUNTER (vect_force_last_exit)
 DEBUG_COUNTER (vect_loop)
+DEBUG_COUNTER (vect_skip_exit)
 DEBUG_COUNTER (vect_slp)
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
eed2268e9bae7e7ad36d13da03e0b54eab26ef6f..854e9d78bc71721e6559a6bc5dff78c813603a78
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -58,6 +58,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-eh.h"
 #include "case-cfn-macros.h"
 #include "langhooks.h"
+#include "dbgcnt.h"
 
 /* Loop Vectorization Pass.
 
@@ -977,6 +978,20 @@ vec_init_loop_exit_info (class loop *loop)
   if (exits.length () == 1)
 return exits[0];
 
+  /* Check to see if we've been asked to force the last exit.  */
+  if (!dbg_cnt (vect_force_last_exit))
+{
+  basic_block bb = ip_normal_pos (loop);
+  if (!bb)
+   return NULL;
+
+  edge exit = EDGE_SUCC (bb, 0);
+  if (exit->dest == loop->latch)
+   return EDGE_SUCC (bb, 1);
+
+  return exit;
+}
+
   /* If we have multiple exits we only support counting IV at the moment.
  Analyze all exits and return the last one we can analyze.  */
   class tree_niter_desc niter_desc;
@@ -998,6 +1013,7 @@ vec_init_loop_exit_info (class loop *loop)
   && exit->src == single_pred (loop->latch)
   && (integer_nonzerop (may_be_zero)
   || COMPARISON_CLASS_P (may_be_zero
+ && dbg_cnt (vect_skip_exit)
  && (!candidate
  || dominated_by_p (CDI_DOMINATORS, exit->src,
 candidate->src)))




-- 





RE: [PATCH]middle-end: don't cache restart_loop in vectorizable_live_operations [PR113808]

2024-02-08 Thread Tamar Christina
> Please either drop lastprivate(k) clause or use linear(k:1)
> The iteration var of simd loop without collapse or with
> collapse(1) is implicitly linear with the step, and even linear
> means the value from the last iteration can be used after the
> simd construct.  Overriding the data sharing to something different
> has been only added recently to OpenMP and isn't really needed here.
> 

Sorry, I know very little about Fortran; is this ok?

Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR tree-optimization/113808
* gfortran.dg/vect/vect-early-break_1-PR113808.f90: Moved to...
* gfortran.dg/vect/vect-early-break_1-pr113808.f90: ...here.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 
b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-pr113808.f90
similarity index 93%
rename from gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90
rename to gcc/testsuite/gfortran.dg/vect/vect-early-break_1-pr113808.f90
index 
5c339fa7a348fac5527bbbf456a535da96b5c1ed..6f92e9095bdee08a5a9db2816f57da6c14d91b11
 100644
--- a/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90
+++ b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-pr113808.f90
@@ -9,7 +9,7 @@ program main
   integer :: n, i,k
   n = 11
   do i = 1, n,2
-!$omp simd lastprivate(k)
+!$omp simd
 do k = 1, i + 41
   if (k > 11 + 41 .or. k < 1) error stop
 end do


rb18253.patch
Description: rb18253.patch


[PATCH]middle-end: don't cache restart_loop in vectorizable_live_operations [PR113808]

2024-02-08 Thread Tamar Christina
Hi All,

There's a bug in vectorizable_live_operation that restart_loop is defined
outside the loop.

This variable is supposed to indicate whether we are doing a first or last
index reduction.  The problem is that by defining it outside the loop it becomes
dependent on the order we visit the USE/DEFs.

In the given example, the loop isn't PEELED, but we visit the early exit uses
first.  This then sets the boolean to true and it can't get to false again.

So when we visit the main exit we still treat it as an early exit for that
SSA name.

This cleans it up and renames the variables to something that's hopefully
clearer as to their intention.
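For illustration, a self-contained toy showing the caching bug described
above (not GCC code; every name in it is made up):

  #include <stdbool.h>
  #include <stdio.h>

  /* Stand-in for "this use is not reached via the main exit edge".  */
  static bool is_early_exit_use (int use) { return use < 0; }

  int
  main (void)
  {
    int uses[2] = { -1, 2 };    /* the early-exit use is visited first */
    bool restart_loop = false;  /* buggy: cached across iterations */
    for (int i = 0; i < 2; i++)
      {
        restart_loop = restart_loop || is_early_exit_use (uses[i]);
        /* Once true the flag never returns to false, so the second
           (main-exit) use is also treated as an early exit.  */
        printf ("use %d treated as early exit: %d\n", uses[i], restart_loop);
      }
    return 0;
  }

The patch below drops the accumulation and decides this per use instead.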

Bootstrapped Regtested on aarch64-none-linux-gnu and
x86_64-pc-linux-gnu no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113808
* tree-vect-loop.cc (vectorizable_live_operation): Don't cache the 
value cross iterations.

gcc/testsuite/ChangeLog:

PR tree-optimization/113808
* gfortran.dg/vect/vect-early-break_1-PR113808.f90: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 
b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90
new file mode 100644
index 
..5c339fa7a348fac5527bbbf456a535da96b5c1ed
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90
@@ -0,0 +1,21 @@
+! { dg-add-options vect_early_break }
+! { dg-require-effective-target vect_early_break }
+! { dg-require-effective-target vect_long_long }
+! { dg-additional-options "-fopenmp-simd" }
+
+! { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } }
+
+program main
+  integer :: n, i,k
+  n = 11
+  do i = 1, n,2
+!$omp simd lastprivate(k)
+do k = 1, i + 41
+  if (k > 11 + 41 .or. k < 1) error stop
+end do
+  end do
+  if (k /= 53) then
+print *, k, 53
+error stop
+  endif
+end
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
190df9ec7741fd05aa0b9abe150baf06b2ca9a57..eed2268e9bae7e7ad36d13da03e0b54eab26ef6f
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -10950,7 +10950,7 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
 did.  For the live values we want the value at the start of the 
iteration
 rather than at the end.  */
   edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
-  bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
+  bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED 
(loop_vinfo);
   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
if (!is_gimple_debug (use_stmt)
&& !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
@@ -10966,8 +10966,7 @@ vectorizable_live_operation (vec_info *vinfo, 
stmt_vec_info stmt_info,
  /* For early exit where the exit is not in the BB that leads
 to the latch then we're restarting the iteration in the
 scalar loop.  So get the first live value.  */
- restart_loop = restart_loop || !main_exit_edge;
- if (restart_loop
+ if ((all_exits_as_early_p || !main_exit_edge)
  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
{
  tmp_vec_lhs = vec_lhs0;




[PATCH][committed]middle-end: fix pointer conversion error in testcase vect-early-break_110-pr113467.c

2024-02-08 Thread Tamar Christina
Hi All,

I had missed a conversion from unsigned long to uint64_t.
This fixes the failing test on -m32.

Regtested on x86_64-pc-linux-gnu with -m32 and no issues.

Committed as obvious.

Thanks,
Tamar

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-early-break_110-pr113467.c: Change unsigned long *
to uint64_t *.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c
index 
1e2c47be5fdf1e1fed88e4b5f45d7eda6c3b85d1..12d0ea1e871b51742c040c909ea5741bc820206e
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c
@@ -10,7 +10,7 @@
 typedef struct gcry_mpi *gcry_mpi_t;
 struct gcry_mpi {
   int nlimbs;
-  unsigned long *d;
+  uint64_t *d;
 };
 
 long gcry_mpi_add_ui_up;









RE: [PATCH]middle-end: fix ICE when moving statements to empty BB [PR113731]

2024-02-05 Thread Tamar Christina
> It looks like LOOP_VINFO_EARLY_BRK_STORES is "reverse"?  Is that
> why you are doing gsi_move_before + gsi_prev?  Why do gsi_prev
> at all?
> 

As discussed on IRC, then how about this one.
Incremental building passed all tests and bootstrap is running.

Ok for master if bootstrap and regtesting clean?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113731
* gimple-iterator.cc (gsi_move_before): Take new parameter for update
method.
* gimple-iterator.h (gsi_move_before): Default new param to
GSI_SAME_STMT.
* tree-vect-loop.cc (move_early_exit_stmts): Call gsi_move_before with
GSI_NEW_STMT.

gcc/testsuite/ChangeLog:

PR tree-optimization/113731
* gcc.dg/vect/vect-early-break_111-pr113731.c: New test.

--- inline copy of patch ---

diff --git a/gcc/gimple-iterator.cc b/gcc/gimple-iterator.cc
index 
517c53376f0511af59e124f52ec7be566a6c4789..f67bcfbfdfdd7c6cb0ad0130972f5b1dc4429bcf
 100644
--- a/gcc/gimple-iterator.cc
+++ b/gcc/gimple-iterator.cc
@@ -666,10 +666,11 @@ gsi_move_after (gimple_stmt_iterator *from, 
gimple_stmt_iterator *to)
 
 
 /* Move the statement at FROM so it comes right before the statement
-   at TO.  */
+   at TO using method M.  */
 
 void
-gsi_move_before (gimple_stmt_iterator *from, gimple_stmt_iterator *to)
+gsi_move_before (gimple_stmt_iterator *from, gimple_stmt_iterator *to,
+gsi_iterator_update m = GSI_SAME_STMT)
 {
   gimple *stmt = gsi_stmt (*from);
   gsi_remove (from, false);
@@ -677,7 +678,7 @@ gsi_move_before (gimple_stmt_iterator *from, 
gimple_stmt_iterator *to)
   /* For consistency with gsi_move_after, it might be better to have
  GSI_NEW_STMT here; however, that breaks several places that expect
  that TO does not change.  */
-  gsi_insert_before (to, stmt, GSI_SAME_STMT);
+  gsi_insert_before (to, stmt, m);
 }
 
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c
new file mode 100644
index 
..2d6db91df97625a7f11609d034e89af0461129b2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+char* inet_net_pton_ipv4_bits;
+char inet_net_pton_ipv4_odst;
+void __errno_location();
+void inet_net_pton_ipv4();
+void inet_net_pton() { inet_net_pton_ipv4(); }
+void inet_net_pton_ipv4(char *dst, int size) {
+  while ((inet_net_pton_ipv4_bits > dst) & inet_net_pton_ipv4_odst) {
+if (size-- <= 0)
+  goto emsgsize;
+*dst++ = '\0';
+  }
+emsgsize:
+  __errno_location();
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
30b90d99925bea74caf14833d8ab1695607d0fe9..9aba94bd6ca2061a19487ac4a2735a16d03bcbee
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -11800,8 +11800,7 @@ move_early_exit_stmts (loop_vec_info loop_vinfo)
dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
 
   gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
-  gsi_move_before (&stmt_gsi, &dest_gsi);
-  gsi_prev (&dest_gsi);
+  gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
 }
 
   /* Update all the stmts with their new reaching VUSES.  */


rb18247.patch
Description: rb18247.patch


RE: [PATCH]middle-end: add additional runtime test for [PR113467]

2024-02-05 Thread Tamar Christina
> > Ok for master?
> 
> I think you need a lp64 target check for the large constants or
> alternatively use uint64_t?
> 

Ok, how about this one.

Regtested on x86_64-pc-linux-gnu with -m32,-m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR tree-optimization/113467
* gcc.dg/vect/vect-early-break_110-pr113467.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c
new file mode 100644
index 
..1e2c47be5fdf1e1fed88e4b5f45d7eda6c3b85d1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c
@@ -0,0 +1,52 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_long_long } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#include "tree-vect.h"
+#include <stdint.h>
+
+typedef struct gcry_mpi *gcry_mpi_t;
+struct gcry_mpi {
+  int nlimbs;
+  unsigned long *d;
+};
+
+long gcry_mpi_add_ui_up;
+void gcry_mpi_add_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned v) {
+  gcry_mpi_add_ui_up = *w->d;
+  if (u) {
+uint64_t *res_ptr = w->d, *s1_ptr = w->d;
+int s1_size = u->nlimbs;
+unsigned s2_limb = v, x = *s1_ptr++;
+s2_limb += x;
+*res_ptr++ = s2_limb;
+if (x)
+  while (--s1_size) {
+x = *s1_ptr++ + 1;
+*res_ptr++ = x;
+if (x) {
+  break;
+}
+  }
+  }
+}
+
+int main()
+{
+  check_vect ();
+
+  static struct gcry_mpi sv;
+  static uint64_t vals[] = {4294967288ULL, 191ULL,4160749568ULL, 
4294963263ULL,
+127ULL,4294950912ULL, 255ULL,
4294901760ULL,
+534781951ULL,  33546240ULL,   4294967292ULL, 
4294960127ULL,
+4292872191ULL, 4294967295ULL, 4294443007ULL, 3ULL};
+  gcry_mpi_t v = &sv;
+  v->nlimbs = 16;
+  v->d = vals;
+
+  gcry_mpi_add_ui(v, v, 8);
+  if (v->d[1] != 192)
+__builtin_abort();
+}


rb18246.patch
Description: rb18246.patch


RE: [PATCH]middle-end: fix ICE when moving statements to empty BB [PR113731]

2024-02-05 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Monday, February 5, 2024 1:22 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com
> Subject: Re: [PATCH]middle-end: fix ICE when moving statements to empty BB
> [PR113731]
> 
> On Mon, 5 Feb 2024, Tamar Christina wrote:
> 
> > Hi All,
> >
> > We use gsi_move_before (&stmt_gsi, &dest_gsi); to request that the new
> statement
> > be placed before any other statement.  Typically this then moves the current
> > pointer to be after the statement we just inserted.
> >
> > However it looks like when the BB is empty, this does not happen and the CUR
> > pointer stays NULL.   There's a comment in the source of gsi_insert_before 
> > that
> > explains:
> >
> > /* If CUR is NULL, we link at the end of the sequence (this case happens
> >
> > so it adds it to the end instead of start like you asked.  This means that 
> > in
> > this case there's nothing to move and so we shouldn't move the pointer if 
> > we're
> > already at the HEAD.
> 
> The issue is that a gsi_end_p () is ambiguous, it could be the start
> or the end.  gsi_insert_before treats it as "end" while gsi_insert_after
> treats it as "start" since you can't really insert "after" the "end".
> 
> gsi_move_before doesn't update the insertion pointer (using
> GSI_SAME_STMT), so with a gsi_end_p () you get what you ask for.
> 
> Btw,
> 
>   /* Move all stmts that need moving.  */
>   basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
>   gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb);
> 
> should probably use gsi_after_labels (dest_bb) just in case.

See next patch.

> 
> It looks like LOOP_VINFO_EARLY_BRK_STORES is "reverse"?  Is that
> why you are doing gsi_move_before + gsi_prev?  Why do gsi_prev
> at all?
> 

Yes, it stores them in reverse because we record them from the latch on up.
So we either have to iterate backwards, insert them at the front, or move the gsi.

I guess I could remove it by removing the for-each loop and iterating in
reverse.  Is that preferred?
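For reference, a sketch of the reverse-iteration alternative mentioned above
(illustrative only: it assumes LOOP_VINFO_EARLY_BRK_STORES really is in
reverse program order, and it folds in the gsi_after_labels suggestion from
the quoted review):

  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  auto &stores = LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo);
  for (unsigned i = stores.length (); i-- > 0; )
    {
      /* Walking the vector back to front visits the stores in program
         order, so each one can simply be placed before DEST_GSI without
         ever touching the iterator.  */
      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stores[i]);
      gsi_move_before (&stmt_gsi, &dest_gsi);
    }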

Tamar.

> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > PR tree-optimization/113731
> > * tree-vect-loop.cc (move_early_exit_stmts): Conditionally move pointer.
> >
> > gcc/testsuite/ChangeLog:
> >
> > PR tree-optimization/113731
> > * gcc.dg/vect/vect-early-break_111-pr113731.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c
> b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c
> > new file mode 100644
> > index
> ..2d6db91df97625a7f1160
> 9d034e89af0461129b2
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-add-options vect_early_break } */
> > +/* { dg-require-effective-target vect_early_break } */
> > +/* { dg-require-effective-target vect_int } */
> > +
> > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> > +
> > +char* inet_net_pton_ipv4_bits;
> > +char inet_net_pton_ipv4_odst;
> > +void __errno_location();
> > +void inet_net_pton_ipv4();
> > +void inet_net_pton() { inet_net_pton_ipv4(); }
> > +void inet_net_pton_ipv4(char *dst, int size) {
> > +  while ((inet_net_pton_ipv4_bits > dst) & inet_net_pton_ipv4_odst) {
> > +if (size-- <= 0)
> > +  goto emsgsize;
> > +*dst++ = '\0';
> > +  }
> > +emsgsize:
> > +  __errno_location();
> > +}
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index
> 30b90d99925bea74caf14833d8ab1695607d0fe9..e2587315020a35a7d4ebd3e
> 7a9842caa36bb5d3c 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -11801,7 +11801,8 @@ move_early_exit_stmts (loop_vec_info loop_vinfo)
> >
> >gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
> >gsi_move_before (&stmt_gsi, &dest_gsi);
> > -  gsi_prev (&dest_gsi);
> > +  if (!gsi_end_p (dest_gsi))
> > +   gsi_prev (&dest_gsi);
> >  }
> >
> >/* Update all the stmts with their new reaching VUSES.  */
> >
> >
> >
> >
> >
> 
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


[PATCH]middle-end: fix ICE when destination BB for stores starts with a label [PR113750]

2024-02-05 Thread Tamar Christina
Hi All,

The report shows that if the FE leaves a label as the first thing in the dest
BB then we ICE because we move the stores before the label.

This is easy to fix if we know that there's still only one way into the BB.
We would have already rejected the loop if there were multiple paths into the BB;
however, I added an additional check (with an explanation) just for early break in
case the other constraints are relaxed later.

After that we fix the issue just by getting the GSI after the labels and I add
a bunch of testcases for different positions the label can be added.  Only the
vect-early-break_112-pr113750.c one results in the label being kept.
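A minimal sketch of that part of the change (assumption: the surrounding
move_early_exit_stmts code stays as in the earlier patches in this thread):

  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  /* Start after any leading labels so the moved stores can never end up
     in front of a label the front end left in place.  */
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);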

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113750
* tree-vect-data-refs.cc (vect_analyze_early_break_dependences): Check
for single predecessor when doing early break vect.
* tree-vect-loop.cc (move_early_exit_stmts): Get gsi at the start but
after labels.

gcc/testsuite/ChangeLog:

PR tree-optimization/113750
* gcc.dg/vect/vect-early-break_112-pr113750.c: New test.
* gcc.dg/vect/vect-early-break_113-pr113750.c: New test.
* gcc.dg/vect/vect-early-break_114-pr113750.c: New test.
* gcc.dg/vect/vect-early-break_115-pr113750.c: New test.
* gcc.dg/vect/vect-early-break_116-pr113750.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_112-pr113750.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_112-pr113750.c
new file mode 100644
index 
..559ebd84d5c39881e694e7c8c31be29d846866ed
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_112-pr113750.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 800
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] != x)
+ break;
+foo:
+   vect_a[i] = x;
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_113-pr113750.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_113-pr113750.c
new file mode 100644
index 
..ba85780a46b1378aaec238ff9eb5f906be9a44dd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_113-pr113750.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 800
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+   if (vect_a[i] != x)
+ break;
+   vect_a[i] = x;
+foo:
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_114-pr113750.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_114-pr113750.c
new file mode 100644
index 
..37af2998688f5d60e2cdb372ab43afcaa52a3146
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_114-pr113750.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 800
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+   vect_b[i] = x + i;
+foo:
+   if (vect_a[i] != x)
+ break;
+   vect_a[i] = x;
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_115-pr113750.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_115-pr113750.c
new file mode 100644
index 
..502686d308e298cd84e9e3b74d7b4ad1979602a9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_115-pr113750.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+#ifndef N
+#define N 800
+#endif
+unsigned vect_a[N];
+unsigned vect_b[N];
+
+unsigned test4(unsigned x)
+{
+ unsigned ret = 0;
+ for (int i = 0; i < N; i++)
+ {
+foo:
+   vect_b[i] = x + i;
+   if (vect_a[i] != x)
+ break;
+   vect_a[i] = x;
+ }
+ return ret;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_116-pr113750.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_116-pr113750.c
new file mode 

[PATCH]middle-end: fix ICE when moving statements to empty BB [PR113731]

2024-02-05 Thread Tamar Christina
Hi All,

We use gsi_move_before (&stmt_gsi, &dest_gsi); to request that the new statement
be placed before any other statement.  Typically this then moves the current
pointer to be after the statement we just inserted.

However it looks like when the BB is empty, this does not happen and the CUR
pointer stays NULL.   There's a comment in the source of gsi_insert_before that
explains:

/* If CUR is NULL, we link at the end of the sequence (this case happens

so it adds it to the end instead of start like you asked.  This means that in
this case there's nothing to move and so we shouldn't move the pointer if we're
already at the HEAD.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113731
* tree-vect-loop.cc (move_early_exit_stmts): Conditionally move pointer.

gcc/testsuite/ChangeLog:

PR tree-optimization/113731
* gcc.dg/vect/vect-early-break_111-pr113731.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c
new file mode 100644
index 
..2d6db91df97625a7f11609d034e89af0461129b2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
+char* inet_net_pton_ipv4_bits;
+char inet_net_pton_ipv4_odst;
+void __errno_location();
+void inet_net_pton_ipv4();
+void inet_net_pton() { inet_net_pton_ipv4(); }
+void inet_net_pton_ipv4(char *dst, int size) {
+  while ((inet_net_pton_ipv4_bits > dst) & inet_net_pton_ipv4_odst) {
+if (size-- <= 0)
+  goto emsgsize;
+*dst++ = '\0';
+  }
+emsgsize:
+  __errno_location();
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
30b90d99925bea74caf14833d8ab1695607d0fe9..e2587315020a35a7d4ebd3e7a9842caa36bb5d3c
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -11801,7 +11801,8 @@ move_early_exit_stmts (loop_vec_info loop_vinfo)
 
   gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
   gsi_move_before (&stmt_gsi, &dest_gsi);
-  gsi_prev (&dest_gsi);
+  if (!gsi_end_p (dest_gsi))
+   gsi_prev (&dest_gsi);
 }
 
   /* Update all the stmts with their new reaching VUSES.  */









[PATCH]middle-end: add additional runtime test for [PR113467]

2024-02-05 Thread Tamar Christina
Hi All,

This just adds an additional runtime testcase for the fixed issue.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR tree-optimization/113467
* gcc.dg/vect/vect-early-break_110-pr113467.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c
new file mode 100644
index 
..2d8a071c0e922ccfd5fa8c7b2704852dbd95
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c
@@ -0,0 +1,51 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#include "tree-vect.h"
+
+typedef struct gcry_mpi *gcry_mpi_t;
+struct gcry_mpi {
+  int nlimbs;
+  unsigned long *d;
+};
+
+long gcry_mpi_add_ui_up;
+void gcry_mpi_add_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned v) {
+  gcry_mpi_add_ui_up = *w->d;
+  if (u) {
+unsigned long *res_ptr = w->d, *s1_ptr = w->d;
+int s1_size = u->nlimbs;
+unsigned s2_limb = v, x = *s1_ptr++;
+s2_limb += x;
+*res_ptr++ = s2_limb;
+if (x)
+  while (--s1_size) {
+x = *s1_ptr++ + 1;
+*res_ptr++ = x;
+if (x) {
+  break;
+}
+  }
+  }
+}
+
+int main()
+{
+  check_vect ();
+
+  static struct gcry_mpi sv;
+  static unsigned long vals[] = {4294967288, 191,4160749568, 
4294963263,
+ 127,4294950912, 255,
4294901760,
+ 534781951,  33546240,   4294967292, 
4294960127,
+ 4292872191, 4294967295, 4294443007, 3};
+  gcry_mpi_t v = &sv;
+  v->nlimbs = 16;
+  v->d = vals;
+
+  gcry_mpi_add_ui(v, v, 8);
+  if (v->d[1] != 192)
+__builtin_abort();
+}









RE: [PATCH]middle-end: check memory accesses in the destination block [PR113588].

2024-02-01 Thread Tamar Christina
> >
> > If the above is correct then I think I understand what you're saying and
> > will update the patch and do some checks.
> 
> Yes, I think that's what I wanted to say.
> 

As discussed:

Bootstrapped Regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu no 
issues.
Also checked both with --enable-lto --with-build-config='bootstrap-O3 
bootstrap-lto' --enable-multilib
and --enable-lto --with-build-config=bootstrap-O3 
--enable-checking=release,yes,rtl,extra;
and checked the libcrypt testsuite as reported on PR113467.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113588
PR tree-optimization/113467
* tree-vect-data-refs.cc (vect_analyze_data_ref_dependence): Choose correct dest and fix checks.
(vect_analyze_early_break_dependences): Update comments.

gcc/testsuite/ChangeLog:

PR tree-optimization/113588
PR tree-optimization/113467
* gcc.dg/vect/vect-early-break_108-pr113588.c: New test.
* gcc.dg/vect/vect-early-break_109-pr113588.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c
new file mode 100644
index 
..e488619c9aac41fafbcf479818392a6bb7c6924f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+int foo (const char *s, unsigned long n)
+{
+ unsigned long len = 0;
+ while (*s++ && n--)
+   ++len;
+ return len;
+}
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c
new file mode 100644
index 
..488c19d3ede809631d1a7ede0e7f7bcdc7a1ae43
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c
@@ -0,0 +1,44 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target mmap } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "tree-vect.h"
+
+__attribute__((noipa))
+int foo (const char *s, unsigned long n)
+{
+ unsigned long len = 0;
+ while (*s++ && n--)
+   ++len;
+ return len;
+}
+
+int main()
+{
+
+  check_vect ();
+
+  long pgsz = sysconf (_SC_PAGESIZE);
+  void *p = mmap (NULL, pgsz * 3, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
+  if (p == MAP_FAILED)
+return 0;
+  mprotect (p, pgsz, PROT_NONE);
+  mprotect (p+2*pgsz, pgsz, PROT_NONE);
+  char *p1 = p + pgsz;
+  p1[0] = 1;
+  p1[1] = 0;
+  foo (p1, 1000);
+  p1 = p + 2*pgsz - 2;
+  p1[0] = 1;
+  p1[1] = 0;
+  foo (p1, 1000);
+  return 0;
+}
+
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 
f592aeb8028afd4fd70e2175104efab2a2c0d82e..53fdfc25d7dc2deb7788176252697d2e45fc
 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -619,10 +619,10 @@ vect_analyze_data_ref_dependence (struct 
data_dependence_relation *ddr,
   return opt_result::success ();
 }
 
-/* Funcion vect_analyze_early_break_dependences.
+/* Function vect_analyze_early_break_dependences.
 
-   Examime all the data references in the loop and make sure that if we have
-   mulitple exits that we are able to safely move stores such that they become
+   Examine all the data references in the loop and make sure that if we have
+   multiple exits that we are able to safely move stores such that they become
safe for vectorization.  The function also calculates the place where to 
move
the instructions to and computes what the new vUSE chain should be.
 
@@ -639,7 +639,7 @@ vect_analyze_data_ref_dependence (struct 
data_dependence_relation *ddr,
  - Multiple loads are allowed as long as they don't alias.
 
NOTE:
- This implemementation is very conservative. Any overlappig loads/stores
+ This implementation is very conservative. Any overlapping loads/stores
  that take place before the early break statement gets rejected aside from
  WAR dependencies.
 
@@ -668,7 +668,6 @@ vect_analyze_early_break_dependences (loop_vec_info 
loop_vinfo)
   auto_vec bases;
   basic_block dest_bb = NULL;
 
-  hash_set  visited;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   class loop *loop_nest = loop_outer (loop);
 
@@ -677,19 +676,33 @@ vect_analyze_early_break_dependences (loop_vec_info 
loop_vinfo)
 "loop contains multiple exits, analyzing"
 " statement dependencies.\n");
 
+  if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
+if (dump_enabled_p ())
+  dump_printf_loc (MSG_NOTE, vect_location,
+

RE: [PATCH]AArch64: update vget_set_lane_1.c test output

2024-02-01 Thread Tamar Christina
> -Original Message-
> From: Richard Sandiford 
> Sent: Thursday, February 1, 2024 2:24 PM
> To: Andrew Pinski 
> Cc: Tamar Christina ; gcc-patches@gcc.gnu.org; nd
> ; Richard Earnshaw ; Marcus
> Shawcroft ; Kyrylo Tkachov
> 
> Subject: Re: [PATCH]AArch64: update vget_set_lane_1.c test output
> 
> Andrew Pinski  writes:
> > On Thu, Feb 1, 2024 at 1:26 AM Tamar Christina 
> wrote:
> >>
> >> Hi All,
> >>
> >> In the vget_set_lane_1.c test the following entries now generate a zip1 
> >> instead
> of an INS
> >>
> >> BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
> >> BUILD_TEST (int32x2_t,   int32x2_t,   , , s32, 1, 0)
> >> BUILD_TEST (uint32x2_t,  uint32x2_t,  , , u32, 1, 0)
> >>
> >> This is because the non-Q variant for indices 0 and 1 are just shuffling 
> >> values.
> >> There is no perf difference between INS SIMD to SIMD and ZIP, as such just
> update the
> >> test file.
> > Hmm, is this true on all cores? I suspect there is a core out there
> > where INS is implemented with a much lower latency than ZIP.
> > If we look at config/aarch64/thunderx.md, we can see INS is 2 cycles
> > while ZIP is 6 cycles (3/7 for q versions).
> > Now I don't have any invested interest in that core any more but I
> > just wanted to point out that is not exactly true for all cores.
> 
> Thanks for the pointer.  In that case, perhaps we should prefer
> aarch64_evpc_ins over aarch64_evpc_zip in aarch64_expand_vec_perm_const_1?
> That's enough to fix this failure, but it'll probably require other
> tests to be adjusted...

I think that, given ThunderX is a 10 year old micro-architecture with several
cases where often-used instructions have very high latencies, generic codegen
should not be blocked from progressing because of it.

We use zips in many things, and if thunderx codegen is really of that much
importance then I think the old codegen should be gated behind -mcpu=thunderx
rather than preventing generic changes.

Regards,
Tamar.

> 
> Richard


[PATCH]AArch64: update vget_set_lane_1.c test output

2024-02-01 Thread Tamar Christina
Hi All,

In the vget_set_lane_1.c test the following entries now generate a zip1 instead 
of an INS

BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
BUILD_TEST (int32x2_t,   int32x2_t,   , , s32, 1, 0)
BUILD_TEST (uint32x2_t,  uint32x2_t,  , , u32, 1, 0)

This is because the non-Q variant for indices 0 and 1 are just shuffling values.
There is no perf difference between INS SIMD to SIMD and ZIP, as such just 
update the
test file.
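For illustration, the kind of lane copy the affected BUILD_TEST lines expand
to (the function name is made up; the intrinsics are standard arm_neon.h).
For a 64-bit vector, copying lane 0 of B into lane 1 of A is a pure
two-element permute, so either instruction is a valid lowering:

  #include <arm_neon.h>

  float32x2_t
  copy_lane (float32x2_t a, float32x2_t b)
  {
    /* May assemble to "ins v0.s[1], v1.s[0]" or, after this change,
       "zip1 v0.2s, v0.2s, v1.2s"; both yield { a[0], b[0] }.  */
    return vset_lane_f32 (vget_lane_f32 (b, 0), a, 1);
  }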

Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vget_set_lane_1.c: Update test output.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c 
b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c
index 
07a77de319206c5c6dad1c0d2d9bcc998583f9c1..a3978f68e4ff5899f395a98615a5e86c3b1389cb
 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c
@@ -22,7 +22,7 @@ BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2)
 BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0)
 BUILD_TEST (int32x2_t,   int32x2_t,   , , s32, 1, 0)
 BUILD_TEST (uint32x2_t,  uint32x2_t,  , , u32, 1, 0)
-/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } 
} */
+/* { dg-final { scan-assembler-times "zip1\\tv0.2s, v0.2s, v1.2s" 3 } } */
 
 BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15)
 BUILD_TEST (int8x8_t,  int8x16_t,  , q, s8, 7, 15)









[PATCH 2/2][libsanitizer] hwasan: Remove testsuite check for a complaint message [PR112644]

2024-01-31 Thread Tamar Christina
Hi All,

With recent updates to the hwasan runtime libraries, the error reporting for
this particular check has been reworked.

I would question why it has lost this message.  To me it looks strange
that num_descriptions_printed is incremented whenever we call
PrintHeapOrGlobalCandidate whether that function prints anything or not.
(See PrintAddressDescription in libsanitizer/hwasan/hwasan_report.cpp).

The message is no longer printed because we increment this
num_descriptions_printed variable indicating that we have found some
description.

I would like to question this upstream, but it doesn't look like that much of
a problem, and if pressed for time we should just change our testsuite.
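A self-contained toy of the pattern being questioned (hypothetical, not the
upstream source; see PrintAddressDescription in
libsanitizer/hwasan/hwasan_report.cpp for the real code):

  #include <stdio.h>

  static int num_descriptions_printed;

  /* Stand-in for PrintHeapOrGlobalCandidate: it may print nothing at all,
     yet the caller below still counts it as a printed description.  */
  static void print_heap_or_global_candidate (int found)
  {
    if (found)
      printf ("Candidate description ...\n");
  }

  int
  main (void)
  {
    print_heap_or_global_candidate (0);
    num_descriptions_printed++;  /* bumped whether anything was printed or not */
    if (num_descriptions_printed == 0)  /* so this is never reached any more */
      printf ("HWAddressSanitizer can not describe address in more detail.\n");
    return 0;
  }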
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR sanitizer/112644
* c-c++-common/hwasan/hwasan-thread-clears-stack.c: Update testcase.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c 
b/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c
index 
09c72a56f0f50a8c301d89217aa8c7df70087e6c..6c70684d72a887c49b02ecb17ca097da81a9168f
 100644
--- a/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c
+++ b/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c
@@ -52,5 +52,4 @@ main (int argc, char **argv)
 
 /* { dg-output "HWAddressSanitizer: tag-mismatch on address 0x\[0-9a-f\]*.*" } 
*/
 /* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: 
\[\[:xdigit:\]\]\[\[:xdigit:\]\]/00 \\(ptr/mem\\) in thread T0.*" } */
-/* { dg-output "HWAddressSanitizer can not describe address in more 
detail\..*" } */
 /* { dg-output "SUMMARY: HWAddressSanitizer: tag-mismatch \[^\n\]*.*" } */









[PATCH 1/2][libsanitizer] hwasan: Remove testsuite check for a complaint message [PR112644]

2024-01-31 Thread Tamar Christina
Hi All,

Recent libhwasan updates[1] intercept various string and memory functions.
These functions have checking in them, which means there's no need to
inline the checking.

This patch marks said functions as intercepted, and adjusts a testcase
to handle the difference.  It also looks for HWASAN in a check in
expand_builtin.  This check originally is there to avoid using expand to
inline the behaviour of builtins like memset which are intercepted by
ASAN and hence which we rely on the function call staying as a function
call.  With the new reliance on function calls in HWASAN we need to do
the same thing for HWASAN too.

HWASAN and ASAN don't seem to however instrument the same functions.

Looking into 
libsanitizer/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc
it looks like the common ones are memset, memmove and memcpy.

The rest of the routines for asan seem to be defined in
compiler-rt/lib/asan/asan_interceptors.h however compiler-rt/lib/hwasan/
does not have such a file but it does have
compiler-rt/lib/hwasan/hwasan_platform_interceptors.h which it looks like is
forcing off everything but memset, memmove, memcpy, memcmp and bcmp.

As such I've taken those as the final list that hwasan currently supports.
This also means that on future updates this list should be cross checked.

[1] 
https://discourse.llvm.org/t/hwasan-question-about-the-recent-interceptors-being-added/75351

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR sanitizer/112644
* asan.h (asan_intercepted_p): Incercept memset, memmove, memcpy and
memcmp.
* builtins.cc (expand_builtin): Include HWASAN when checking for
builtin inlining.

gcc/testsuite/ChangeLog:

PR sanitizer/112644
* c-c++-common/hwasan/builtin-special-handling.c: Update testcase.

Co-Authored-By: Matthew Malcomson 

--- inline copy of patch -- 
diff --git a/gcc/asan.h b/gcc/asan.h
index 
82811bdbe697665652aba89f2ee1c3ac07970df9..d1bf8b1e701b15525c6a900d324f2aebfb778cba
 100644
--- a/gcc/asan.h
+++ b/gcc/asan.h
@@ -185,8 +185,13 @@ extern hash_set *asan_handled_variables;
 inline bool
 asan_intercepted_p (enum built_in_function fcode)
 {
+  /* This list should be kept up-to-date with upstream's version at
+ compiler-rt/lib/hwasan/hwasan_platform_interceptors.h.  */
   if (hwasan_sanitize_p ())
-return false;
+return fcode == BUILT_IN_MEMCMP
+|| fcode == BUILT_IN_MEMCPY
+|| fcode == BUILT_IN_MEMMOVE
+|| fcode == BUILT_IN_MEMSET;
 
   return fcode == BUILT_IN_INDEX
 || fcode == BUILT_IN_MEMCHR
diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 
a0bd82c7981c05caf2764de70c62fe83bef9ad29..12cc7a54e99555d0f4b21fa2cc32ffa7bb548f18
 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -7792,7 +7792,8 @@ expand_builtin (tree exp, rtx target, rtx subtarget, 
machine_mode mode,
   default:
break;
   }
-  if (sanitize_flags_p (SANITIZE_ADDRESS) && asan_intercepted_p (fcode))
+  if (sanitize_flags_p (SANITIZE_ADDRESS | SANITIZE_HWADDRESS)
+ && asan_intercepted_p (fcode))
 return expand_call (exp, target, ignore);
 
   /* When not optimizing, generate calls to library functions for a certain
diff --git a/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c 
b/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c
index 
a7a6d91693ae48c20f33ab28f28d27b01af4722c..f975b1cc397bc0d6fd475dbfed5ccc8ac386
 100644
--- a/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c
+++ b/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c
@@ -8,24 +8,24 @@
 /* { dg-skip-if "" { *-*-* }  { "-flto" } { "-flto-partition=none" } } */
 
 typedef __SIZE_TYPE__ size_t;
-/* Functions to observe that HWASAN instruments memory builtins in the expected
-   manner.  */
+/* HWASAN used to instrument calls to memset, memcpy, and memmove.  It no
+   longer does this.  Many other string and memory builtins are intercepted by
+   the runtime (and hence the codegen need not do anything).  */
 void * __attribute__((noinline))
 memset_builtin (void *dest, int value, size_t len)
 {
   return __builtin_memset (dest, value, len);
 }
 
-/* HWASAN avoids strlen because it doesn't know the size of the memory access
-   until *after* the function call.  */
 size_t __attribute__ ((noinline))
 strlen_builtin (char *element)
 {
   return __builtin_strlen (element);
 }
 
-/* First test ensures that the HWASAN_CHECK was emitted before the
-   memset.  Second test ensures there was only HWASAN_CHECK (which demonstrates
-   that strlen was not instrumented).  */
-/* { dg-final { scan-tree-dump-times "HWASAN_CHECK.*memset" 1 "asan1" } } */
-/* { dg-final { scan-tree-dump-times "HWASAN_CHECK" 1 "asan1" } } */
+/* First check here ensures there is no inline instrumentation generated for
+   these builtins.  Second checks that we end up calling memset (i.e. that it's
+   not optimised 

RE: [PATCH][libsanitizer]: Sync fixes for asan interceptors from upstream [PR112644]

2024-01-31 Thread Tamar Christina
> -Original Message-
> From: Andrew Pinski 
> Sent: Monday, January 29, 2024 9:55 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; ja...@redhat.com;
> do...@redhat.com; k...@google.com; dvyu...@google.com
> Subject: Re: [PATCH][libsanitizer]: Sync fixes for asan interceptors from 
> upstream
> [PR112644]
> 
> On Mon, Jan 29, 2024 at 7:04 AM Tamar Christina 
> wrote:
> >
> > Hi All,
> >
> > This cherry-picks and squashes the differences between commits
> >
> >
> d3e5c20ab846303874a2a25e5877c72271fc798b..76e1e45922e6709392fb82aa
> c44bebe3dbc2ea63
> > from LLVM upstream from compiler-rt/lib/hwasan/ to GCC on the changes
> relevant
> > for GCC.
> >
> > This is required to fix the linked PR.
> >
> > As mentioned in the PR the last sync brought in a bug from upstream[1] where
> > operations became non-recoverable and as such the tests in AArch64 started
> > failing.  This cherry picks the fix and there are minor updates needed to 
> > GCC
> > after this to fix the cases.
> >
> > [1] https://github.com/llvm/llvm-project/pull/74000
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> 
> Thanks for handling this; though I wonder how this slipped through
> testing upstream in LLVM. I see they added some new testcases for
> this. I Know GCC's testsuite for sanitizer is slightly different from
> LLVM's. Is it the case, GCC has more tests in this area? Is someone
> adding the testcases that GCC has in this area upstream to LLVM;
> basically so merging won't bring in regressions like this in the
> future?

There were two parts here.  The first one is that their testsuite didn't have 
any
test for the recovery case.  Which they've now added.

But the second part (which I'm not posting patches for) is that the change
in hwasan means that the runtime can now instrument some additional
library methods which it couldn't before.  And GCC now needs to stop
inlining these.

This does mean that on future updates one needs to take a look at the
instrumentation list and make sure to keep it in sync with GCC's, otherwise
we'll lose instrumentation.

Regards,
Tamar
> 
> Thanks,
> Andrew
> 
> >
> > Thanks,
> > Tamar
> >
> > libsanitizer/ChangeLog:
> >
> > PR sanitizer/112644
> > * hwasan/hwasan_interceptors.cpp (ACCESS_MEMORY_RANGE,
> > HWASAN_READ_RANGE, HWASAN_WRITE_RANGE,
> COMMON_SYSCALL_PRE_READ_RANGE,
> > COMMON_SYSCALL_PRE_WRITE_RANGE,
> COMMON_INTERCEPTOR_WRITE_RANGE,
> > COMMON_INTERCEPTOR_READ_RANGE): Make recoverable.
> >
> > --- inline copy of patch --
> > diff --git a/libsanitizer/hwasan/hwasan_interceptors.cpp
> b/libsanitizer/hwasan/hwasan_interceptors.cpp
> > index
> d9237cf9b8e3bf982cf213123ef22e73ec027c9e..96df4dd0c24d7d3db28fa2557
> cf63da0f295e33f 100644
> > --- a/libsanitizer/hwasan/hwasan_interceptors.cpp
> > +++ b/libsanitizer/hwasan/hwasan_interceptors.cpp
> > @@ -36,16 +36,16 @@ struct HWAsanInterceptorContext {
> >const char *interceptor_name;
> >  };
> >
> > -#  define ACCESS_MEMORY_RANGE(ctx, offset, size, access)   
> >  \
> > -do {   
> >  \
> > -  __hwasan::CheckAddressSized > access>((uptr)offset, \
> > -  size);   
> >  \
> > +#  define ACCESS_MEMORY_RANGE(offset, size, access)
> >\
> > +do {   
> >\
> > +  __hwasan::CheckAddressSized > access>((uptr)offset, \
> > +size); 
> >\
> >  } while (0)
> >
> > -#  define HWASAN_READ_RANGE(ctx, offset, size) \
> > -ACCESS_MEMORY_RANGE(ctx, offset, size, AccessType::Load)
> > -#  define HWASAN_WRITE_RANGE(ctx, offset, size) \
> > -ACCESS_MEMORY_RANGE(ctx, offset, size, AccessType::Store)
> > +#  define HWASAN_READ_RANGE(offset, size) \
> > +ACCESS_MEMORY_RANGE(offset, size, AccessType::Load)
> > +#  define HWASAN_WRITE_RANGE(offset, size) \
> > +ACCESS_MEMORY_RANGE(offset, size, AccessType::Store)
> >
> >  #  if !SANITIZER_APPLE
> >  #define HWASAN_INTERCEPT_FUNC(name)
> > \
> > @@ -74,9 +74,8 @@ struct HWAsanInterceptorContext {
> >
> >  #  if HWASAN_WITH_INTERCEPTORS
> >
> > -#define COMMON_SYSC

RE: [PATCH]middle-end: check memory accesses in the destination block [PR113588].

2024-01-30 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, January 30, 2024 9:51 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com
> Subject: Re: [PATCH]middle-end: check memory accesses in the destination block
> [PR113588].
> 
> On Mon, 29 Jan 2024, Tamar Christina wrote:
> 
> > Hi All,
> >
> > When analyzing loads for early break it was always the intention that
> > for the exit where things get moved to we only check the loads that can
> > be reached from the condition.
> 
> Looking at the code I'm a bit confused that we always move to
> single_pred (loop->latch) - IIRC that was different at some point?
> 
> Shouldn't we move stores after the last early exit condition instead?

Yes, it was changed during another PR fix.  The rationale at that time didn't
take into account the peeled case.  It used to be that we would "search" for
the exit to place it in.

At that time the rationale was: well, it doesn't make sense, it has to go in
the block that is the last to be executed.  With the non-peeled case that's
always the one before the latch.

Or put differently, I think the destination should be the main IV block.  I am 
not quite sure
I'm following why you want to put the peeled cases inside the latch block.

Ah, is it because the latch block is always going to only be executed when you 
make a full iteration?
That makes sense, but then I think we should also analyze the stores in all 
blocks (which your change
maybe already does, let me check) since we're also lifting past the final block and
we need to update the vuses
there too.

If the above is correct then I think I understand what you're saying and will
update the patch and do some checks.

Thanks,
Tamar

> 
> In particular for the peeled case single_pred (loop->latch) is the
> block with the actual early exit condition?  So for that case we'd
> need to move to the latch itself instead?  For non-peeled we move
> to the block with the IV condition which looks OK.
> 
> > However the main loop checks all loads and we skip the destination BB.
> > As such we never actually check the loads reachable from the COND in the
> > last BB unless this BB was also the exit chosen by the vectorizer.
> >
> > This leads us to incorrectly vectorize the loop in the PR and in doing so 
> > access
> > out of bounds.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> 
> The patch ends up with a worklist and another confusing comment
> 
> +  /* For the destination BB we need to only analyze loads reachable from
> the early
> + break statement itself.  */
> 
> But I think it's a downstream issue from the issue above.  That said,
> even for the non-peeled case we need to check ref_within_array_bound,
> no?
> 
> So what about re-doing that initial loop like the following instead
> (and also fix dest_bb, but I'd like clarification here).  Basically
> walk all blocks, do the ref_within_array_bound first and only
> after we've seen 'dest_bb' do the checks required for moving
> stores for all upstream BBs.
> 
> And dest_bb should be
> 
>   /* Move side-effects to the in-loop destination of the last early
>  exit.  */
>   if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
> dest_bb = loop->latch;
>   else
> dest_bb = single_pred (loop->latch);
> 
> 
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index f592aeb8028..d6c8910dd6c 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -668,7 +668,6 @@ vect_analyze_early_break_dependences (loop_vec_info
> loop_vinfo)
>auto_vec bases;
>basic_block dest_bb = NULL;
> 
> -  hash_set  visited;
>class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
>class loop *loop_nest = loop_outer (loop);
> 
> @@ -681,15 +680,11 @@ vect_analyze_early_break_dependences
> (loop_vec_info loop_vinfo)
>   side-effects to is always the latch connected exit.  When we support
>   general control flow we can do better but for now this is fine.  */
>dest_bb = single_pred (loop->latch);
> -  basic_block bb = dest_bb;
> +  basic_block bb = loop->latch;
> +  bool check_deps = false;
> 
>do
>  {
> -  /* If the destination block is also the header then we have nothing to 
> do.  */
> -  if (!single_pred_p (bb))
> - continue;
> -
> -  bb = single_pred (bb);
>gimple_stmt_iterator gsi = gsi_last_bb (bb);
> 
>/* Now analyze all the remaining statements and try to determine which
> @@ -707,6 +702,25 @@ vect_analyze_early_break_dependences (loop_vec_info
> loop_vi

[PATCH]middle-end: check memory accesses in the destination block [PR113588].

2024-01-29 Thread Tamar Christina
Hi All,

When analyzing loads for early break it was always the intention that for the
exit where things get moved to we only check the loads that can be reached from
the condition.

However the main loop checks all loads and we skip the destination BB.  As such
we never actually check the loads reachable from the COND in the last BB unless
this BB was also the exit chosen by the vectorizer.

This leads us to incorrectly vectorize the loop in the PR and in doing so access
out of bounds.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR tree-optimization/113588
* tree-vect-data-refs.cc (vect_analyze_early_break_dependences_1): New.
(vect_analyze_data_ref_dependence):  Use it.
(vect_analyze_early_break_dependences): Update comments.

gcc/testsuite/ChangeLog:

PR tree-optimization/113588
* gcc.dg/vect/vect-early-break_108-pr113588.c: New test.
* gcc.dg/vect/vect-early-break_109-pr113588.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c
new file mode 100644
index 
..e488619c9aac41fafbcf479818392a6bb7c6924f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+int foo (const char *s, unsigned long n)
+{
+ unsigned long len = 0;
+ while (*s++ && n--)
+   ++len;
+ return len;
+}
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c
new file mode 100644
index 
..488c19d3ede809631d1a7ede0e7f7bcdc7a1ae43
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c
@@ -0,0 +1,44 @@
+/* { dg-add-options vect_early_break } */
+/* { dg-require-effective-target vect_early_break } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target mmap } */
+
+/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "tree-vect.h"
+
+__attribute__((noipa))
+int foo (const char *s, unsigned long n)
+{
+ unsigned long len = 0;
+ while (*s++ && n--)
+   ++len;
+ return len;
+}
+
+int main()
+{
+
+  check_vect ();
+
+  long pgsz = sysconf (_SC_PAGESIZE);
+  void *p = mmap (NULL, pgsz * 3, PROT_READ|PROT_WRITE,
+ MAP_ANONYMOUS|MAP_PRIVATE, 0, 0);
+  if (p == MAP_FAILED)
+return 0;
+  mprotect (p, pgsz, PROT_NONE);
+  mprotect (p+2*pgsz, pgsz, PROT_NONE);
+  char *p1 = p + pgsz;
+  p1[0] = 1;
+  p1[1] = 0;
+  foo (p1, 1000);
+  p1 = p + 2*pgsz - 2;
+  p1[0] = 1;
+  p1[1] = 0;
+  foo (p1, 1000);
+  return 0;
+}
+
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 
f592aeb8028afd4fd70e2175104efab2a2c0d82e..52cef242a7ce5d0e525bff639fa1dc2f0a6f30b9
 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -619,10 +619,69 @@ vect_analyze_data_ref_dependence (struct 
data_dependence_relation *ddr,
   return opt_result::success ();
 }
 
-/* Funcion vect_analyze_early_break_dependences.
+/* Function vect_analyze_early_break_dependences_1
 
-   Examime all the data references in the loop and make sure that if we have
-   mulitple exits that we are able to safely move stores such that they become
+   Helper function of vect_analyze_early_break_dependences which performs 
safety
+   analysis for load operations in an early break.  */
+
+static opt_result
+vect_analyze_early_break_dependences_1 (data_reference *dr_ref, gimple *stmt)
+{
+  /* We currently only support statically allocated objects due to
+ not having first-faulting loads support or peeling for
+ alignment support.  Compute the size of the referenced object
+ (it could be dynamically allocated).  */
+  tree obj = DR_BASE_ADDRESS (dr_ref);
+  if (!obj || TREE_CODE (obj) != ADDR_EXPR)
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"early breaks only supported on statically"
+" allocated objects.\n");
+  return opt_result::failure_at (stmt,
+"can't safely apply code motion to "
+"dependencies of %G to vectorize "
+"the early exit.\n", stmt);
+}
+
+  tree refop = TREE_OPERAND (obj, 0);
+  tree refbase = get_base_address (refop);
+  if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase)
+  || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST)
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"early 

  1   2   3   4   5   6   7   8   9   10   >