[gcc r15-1071] AArch64: correct constraint on Upl early clobber alternatives
https://gcc.gnu.org/g:afe85f8e22a703280b17c701f3490d89337f674a commit r15-1071-gafe85f8e22a703280b17c701f3490d89337f674a Author: Tamar Christina Date: Thu Jun 6 14:35:48 2024 +0100 AArch64: correct constraint on Upl early clobber alternatives I made an oversight in the previous patch, where I added a ?Upa alternative to the Upl cases. This causes it to create the tie between the larger register file rather than the constrained one. This fixes the affected patterns. gcc/ChangeLog: * config/aarch64/aarch64-sve.md (@aarch64_pred_cmp, *cmp_cc, *cmp_ptest, @aarch64_pred_cmp_wide, *aarch64_pred_cmp_wide_cc, *aarch64_pred_cmp_wide_ptest): Fix Upl tie alternative. * config/aarch64/aarch64-sve2.md (@aarch64_pred_): Fix Upl tie alternative. Diff: --- gcc/config/aarch64/aarch64-sve.md | 64 +++--- gcc/config/aarch64/aarch64-sve2.md | 2 +- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index d902bce62fd..d69db34016a 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -8134,13 +8134,13 @@ UNSPEC_PRED_Z)) (clobber (reg:CC_NZC CC_REGNUM))] "TARGET_SVE" - {@ [ cons: =0 , 1 , 3 , 4; attrs: pred_clobber ] - [ , Upl , w , ; yes ] cmp\t%0., %1/z, %3., #%4 - [ ?Upa , 0Upl, w , ; yes ] ^ - [ Upa , Upl , w , ; no ] ^ - [ , Upl , w , w; yes ] cmp\t%0., %1/z, %3., %4. - [ ?Upa , 0Upl, w , w; yes ] ^ - [ Upa , Upl , w , w; no ] ^ + {@ [ cons: =0 , 1 , 3 , 4; attrs: pred_clobber ] + [ , Upl, w , ; yes ] cmp\t%0., %1/z, %3., #%4 + [ ?Upl , 0 , w , ; yes ] ^ + [ Upa , Upl, w , ; no ] ^ + [ , Upl, w , w; yes ] cmp\t%0., %1/z, %3., %4. 
+ [ ?Upl , 0 , w , w; yes ] ^ + [ Upa , Upl, w , w; no ] ^ } ) @@ -8170,13 +8170,13 @@ UNSPEC_PRED_Z))] "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p ([4], [6])" - {@ [ cons: =0 , 1, 2 , 3; attrs: pred_clobber ] - [ , Upl , w , ; yes ] cmp\t%0., %1/z, %2., #%3 - [ ?Upa , 0Upl, w , ; yes ] ^ - [ Upa , Upl , w , ; no ] ^ - [ , Upl , w , w; yes ] cmp\t%0., %1/z, %2., %3. - [ ?Upa , 0Upl, w , w; yes ] ^ - [ Upa , Upl , w , w; no ] ^ + {@ [ cons: =0 , 1 , 2 , 3; attrs: pred_clobber ] + [ , Upl, w , ; yes ] cmp\t%0., %1/z, %2., #%3 + [ ?Upl , 0 , w , ; yes ] ^ + [ Upa , Upl, w , ; no ] ^ + [ , Upl, w , w; yes ] cmp\t%0., %1/z, %2., %3. + [ ?Upl , 0 , w , w; yes ] ^ + [ Upa , Upl, w , w; no ] ^ } "&& !rtx_equal_p (operands[4], operands[6])" { @@ -8205,12 +8205,12 @@ "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p ([4], [6])" {@ [ cons: =0, 1, 2 , 3; attrs: pred_clobber ] - [ , Upl , w , ; yes ] cmp\t%0., %1/z, %2., #%3 - [ ?Upa, 0Upl, w , ; yes ] ^ - [ Upa , Upl , w , ; no ] ^ - [ , Upl , w , w; yes ] cmp\t%0., %1/z, %2., %3. - [ ?Upa, 0Upl, w , w; yes ] ^ - [ Upa , Upl , w , w; no ] ^ + [ , Upl, w , ; yes ] cmp\t%0., %1/z, %2., #%3 + [ ?Upl, 0 , w , ; yes ] ^ + [ Upa , Upl, w , ; no ] ^ + [ , Upl, w , w; yes ] cmp\t%0., %1/z, %2., %3. + [ ?Upl, 0 , w , w; yes ] ^ + [ Upa , Upl, w , w; no ] ^ } "&& !rtx_equal_p (operands[4], operands[6])" { @@ -8263,10 +8263,10 @@ UNSPEC_PRED_Z)) (clobber (reg:CC_NZC CC_REGNUM))] "TARGET_SVE" - {@ [ cons: =0, 1, 2, 3, 4; attrs: pred_clobber ] - [ , Upl , , w, w; yes ] cmp\t%0., %1/z, %3., %4.d - [ ?Upa, 0Upl, , w, w; yes ] ^ - [ Upa , Upl , , w, w; no ] ^ + {@ [ cons: =0, 1 , 2, 3, 4; attrs: pred_clobber ] +
[PATCH]AArch64: correct constraint on Upl early clobber alternatives
Hi All, I made an oversight in the previous patch, where I added a ?Upa alternative to the Upl cases. This causes it to create the tie between the larger register file rather than the constrained one. This fixes the affected patterns. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Build SPECCPU 2017 and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-sve.md (@aarch64_pred_cmp, *cmp_cc, *cmp_ptest, @aarch64_pred_cmp_wide, *aarch64_pred_cmp_wide_cc, *aarch64_pred_cmp_wide_ptest): Fix Upl tie alternative. * config/aarch64/aarch64-sve2.md (@aarch64_pred_): Fix Upl tie alternative. --- diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index d902bce62fde88b6d85f8d71f305e7fc76a4d34e..d69db34016a55b4324faa129a3ac1f47227ba776 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -8134,13 +8134,13 @@ (define_insn "@aarch64_pred_cmp" UNSPEC_PRED_Z)) (clobber (reg:CC_NZC CC_REGNUM))] "TARGET_SVE" - {@ [ cons: =0 , 1 , 3 , 4; attrs: pred_clobber ] - [ , Upl , w , ; yes ] cmp\t%0., %1/z, %3., #%4 - [ ?Upa , 0Upl, w , ; yes ] ^ - [ Upa , Upl , w , ; no ] ^ - [ , Upl , w , w; yes ] cmp\t%0., %1/z, %3., %4. - [ ?Upa , 0Upl, w , w; yes ] ^ - [ Upa , Upl , w , w; no ] ^ + {@ [ cons: =0 , 1 , 3 , 4; attrs: pred_clobber ] + [ , Upl, w , ; yes ] cmp\t%0., %1/z, %3., #%4 + [ ?Upl , 0 , w , ; yes ] ^ + [ Upa , Upl, w , ; no ] ^ + [ , Upl, w , w; yes ] cmp\t%0., %1/z, %3., %4. + [ ?Upl , 0 , w , w; yes ] ^ + [ Upa , Upl, w , w; no ] ^ } ) @@ -8170,13 +8170,13 @@ (define_insn_and_rewrite "*cmp_cc" UNSPEC_PRED_Z))] "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p ([4], [6])" - {@ [ cons: =0 , 1, 2 , 3; attrs: pred_clobber ] - [ , Upl , w , ; yes ] cmp\t%0., %1/z, %2., #%3 - [ ?Upa , 0Upl, w , ; yes ] ^ - [ Upa , Upl , w , ; no ] ^ - [ , Upl , w , w; yes ] cmp\t%0., %1/z, %2., %3. 
- [ ?Upa , 0Upl, w , w; yes ] ^ - [ Upa , Upl , w , w; no ] ^ + {@ [ cons: =0 , 1 , 2 , 3; attrs: pred_clobber ] + [ , Upl, w , ; yes ] cmp\t%0., %1/z, %2., #%3 + [ ?Upl , 0 , w , ; yes ] ^ + [ Upa , Upl, w , ; no ] ^ + [ , Upl, w , w; yes ] cmp\t%0., %1/z, %2., %3. + [ ?Upl , 0 , w , w; yes ] ^ + [ Upa , Upl, w , w; no ] ^ } "&& !rtx_equal_p (operands[4], operands[6])" { @@ -8205,12 +8205,12 @@ (define_insn_and_rewrite "*cmp_ptest" "TARGET_SVE && aarch64_sve_same_pred_for_ptest_p ([4], [6])" {@ [ cons: =0, 1, 2 , 3; attrs: pred_clobber ] - [ , Upl , w , ; yes ] cmp\t%0., %1/z, %2., #%3 - [ ?Upa, 0Upl, w , ; yes ] ^ - [ Upa , Upl , w , ; no ] ^ - [ , Upl , w , w; yes ] cmp\t%0., %1/z, %2., %3. - [ ?Upa, 0Upl, w , w; yes ] ^ - [ Upa , Upl , w , w; no ] ^ + [ , Upl, w , ; yes ] cmp\t%0., %1/z, %2., #%3 + [ ?Upl, 0 , w , ; yes ] ^ + [ Upa , Upl, w , ; no ] ^ + [ , Upl, w , w; yes ] cmp\t%0., %1/z, %2., %3. + [ ?Upl, 0 , w , w; yes ] ^ + [ Upa , Upl, w , w; no ] ^ } "&& !rtx_equal_p (operands[4], operands[6])" { @@ -8263,10 +8263,10 @@ (define_insn "@aarch64_pred_cmp_wide" UNSPEC_PRED_Z)) (clobber (reg:CC_NZC CC_REGNUM))] "TARGET_SVE" - {@ [ cons: =0, 1, 2, 3, 4; attrs: pred_clobber ] - [ , Upl , , w, w; yes ] cmp\t%0., %1/z, %3., %4.d - [ ?Upa, 0Upl, , w, w; yes ] ^ - [ Upa , Upl , , w, w; no ] ^ + {@ [ cons: =0, 1 , 2, 3, 4; attrs: pred_clobber ] + [ , Upl, , w, w; yes ] cmp\t%0., %1/z, %3., %4.d + [ ?Upl, 0 , , w, w; yes ] ^ + [ Upa , Upl, , w, w; no ] ^ } ) @@ -8298,10 +8298,10 @@ (define_insn
[gcc r15-1041] AArch64: enable new predicate tuning for Neoverse cores.
https://gcc.gnu.org/g:3eb9f6eab9802d5ae65ead6b1f2ae6fe0833e06e commit r15-1041-g3eb9f6eab9802d5ae65ead6b1f2ae6fe0833e06e Author: Tamar Christina Date: Wed Jun 5 19:32:16 2024 +0100 AArch64: enable new predicate tuning for Neoverse cores. This enables the new tuning flag for Neoverse V1, Neoverse V2 and Neoverse N2. It is kept off for generic codegen. Note the reason for the +sve even though they are in aarch64-sve.exp is if the testsuite is ran with a forced SVE off option, e.g. -march=armv8-a+nosve then the intrinsics end up being disabled because the -march is preferred over the -mcpu even though the -mcpu comes later. This prevents the tests from failing in such runs. gcc/ChangeLog: * config/aarch64/tuning_models/neoversen2.h (neoversen2_tunings): Add AARCH64_EXTRA_TUNE_AVOID_PRED_RMW. * config/aarch64/tuning_models/neoversev1.h (neoversev1_tunings): Add AARCH64_EXTRA_TUNE_AVOID_PRED_RMW. * config/aarch64/tuning_models/neoversev2.h (neoversev2_tunings): Add AARCH64_EXTRA_TUNE_AVOID_PRED_RMW. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pred_clobber_1.c: New test. * gcc.target/aarch64/sve/pred_clobber_2.c: New test. * gcc.target/aarch64/sve/pred_clobber_3.c: New test. * gcc.target/aarch64/sve/pred_clobber_4.c: New test. 
Diff: --- gcc/config/aarch64/tuning_models/neoversen2.h | 3 ++- gcc/config/aarch64/tuning_models/neoversev1.h | 3 ++- gcc/config/aarch64/tuning_models/neoversev2.h | 3 ++- .../gcc.target/aarch64/sve/pred_clobber_1.c| 22 + .../gcc.target/aarch64/sve/pred_clobber_2.c| 22 + .../gcc.target/aarch64/sve/pred_clobber_3.c| 23 ++ .../gcc.target/aarch64/sve/pred_clobber_4.c| 22 + 7 files changed, 95 insertions(+), 3 deletions(-) diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h index 7e799bbe762..be9a48ac3ad 100644 --- a/gcc/config/aarch64/tuning_models/neoversen2.h +++ b/gcc/config/aarch64/tuning_models/neoversen2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversen2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h index 9363f2ad98a..0fc41ce6a41 100644 --- a/gcc/config/aarch64/tuning_models/neoversev1.h +++ b/gcc/config/aarch64/tuning_models/neoversev1.h @@ -227,7 +227,8 @@ static const struct tune_params neoversev1_tunings = (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT - | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS/* stp_policy_model. 
*/ diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h index bc01ed767c9..f76e4ef358f 100644 --- a/gcc/config/aarch64/tuning_models/neoversev2.h +++ b/gcc/config/aarch64/tuning_models/neoversev2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversev2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c new file mode 100644 index 000..25129e8d6f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=neoverse-n2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC target "+sve" + +#include + +extern void use(svbool_t); + +/* +** foo: +** ... +** ptrue p([1-3]).b, all +** cmplo p0.h, p\1/z, z
[gcc r15-1040] AArch64: add new alternative with early clobber to patterns
https://gcc.gnu.org/g:2de3bbde1ebea8689f3596967769f66bf903458e commit r15-1040-g2de3bbde1ebea8689f3596967769f66bf903458e Author: Tamar Christina Date: Wed Jun 5 19:31:39 2024 +0100 AArch64: add new alternative with early clobber to patterns This patch adds new alternatives to the patterns which are affected. The new alternatives with the conditional early clobbers are added before the normal ones in order for LRA to prefer them in the event that we have enough free registers to accommodate them. In case register pressure is too high the normal alternatives will be preferred before a reload is considered as we rather have the tie than a spill. Tests are in the next patch. gcc/ChangeLog: * config/aarch64/aarch64-sve.md (and3, @aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, @aarch64_pred_cmp, *cmp_cc, *cmp_ptest, @aarch64_pred_cmp_wide, *aarch64_pred_cmp_wide_cc, *aarch64_pred_cmp_wide_ptest, @aarch64_brk, *aarch64_brk_cc, *aarch64_brk_ptest, @aarch64_brk, *aarch64_brk_cc, *aarch64_brk_ptest, aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber alternative. * config/aarch64/aarch64-sve2.md (@aarch64_pred_): Likewise. 
Diff: --- gcc/config/aarch64/aarch64-sve.md | 178 + gcc/config/aarch64/aarch64-sve2.md | 6 +- 2 files changed, 124 insertions(+), 60 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index ca4d435e705..d902bce62fd 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1161,8 +1161,10 @@ (reg:VNx16BI FFRT_REGNUM) (match_operand:VNx16BI 1 "register_operand")))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffr\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa ; yes ] rdffr\t%0.b, %1/z + [ ?Upa, 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ } ) @@ -1179,8 +1181,10 @@ UNSPEC_PTEST)) (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ } ) @@ -1195,8 +1199,10 @@ UNSPEC_PTEST)) (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ } ) @@ -1216,8 +1222,10 @@ (reg:VNx16BI FFRT_REGNUM) (match_dup 1)))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ } ) @@ -1233,8 +1241,10 @@ (set (match_operand:VNx16BI 0 "register_operand") (reg:VNx16BI FFRT_REGNUM))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa ; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, 0Upa; yes ] ^ + [ Upa , Upa ; no ] ^ } ) @@ -6651,8 +6661,10 @@ (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand") (match_operand:PRED_ALL 2 
"register_operand")))] "TARGET_SVE" - {@ [ cons: =0, 1 , 2 ] - [ Upa , Upa, Upa ] and\t%0.b, %1/z, %2.b, %2.b + {@ [ cons: =0, 1 , 2 ; attrs: pred_clobber ] + [ , Upa , Upa ; yes ] and\t%0.b, %1/z, %2.b, %2.b + [ ?Upa, 0Upa, 0Upa; yes ] ^ + [ Upa , Upa , Upa ; no ] ^ } ) @@ -6679,8 +6691,10 @@ (match_operand:PRED_ALL 3 "register_operand")) (match_operand:PRED_ALL 1 "register_operand")))] "TARGET_SVE" - {@ [ cons: =0, 1 , 2 , 3 ] - [ Upa , Upa, Upa, Upa ] \t%0.b, %1/z, %2.b, %3.b + {@ [ cons: =0, 1 , 2 , 3 ; att
[gcc r15-1039] AArch64: add new tuning param and attribute for enabling conditional early clobber
https://gcc.gnu.org/g:35f17c680ca650f8658994f857358e5a529c0b93 commit r15-1039-g35f17c680ca650f8658994f857358e5a529c0b93 Author: Tamar Christina Date: Wed Jun 5 19:31:11 2024 +0100 AArch64: add new tuning param and attribute for enabling conditional early clobber This adds a new tuning parameter AARCH64_EXTRA_TUNE_AVOID_PRED_RMW for AArch64 to allow us to conditionally enable the early clobber alternatives based on the tuning models. gcc/ChangeLog: * config/aarch64/aarch64-tuning-flags.def (AVOID_PRED_RMW): New. * config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New. * config/aarch64/aarch64.md (pred_clobber): New. (arch_enabled): Use it. Diff: --- gcc/config/aarch64/aarch64-tuning-flags.def | 4 gcc/config/aarch64/aarch64.h| 5 + gcc/config/aarch64/aarch64.md | 18 -- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index d5bcaebce77..a9f48f5d3d4 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA) AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA) +/* Enable if the target prefers to use a fresh register for predicate outputs + rather than re-use an input predicate register. */ +AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index bbf11faaf4b..0997b82dbc0 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF; enabled through +gcs. */ #define TARGET_GCS (AARCH64_ISA_GCS) +/* Prefer different predicate registers for the output of a predicated + operation over re-using an existing input predicate.
*/ +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ +&& (aarch64_tune_params.extra_tuning_flags \ +& AARCH64_EXTRA_TUNE_AVOID_PRED_RMW)) /* Standard register usage. */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 9dff2d7a2b0..389a1906e23 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -445,6 +445,10 @@ ;; target-independent code. (define_attr "is_call" "no,yes" (const_string "no")) +;; Indicates whether we want to enable the pattern with an optional early +;; clobber for SVE predicates. +(define_attr "pred_clobber" "any,no,yes" (const_string "any")) + ;; [For compatibility with Arm in pipeline models] ;; Attribute that specifies whether or not the instruction touches fp ;; registers. @@ -460,7 +464,17 @@ (define_attr "arch_enabled" "no,yes" (if_then_else -(ior +(and + (ior + (and + (eq_attr "pred_clobber" "no") + (match_test "!TARGET_SVE_PRED_CLOBBER")) + (and + (eq_attr "pred_clobber" "yes") + (match_test "TARGET_SVE_PRED_CLOBBER")) + (eq_attr "pred_clobber" "any")) + + (ior (eq_attr "arch" "any") (and (eq_attr "arch" "rcpc8_4") @@ -488,7 +502,7 @@ (match_test "TARGET_SVE")) (and (eq_attr "arch" "sme") -(match_test "TARGET_SME"))) +(match_test "TARGET_SME" (const_string "yes") (const_string "no")))
[gcc r15-1038] AArch64: convert several predicate patterns to new compact syntax
https://gcc.gnu.org/g:fd4898891ae0c73d6b7aa433cd1ef4539aaa2457 commit r15-1038-gfd4898891ae0c73d6b7aa433cd1ef4539aaa2457 Author: Tamar Christina Date: Wed Jun 5 19:30:39 2024 +0100 AArch64: convert several predicate patterns to new compact syntax This converts the single alternative patterns to the new compact syntax such that when I add the new alternatives it's clearer what's being changed. Note that this will spew out a bunch of warnings from geninsn as it'll warn that @ is useless for a single alternative pattern. These are not fatal so won't break the build and are only temporary. No change in functionality is expected with this patch. gcc/ChangeLog: * config/aarch64/aarch64-sve.md (and3, @aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, *cmp_ptest, @aarch64_pred_cmp_wide, *aarch64_pred_cmp_wide_cc, *aarch64_pred_cmp_wide_ptest, *aarch64_brk_cc, *aarch64_brk_ptest, @aarch64_brk, *aarch64_brk_cc, *aarch64_brk_ptest, aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Convert to compact syntax. * config/aarch64/aarch64-sve2.md (@aarch64_pred_): Likewise. Diff: --- gcc/config/aarch64/aarch64-sve.md | 262 ++--- gcc/config/aarch64/aarch64-sve2.md | 12 +- 2 files changed, 161 insertions(+), 113 deletions(-) diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 0434358122d..ca4d435e705 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1156,76 +1156,86 @@ ;; Likewise with zero predication. 
(define_insn "aarch64_rdffr_z" - [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + [(set (match_operand:VNx16BI 0 "register_operand") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) - (match_operand:VNx16BI 1 "register_operand" "Upa")))] + (match_operand:VNx16BI 1 "register_operand")))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffr\t%0.b, %1/z" + {@ [ cons: =0, 1 ] + [ Upa , Upa ] rdffr\t%0.b, %1/z + } ) ;; Read the FFR to test for a fault, without using the predicate result. (define_insn "*aarch64_rdffr_z_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 2 "aarch64_sve_ptrue_flag") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1))] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ] + [ Upa , Upa ] rdffrs\t%0.b, %1/z + } ) ;; Same for unpredicated RDFFR when tested with a known PTRUE. (define_insn "*aarch64_rdffr_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (const_int SVE_KNOWN_PTRUE) (reg:VNx16BI FFRT_REGNUM)] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ] + [ Upa , Upa ] rdffrs\t%0.b, %1/z + } ) ;; Read the FFR with zero predication and test the result. 
(define_insn "*aarch64_rdffr_z_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 2 "aarch64_sve_ptrue_flag") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1))] UNSPEC_PTEST)) - (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (set (match_operand:VNx16BI 0 "register_operand") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1)))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ] + [ Upa , Upa ] rdffrs\t%0.b, %1/z + } ) ;; Same for unpredicated RDFFR when tested with a known PTRUE. (define_insn "*aarch64_rdffr_cc" [(set (re
RE: [PATCH] Rearrange SLP nodes with duplicate statements. [PR98138]
> -Original Message- > From: Richard Biener > Sent: Wednesday, June 5, 2024 9:07 AM > To: Manolis Tsamis > Cc: gcc-patches@gcc.gnu.org; Christoph Müllner ; > Kewen . Lin ; Philipp Tomsich ; > Tamar Christina ; Jiangning Liu > > Subject: Re: [PATCH] Rearrange SLP nodes with duplicate statements. [PR98138] > > On Tue, 4 Jun 2024, Manolis Tsamis wrote: > > > This change adds a function that checks for SLP nodes with multiple > > occurrences > > of the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the > > node > > so that there are no duplicates. A vec_perm is then introduced to recreate > > the > > original ordering. These duplicates can appear due to how two_operators > > nodes > > are handled, and they prevent vectorization in some cases. > > So the trick is that when we have two operands we elide duplicate lanes > so we can do discovery for a single combined operand instead which we > then decompose into the required two again. That's a nice one. > > But as implemented this will fail SLP discovery if the combined operand > fails discovery possibly because of divergence in downstream defs. That > is, it doesn't fall back to separate discovery. I suspect the situation > of duplicate lanes isn't common but then I would also suspect that > divergence _is_ common. I think we should also look at the cases where vectorization itself also failed because the generated tree ends up with an unsupported load. i.e. in this particular case we would have failed SLP at a later step. > > The discovery code is already quite complex with the way it possibly > swaps operands of lanes, fitting in this as another variant to try (first) > is likely going to be a bit awkward. A way out might be to split the > function or to make the re-try in the caller which could indicate whether > to apply this pattern trick or not. 
That said - can you try to get > data on how often the trick applies and discovery succeeds and how > often discovery fails but discovery would succeed without applying the > pattern (say, on SPEC)? > > I also suppose instead of hardcoding three patterns for a fixed > size it should be possible to see there's > only (at most) half unique lanes in both operands (and one less in one > operand if the number of lanes is odd) and compute the un-swizzling lane > permutes during this discovery, removing the need of the explicit enum > and open-coding each case? > > Another general note is that trying (and then undo on fail) such tricks > eats at the discovery limit we have in place to avoid exponential run-off > in exactly these degenerate cases. I suppose this is typically a case where changing to merging multiple single lane SLPs instead of creating the multiline graph in one go would make things easier? Isn't SLP discovery computationally expensive since it has to create the full graph in one go, whereas with merging you just rotate some subgraphs or eventually just keep the single lane separate? Cheers, Tamar > > Thanks, > Richard. > > > This targets the vectorization of the SPEC2017 x264 pixel_satd functions. > > In some processors a larger than 10% improvement on x264 has been observed. > > > > See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138 > > > > gcc/ChangeLog: > > > > * tree-vect-slp.cc (enum slp_oprnd_pattern): new enum for > rearrangement > > patterns. > > (try_rearrange_oprnd_info): Detect if a node corresponds to one of the > > patterns. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/vect-slp-two-operator.c: New test.
> > > > Signed-off-by: Manolis Tsamis > > --- > > > > .../aarch64/vect-slp-two-operator.c | 42 > > gcc/tree-vect-slp.cc | 234 ++ > > 2 files changed, 276 insertions(+) > > create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > > new file mode 100644 > > index 000..2db066a0b6e > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c > > @@ -0,0 +1,42 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect -fdump-tree-vect- > details" } */ > > + > > +typedef unsigned char uint8_t; > > +typedef unsigned int uint32_t; > > + > > +#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\ > > +int t0 = s0 + s1;\ > > +int t1 = s0 - s1;\ > > +int t2 = s2 + s3;\ >
RE: [PATCH] [RFC] lower SLP load permutation to interleaving
> -Original Message- > From: Richard Biener > Sent: Tuesday, June 4, 2024 3:33 PM > To: gcc-patches@gcc.gnu.org > Cc: Richard Sandiford ; Tamar Christina > > Subject: [PATCH] [RFC] lower SLP load permutation to interleaving > > The following emulates classical interleaving for SLP load permutes > that we are unlikely handling natively. This is to handle cases > where interleaving (or load/store-lanes) is the optimal choice for > vectorizing even when we are doing that within SLP. An example > would be > > void foo (int * __restrict a, int * b) > { > for (int i = 0; i < 16; ++i) > { > a[4*i + 0] = b[4*i + 0] * 3; > a[4*i + 1] = b[4*i + 1] + 3; > a[4*i + 2] = (b[4*i + 2] * 3 + 3); > a[4*i + 3] = b[4*i + 3] * 3; > } > } > > where currently the SLP store is merging four single-lane SLP > sub-graphs but none of the loads in it can be code-generated > with V4SImode vectors and a VF of four as the permutes would need > three vectors. > > The patch introduces a lowering phase after SLP discovery but > before SLP pattern recognition or permute optimization that > analyzes all loads from the same dataref group and creates an > interleaving scheme starting from an unpermuted load. > > What can be handled is quite restrictive, matching only a subset > of the non-SLP interleaving cases (the power-of-two group size > ones, in addition only cases without gaps). The interleaving > vectorization in addition can handle size 3 and 5 - but I am not > sure if it's possible to do that in a VL agnostic way. It > should be still possible to set up the SLP graph in a way that > a load-lane could be matched from SLP pattern recognition. > > As said gaps are currently not handled - for SLP we have a > representational issue that SLP_TREE_SCALAR_STMTS for "gap lanes" > would need to be filled in some way (even if we just push NULL). > > The patch misses multi-level even/odd handling as well as CSEing > intermediate generated permutes. 
Both is quite straight-forward > to add, but eventually there's a better or more general strategy > for lowering? The main goal of the patch is to avoid falling > back to non-SLP for cases the interleaving code handles. I guess not handling CSEing the intermediate permutes only really matter for pattern matching? Those could be eliminated in optimize_slp? > > Comments and suggestions welcome, esp. what representation > you'd think is suitable for SLP pattern matching to > load/store-lane and how to represent that? Maybe this lowering > should happen directly in vect_lower_load_permutations? I like this representation personally, I'd say having the permute explicit, at least until optimize_slp would make pattern matching easier. We wouldn't need hacks such as optimize_load_redistribution. In that sense, does it make sense to eventually just lower all permuted loads? Cheers, Tamar > > Thanks, > Richard. > > * tree-vect-slp.cc (vllp_cmp): New function. > (vect_lower_load_permutations): Likewise. > (vect_analyze_slp): Call it. > --- > gcc/tree-vect-slp.cc | 279 > +++ > 1 file changed, 279 insertions(+) > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index 7e3d0107b4e..766b773452f 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -3839,6 +3839,279 @@ vect_analyze_slp_instance (vec_info *vinfo, >return res; > } > > +/* qsort comparator ordering SLP load nodes. */ > + > +static int > +vllp_cmp (const void *a_, const void *b_) > +{ > + const slp_tree a = *(const slp_tree *)a_; > + const slp_tree b = *(const slp_tree *)b_; > + stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0]; > + stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0]; > + if (STMT_VINFO_GROUPED_ACCESS (a0) > + && STMT_VINFO_GROUPED_ACCESS (b0) > + && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0)) > +{ > + /* Same group, order after lanes used. 
*/ > + if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b)) > + return 1; > + else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b)) > + return -1; > + else > + { > + /* Try to order loads using the same lanes together, breaking > + the tie with the lane number that first differs. */ > + if (!SLP_TREE_LOAD_PERMUTATION (a).exists () > + && !SLP_TREE_LOAD_PERMUTATION (b).exists ()) > + return 0; > + else if (SLP_TREE_LOAD_PERMUTATION (a).exists () > +&& !SLP_TREE_LOAD_PERMUTATION (b).exists ()) > + return 1; > + else if (!SLP_TREE_LOAD_PERMUTATION (a).exist
RE: [PATCH 3/4]AArch64: add new alternative with early clobber to patterns
> -Original Message- > From: Richard Sandiford > Sent: Wednesday, May 22, 2024 12:24 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; ktkac...@gcc.gnu.org > Subject: Re: [PATCH 3/4]AArch64: add new alternative with early clobber to > patterns > > Tamar Christina writes: > >> -Original Message- > >> From: Richard Sandiford > >> Sent: Wednesday, May 22, 2024 10:48 AM > >> To: Tamar Christina > >> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > >> ; Marcus Shawcroft > >> ; ktkac...@gcc.gnu.org > >> Subject: Re: [PATCH 3/4]AArch64: add new alternative with early clobber to > >> patterns > >> > >> Tamar Christina writes: > >> > Hi All, > >> > > >> > This patch adds new alternatives to the patterns which are affected. > >> > The new > >> > alternatives with the conditional early clobbers are added before the > >> > normal > >> > ones in order for LRA to prefer them in the event that we have enough > >> > free > >> > registers to accommodate them. > >> > > >> > In case register pressure is too high the normal alternatives will be > >> > preferred > >> > before a reload is considered as we rather have the tie than a spill. > >> > > >> > Tests are in the next patch. > >> > > >> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > >> > > >> > Ok for master? 
> >> > > >> > Thanks, > >> > Tamar > >> > > >> > gcc/ChangeLog: > >> > > >> > * config/aarch64/aarch64-sve.md (and3, > >> > @aarch64_pred__z, *3_cc, > >> > *3_ptest, aarch64_pred__z, > >> > *3_cc, *3_ptest, > >> > aarch64_pred__z, *3_cc, > >> > *3_ptest, @aarch64_pred_cmp, > >> > *cmp_cc, *cmp_ptest, > >> > @aarch64_pred_cmp_wide, > >> > *aarch64_pred_cmp_wide_cc, > >> > *aarch64_pred_cmp_wide_ptest, > >> @aarch64_brk, > >> > *aarch64_brk_cc, *aarch64_brk_ptest, > >> > @aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest, > >> > *aarch64_brk_cc, *aarch64_brk_ptest, > >> > aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, > >> > *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber > >> > alternative. > >> > * config/aarch64/aarch64-sve2.md > >> > (@aarch64_pred_): Likewise. > >> > > >> > --- > >> > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64- > >> sve.md > >> > index > >> > e3085c0c636f1317409bbf3b5fbaf5342a2df1f6..8fdc1bc3cd43acfcd675a18350c > >> 297428c85fe46 100644 > >> > --- a/gcc/config/aarch64/aarch64-sve.md > >> > +++ b/gcc/config/aarch64/aarch64-sve.md > >> > @@ -1161,8 +1161,10 @@ (define_insn "aarch64_rdffr_z" > >> >(reg:VNx16BI FFRT_REGNUM) > >> >(match_operand:VNx16BI 1 "register_operand")))] > >> >"TARGET_SVE && TARGET_NON_STREAMING" > >> > - {@ [ cons: =0, 1 ] > >> > - [ Upa , Upa ] rdffr\t%0.b, %1/z > >> > + {@ [ cons: =0, 1 ; attrs: pred_clobber ] > >> > + [ , Upa; yes ] rdffr\t%0.b, %1/z > >> > + [ ?Upa, Upa; yes ] ^ > >> > + [ Upa , Upa; * ] ^ > >> >} > >> > ) > >> > >> Sorry for not explaining it very well, but in the previous review I > >> suggested: > >> > >> > The gather-like approach would be something like: > >> > > >> > [ , Upl , w , ; yes ] > >> cmp\t%0., %1/z, %3., #%4 > >> > [ ?Upl , 0 , w , ; yes ] ^ > >> > [ Upa , Upl , w , ; no ] ^ > >> > [ , Upl , w , w; yes ] > >> > cmp\t%0., > %1/z, > >> %3., %4. 
> >> > [ ?Upl , 0 , w , w; yes ] ^ > >> > [ Upa , Upl , w , w; no ] ^ > >> > > >> > with: > >> > > >> > (define_attr "pred_clobber" "any,no,yes" (const_string "any")) > >> &
RE: [PATCH 2/4]AArch64: add new tuning param and attribute for enabling conditional early clobber
> -Original Message- > From: Tamar Christina > Sent: Wednesday, May 22, 2024 10:29 AM > To: Richard Sandiford > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; ktkac...@gcc.gnu.org > Subject: RE: [PATCH 2/4]AArch64: add new tuning param and attribute for > enabling conditional early clobber > > > > > Sorry for the bike-shedding, but how about something like "avoid_pred_rmw"? > > (I'm open to other suggestions.) Just looking for something that describes > > either the architecture or the end result that we want to achieve. > > And preferable something fairly short :) > > > > avoid_* would be consistent with the existing "avoid_cross_loop_fma". > > > > > + > > > #undef AARCH64_EXTRA_TUNING_OPTION > > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > > > index > > > bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d5 > > 6b46c74084ba7c3c 100644 > > > --- a/gcc/config/aarch64/aarch64.h > > > +++ b/gcc/config/aarch64/aarch64.h > > > @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = > > AARCH64_FL_SM_OFF; > > > enabled through +gcs. */ > > > #define TARGET_GCS (AARCH64_ISA_GCS) > > > > > > +/* Prefer different predicate registers for the output of a predicated > > > operation > > over > > > +re-using an existing input predicate. */ > > > +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ > > > + && (aarch64_tune_params.extra_tuning_flags \ > > > + & > > AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST)) > > > > > > /* Standard register usage. */ > > > > > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > > > index > > > dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a > > 53473b478c5ddba82 100644 > > > --- a/gcc/config/aarch64/aarch64.md > > > +++ b/gcc/config/aarch64/aarch64.md > > > @@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string > > "any")) > > > ;; target-independent code. 
> > > (define_attr "is_call" "no,yes" (const_string "no")) > > > > > > +;; Indicates whether we want to enable the pattern with an optional early > > > +;; clobber for SVE predicates. > > > +(define_attr "pred_clobber" "no,yes" (const_string "no")) > > > + > > > ;; [For compatibility with Arm in pipeline models] > > > ;; Attribute that specifies whether or not the instruction touches fp > > > ;; registers. > > > @@ -461,7 +465,8 @@ (define_attr "fp" "no,yes" > > > (define_attr "arch_enabled" "no,yes" > > >(if_then_else > > > (ior > > > - (eq_attr "arch" "any") > > > + (and (eq_attr "arch" "any") > > > + (eq_attr "pred_clobber" "no")) > > > > > > (and (eq_attr "arch" "rcpc8_4") > > >(match_test "AARCH64_ISA_RCPC8_4")) > > > @@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes" > > >(match_test "TARGET_SVE")) > > > > > > (and (eq_attr "arch" "sme") > > > - (match_test "TARGET_SME"))) > > > + (match_test "TARGET_SME")) > > > + > > > + (and (eq_attr "pred_clobber" "yes") > > > + (match_test "TARGET_SVE_PRED_CLOBBER"))) > > > > IMO it'd be bettero handle pred_clobber separately from arch, as a new > > top-level AND: > > > > (and > > (ior > > (eq_attr "pred_clobber" "no") > > (match_test "!TARGET_...")) > > (ior > > ...existing arch tests...)) > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-tuning-flags.def (AVOID_PRED_RMW): New. * config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New. * config/aarch64/aarch64.md (pred_clobber): New. (arch_enabled): Use it. -- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index d5bcaebce770f0b217aac783063d39135f754c77..a9f48f5d3d4ea32f
RE: [PATCH 3/4]AArch64: add new alternative with early clobber to patterns
> -Original Message- > From: Richard Sandiford > Sent: Wednesday, May 22, 2024 10:48 AM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; ktkac...@gcc.gnu.org > Subject: Re: [PATCH 3/4]AArch64: add new alternative with early clobber to > patterns > > Tamar Christina writes: > > Hi All, > > > > This patch adds new alternatives to the patterns which are affected. The > > new > > alternatives with the conditional early clobbers are added before the normal > > ones in order for LRA to prefer them in the event that we have enough free > > registers to accommodate them. > > > > In case register pressure is too high the normal alternatives will be > > preferred > > before a reload is considered as we rather have the tie than a spill. > > > > Tests are in the next patch. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-sve.md (and3, > > @aarch64_pred__z, *3_cc, > > *3_ptest, aarch64_pred__z, > > *3_cc, *3_ptest, > > aarch64_pred__z, *3_cc, > > *3_ptest, @aarch64_pred_cmp, > > *cmp_cc, *cmp_ptest, > > @aarch64_pred_cmp_wide, > > *aarch64_pred_cmp_wide_cc, > > *aarch64_pred_cmp_wide_ptest, > @aarch64_brk, > > *aarch64_brk_cc, *aarch64_brk_ptest, > > @aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest, > > *aarch64_brk_cc, *aarch64_brk_ptest, > > aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, > > *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber > > alternative. > > * config/aarch64/aarch64-sve2.md > > (@aarch64_pred_): Likewise. 
> > > > --- > > diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64- > sve.md > > index > e3085c0c636f1317409bbf3b5fbaf5342a2df1f6..8fdc1bc3cd43acfcd675a18350c > 297428c85fe46 100644 > > --- a/gcc/config/aarch64/aarch64-sve.md > > +++ b/gcc/config/aarch64/aarch64-sve.md > > @@ -1161,8 +1161,10 @@ (define_insn "aarch64_rdffr_z" > > (reg:VNx16BI FFRT_REGNUM) > > (match_operand:VNx16BI 1 "register_operand")))] > >"TARGET_SVE && TARGET_NON_STREAMING" > > - {@ [ cons: =0, 1 ] > > - [ Upa , Upa ] rdffr\t%0.b, %1/z > > + {@ [ cons: =0, 1 ; attrs: pred_clobber ] > > + [ , Upa; yes ] rdffr\t%0.b, %1/z > > + [ ?Upa, Upa; yes ] ^ > > + [ Upa , Upa; * ] ^ > >} > > ) > > Sorry for not explaining it very well, but in the previous review I suggested: > > > The gather-like approach would be something like: > > > > [ , Upl , w , ; yes ] > cmp\t%0., %1/z, %3., #%4 > > [ ?Upl , 0 , w , ; yes ] ^ > > [ Upa , Upl , w , ; no ] ^ > > [ , Upl , w , w; yes ] > > cmp\t%0., %1/z, > %3., %4. > > [ ?Upl , 0 , w , w; yes ] ^ > > [ Upa , Upl , w , w; no ] ^ > > > > with: > > > > (define_attr "pred_clobber" "any,no,yes" (const_string "any")) > > (with emphasis on the last line). What I didn't say explicitly is > that "no" should require !TARGET_SVE_PRED_CLOBBER. > > The premise of that review was that we shouldn't enable things like: > > [ Upa , Upl , w , w; no ] ^ > > for TARGET_SVE_PRED_CLOBBER since it contradicts the earlyclobber > alternative. So we should enable either the pred_clobber=yes > alternatives or the pred_clobber=no alternatives, but not both. > > The default "any" is then for other non-predicate instructions that > don't care about TARGET_SVE_PRED_CLOBBER either way. > > In contrast, this patch makes pred_clobber=yes enable the alternatives > that correctly describe the restriction (good!) but then also enables > the normal alternatives too, which IMO makes the semantics unclear. 
Sure, the reason I still had that is because this ICEs under high register pressure: {@ [ cons: =0 , 1 , 3 , 4; attrs: pred_clobber ] [ , Upl , w , ; yes ] cmp\t%0., %1/z, %3., #%4 [ ?Upa , 0 , w , ; yes ] ^ [ Upa , Upl , w , ; no ] ^ [
[PATCH 3/4]AArch64: add new alternative with early clobber to patterns
Hi All, This patch adds new alternatives to the patterns which are affected. The new alternatives with the conditional early clobbers are added before the normal ones in order for LRA to prefer them in the event that we have enough free registers to accommodate them. In case register pressure is too high the normal alternatives will be preferred before a reload is considered as we rather have the tie than a spill. Tests are in the next patch. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-sve.md (and3, @aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, @aarch64_pred_cmp, *cmp_cc, *cmp_ptest, @aarch64_pred_cmp_wide, *aarch64_pred_cmp_wide_cc, *aarch64_pred_cmp_wide_ptest, @aarch64_brk, *aarch64_brk_cc, *aarch64_brk_ptest, @aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest, *aarch64_brk_cc, *aarch64_brk_ptest, aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber alternative. * config/aarch64/aarch64-sve2.md (@aarch64_pred_): Likewise. 
--- diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index e3085c0c636f1317409bbf3b5fbaf5342a2df1f6..8fdc1bc3cd43acfcd675a18350c297428c85fe46 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1161,8 +1161,10 @@ (define_insn "aarch64_rdffr_z" (reg:VNx16BI FFRT_REGNUM) (match_operand:VNx16BI 1 "register_operand")))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffr\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa; yes ] rdffr\t%0.b, %1/z + [ ?Upa, Upa; yes ] ^ + [ Upa , Upa; * ] ^ } ) @@ -1179,8 +1181,10 @@ (define_insn "*aarch64_rdffr_z_ptest" UNSPEC_PTEST)) (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, Upa; yes ] ^ + [ Upa , Upa; * ] ^ } ) @@ -1195,8 +1199,10 @@ (define_insn "*aarch64_rdffr_ptest" UNSPEC_PTEST)) (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, Upa; yes ] ^ + [ Upa , Upa; * ] ^ } ) @@ -1216,8 +1222,10 @@ (define_insn "*aarch64_rdffr_z_cc" (reg:VNx16BI FFRT_REGNUM) (match_dup 1)))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, Upa; yes ] ^ + [ Upa , Upa; * ] ^ } ) @@ -1233,8 +1241,10 @@ (define_insn "*aarch64_rdffr_cc" (set (match_operand:VNx16BI 0 "register_operand") (reg:VNx16BI FFRT_REGNUM))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa; yes ] rdffrs\t%0.b, %1/z + [ ?Upa, Upa; yes ] ^ + [ Upa , Upa; * ] ^ } ) @@ -6651,8 +6661,10 @@ (define_insn "and3" 
(and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand") (match_operand:PRED_ALL 2 "register_operand")))] "TARGET_SVE" - {@ [ cons: =0, 1 , 2 ] - [ Upa , Upa, Upa ] and\t%0.b, %1/z, %2.b, %2.b + {@ [ cons: =0, 1 , 2 ; attrs: pred_clobber ] + [ , Upa, Upa; yes ] and\t%0.b, %1/z, %2.b, %2.b + [ ?Upa, Upa, Upa; yes ] ^ + [ Upa , Upa, Upa; * ] ^ } ) @@ -6679,8 +6691,10 @@ (define_insn "@aarch64_pred__z" (match_operand:PRED_ALL 3 "register_operand")) (match_operand:PRED_ALL 1 "register_operand")))] "TARGET_SVE" - {@ [ cons: =0, 1 , 2 , 3 ] - [ Upa , Upa, Upa, Upa ] \t%0.b, %1/z, %2.b, %3.b + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ , Upa, Upa, Upa; yes ] \t%0.b, %1/z, %2.b, %3.b + [ ?Upa, Upa, Upa, Upa; yes ] ^ + [ Upa , Upa, Upa, Upa; * ] ^ } ) @@ -6703,8 +6717,10 @@ (define_insn "*3_cc" (and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3))
[PATCH 4/4]AArch64: enable new predicate tuning for Neoverse cores.
Hi All, This enables the new tuning flag for Neoverse V1, Neoverse V2 and Neoverse N2. It is kept off for generic codegen. Note the reason for the +sve even though they are in aarch64-sve.exp is if the testsuite is ran with a forced SVE off option, e.g. -march=armv8-a+nosve then the intrinsics end up being disabled because the -march is preferred over the -mcpu even though the -mcpu comes later. This prevents the tests from failing in such runs. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/tuning_models/neoversen2.h (neoversen2_tunings): Add AARCH64_EXTRA_TUNE_AVOID_PRED_RMW. * config/aarch64/tuning_models/neoversev1.h (neoversev1_tunings): Add AARCH64_EXTRA_TUNE_AVOID_PRED_RMW. * config/aarch64/tuning_models/neoversev2.h (neoversev2_tunings): Add AARCH64_EXTRA_TUNE_AVOID_PRED_RMW. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pred_clobber_1.c: New test. * gcc.target/aarch64/sve/pred_clobber_2.c: New test. * gcc.target/aarch64/sve/pred_clobber_3.c: New test. * gcc.target/aarch64/sve/pred_clobber_4.c: New test. * gcc.target/aarch64/sve/pred_clobber_5.c: New test. --- diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h index 7e799bbe762fe862e31befed50e54040a7fd1f2f..be9a48ac3adc097f967c217fe09dcac194d7d14f 100644 --- a/gcc/config/aarch64/tuning_models/neoversen2.h +++ b/gcc/config/aarch64/tuning_models/neoversen2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversen2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. 
*/ diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h index 9363f2ad98a5279cc99f2f9b1509ba921d582e84..0fc41ce6a41b3135fa06d2bda1f517fdf4f8dbcf 100644 --- a/gcc/config/aarch64/tuning_models/neoversev1.h +++ b/gcc/config/aarch64/tuning_models/neoversev1.h @@ -227,7 +227,8 @@ static const struct tune_params neoversev1_tunings = (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT - | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS/* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h index bc01ed767c9b690504eb98456402df5d9d64eee3..f76e4ef358f7dfb9c7d7b470ea7240eaa2120f8e 100644 --- a/gcc/config/aarch64/tuning_models/neoversev2.h +++ b/gcc/config/aarch64/tuning_models/neoversev2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversev2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. 
*/ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c new file mode 100644 index ..934a00a38531c5fd4139d99ff33414904b2c104f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=neoverse-n2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC target "+sve" + +#include + +extern void use(svbool_t); + +/* +** foo: +** ... +** ptrue p([1-9][0-9]?).b, all +** cmplo p0.h, p\1/z, z0.h, z[0-9]+.h +** ... +*/ +void foo (svuint16_t a, uint16_t b) +{ +svbool_t p0 = svcmplt_n_u16 (svptrue_b16 (), a, b); +use (p0); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c new file mode 100644 index ..58badb66a43b1ac50eeec153b9cac44fc831b145 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=neoverse-v2" } */ +/* {
RE: [PATCH 2/4]AArch64: add new tuning param and attribute for enabling conditional early clobber
> > Sorry for the bike-shedding, but how about something like "avoid_pred_rmw"? > (I'm open to other suggestions.) Just looking for something that describes > either the architecture or the end result that we want to achieve. > And preferable something fairly short :) > > avoid_* would be consistent with the existing "avoid_cross_loop_fma". > > > + > > #undef AARCH64_EXTRA_TUNING_OPTION > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > > index > bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d5 > 6b46c74084ba7c3c 100644 > > --- a/gcc/config/aarch64/aarch64.h > > +++ b/gcc/config/aarch64/aarch64.h > > @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = > AARCH64_FL_SM_OFF; > > enabled through +gcs. */ > > #define TARGET_GCS (AARCH64_ISA_GCS) > > > > +/* Prefer different predicate registers for the output of a predicated > > operation > over > > +re-using an existing input predicate. */ > > +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ > > +&& (aarch64_tune_params.extra_tuning_flags \ > > +& > AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST)) > > > > /* Standard register usage. */ > > > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > > index > dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a > 53473b478c5ddba82 100644 > > --- a/gcc/config/aarch64/aarch64.md > > +++ b/gcc/config/aarch64/aarch64.md > > @@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string > "any")) > > ;; target-independent code. > > (define_attr "is_call" "no,yes" (const_string "no")) > > > > +;; Indicates whether we want to enable the pattern with an optional early > > +;; clobber for SVE predicates. > > +(define_attr "pred_clobber" "no,yes" (const_string "no")) > > + > > ;; [For compatibility with Arm in pipeline models] > > ;; Attribute that specifies whether or not the instruction touches fp > > ;; registers. 
> > @@ -461,7 +465,8 @@ (define_attr "fp" "no,yes" > > (define_attr "arch_enabled" "no,yes" > >(if_then_else > > (ior > > - (eq_attr "arch" "any") > > + (and (eq_attr "arch" "any") > > +(eq_attr "pred_clobber" "no")) > > > > (and (eq_attr "arch" "rcpc8_4") > > (match_test "AARCH64_ISA_RCPC8_4")) > > @@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes" > > (match_test "TARGET_SVE")) > > > > (and (eq_attr "arch" "sme") > > -(match_test "TARGET_SME"))) > > +(match_test "TARGET_SME")) > > + > > + (and (eq_attr "pred_clobber" "yes") > > +(match_test "TARGET_SVE_PRED_CLOBBER"))) > > IMO it'd be bettero handle pred_clobber separately from arch, as a new > top-level AND: > > (and > (ior > (eq_attr "pred_clobber" "no") > (match_test "!TARGET_...")) > (ior > ...existing arch tests...)) > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-tuning-flags.def (AVOID_PRED_RMW): New. * config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New. * config/aarch64/aarch64.md (pred_clobber): New. (arch_enabled): Use it. -- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index d5bcaebce770f0b217aac783063d39135f754c77..a9f48f5d3d4ea32fbf53086ba21eab4bc65b6dcb 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA) AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA) +/* Enable is the target prefers to use a fresh register for predicate outputs + rather than re-use an input predicate register. 
*/ +AARCH64_EXTRA_TUNING_OPTION ("avoid_pred_rmw", AVOID_PRED_RMW) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index bbf11faaf4b4340956094a983f8b0dc2649b2d27..e7669e65d7dae5df2ba42c265079b1856a5c382b 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF; enabled through +gcs. */ #define TARGET_GCS (AARCH64_ISA_GCS) +/* Prefer different predicate registers for the output of a predicated operation over +re-using an existing input predicate. */ +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ +&& (aarch64_tune_params.extra_tuning_flags \ +& AARCH64_EXTRA_TUNE_AVOID_PRED_RMW)) /* Standard register usage. */ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index dbde066f7478bec51a8703b017ea553aa98be309..52e5adba4172e14b794b5df9394e58ce49ef8b7f 100644 --- a/gcc/config/aarch64/aarch64.md +++
RE: [RFC] Merge strategy for all-SLP vectorizer
> -Original Message- > From: Richard Biener > Sent: Friday, May 17, 2024 1:54 PM > To: Richard Sandiford > Cc: Richard Biener via Gcc ; Tamar Christina > > Subject: Re: [RFC] Merge strathegy for all-SLP vectorizer > > On Fri, 17 May 2024, Richard Sandiford wrote: > > > Richard Biener via Gcc writes: > > > Hi, > > > > > > I'd like to discuss how to go forward with getting the vectorizer to > > > all-SLP for this stage1. While there is a personal branch with my > > > ongoing work (users/rguenth/vect-force-slp) branches haven't proved > > > themselves working well for collaboration. > > Yeah, It's hard to keep rebasing and build on top of. > > Speaking for myself, the problem hasn't been so much the branch as > > lack of time. I've been pretty swamped the last eight months of so > > (except for the time that I took off, which admittedly was quite a > > bit!), and so I never even got around to properly reading and replying > > to your message after the Cauldron. It's been on the "this is important, > > I should make time to read and understand it properly" list all this time. > > Sorry about that. :( > > > > I'm hoping to have time to work/help out on SLP stuff soon. > > > > > The branch isn't ready to be merged in full but I have been picking > > > improvements to trunk last stage1 and some remaining bits in the past > > > weeks. I have refrained from merging code paths that cannot be > > > exercised on trunk. > > > > > > There are two important set of changes on the branch, both critical > > > to get more testing on non-x86 targets. > > > > > > 1. enable single-lane SLP discovery > > > 2. avoid splitting store groups (9315bfc661432c3 and 4336060fe2db8ec > > > if you fetch the branch) > > > For no# is there a param or is it just the default? I can run these through regression today. 
> > > The first point is also most annoying on the testsuite since doing > > > SLP instead of interleaving changes what we dump and thus tests > > > start to fail in random ways when you switch between both modes. > > > On the branch single-lane SLP discovery is gated with > > > --param vect-single-lane-slp. > > > > > > The branch has numerous changes to enable single-lane SLP for some > > > code paths that have SLP not implemented and where I did not bother > > > to try supporting multi-lane SLP at this point. It also adds more > > > SLP discovery entry points. > > > > > > I'm not sure how to try merging these pieces to allow others to > > > more easily help out. One possibility is to merge > > > --param vect-single-lane-slp defaulted off and pick dependent > > > changes even when they cause testsuite regressions with > > > vect-single-lane-slp=1. Alternatively adjust the testsuite by > > > adding --param vect-single-lane-slp=0 and default to 1 > > > (or keep the default). I guess which one is better depends on whether the parameter goes away this release? If so I think we should just leave them broken for now and fix them up when it's the default? > > > > FWIW, this one sounds good to me (the default to 1 version). > > I.e. mechanically add --param vect-single-lane-slp=0 to any tests > > that fail with the new default. That means that the test that need > > fixing are easily greppable for anyone who wants to help. Sometimes > > it'll just be a test update. Sometimes it will be new vectoriser code. > > OK. Meanwhile I figured the most important part is 2. from above > since that enables the single-lane in a grouped access (also covering > single element interleaving). This will cover all problematical cases > with respect to vectorizing loads and stores. It also has less > testsuite fallout, mainly because we have a lot less coverage for > grouped stores without SLP. > > So I'll see to produce a mergeable patch for part 2 and post that > for review next week. 
Sounds good! Thanks for getting the ball rolling on this. It would be useful to have it in trunk indeed, off by default for now sounds good because then I can work on trunk for the SLP support for early break as well. Cheers, Tamar > > Thanks, > Richard. > > > Thanks, > > Richard > > > > > Or require a clean testsuite with > > > --param vect-single-lane-slp defaulted to 1 but keep the --param > > > for debugging (and allow FAILs with 0). > > > > > > For fun I merged just single-lane discovery of non-grouped stores > > > and have that enabled by
RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.
> -Original Message- > From: Richard Sandiford > Sent: Wednesday, May 15, 2024 10:31 PM > To: Tamar Christina > Cc: Richard Biener ; gcc-patches@gcc.gnu.org; nd > ; Richard Earnshaw ; Marcus > Shawcroft ; ktkac...@gcc.gnu.org > Subject: Re: [PATCH 0/4]AArch64: support conditional early clobbers on certain > operations. > > Tamar Christina writes: > >> >> On Wed, May 15, 2024 at 12:29 PM Tamar Christina > >> >> wrote: > >> >> > > >> >> > Hi All, > >> >> > > >> >> > Some Neoverse Software Optimization Guides (SWoG) have a clause that > state > >> >> > that for predicated operations that also produce a predicate it is > >> >> > preferred > >> >> > that the codegen should use a different register for the destination > >> >> > than > that > >> >> > of the input predicate in order to avoid a performance overhead. > >> >> > > >> >> > This of course has the problem that it increases register pressure > >> >> > and so > >> should > >> >> > be done with care. Additionally not all micro-architectures have this > >> >> > consideration and so it shouldn't be done as a default thing. > >> >> > > >> >> > The patch series adds support for doing conditional early clobbers > >> >> > through > a > >> >> > combination of new alternatives and attributes to control their > >> >> > availability. > >> >> > >> >> You could have two alternatives, one with early clobber and one with > >> >> a matching constraint where you'd disparage the matching constraint one? > >> >> > >> > > >> > Yeah, that's what I do, though there's no need to disparage the non-early > clobber > >> > alternative as the early clobber alternative will naturally get a > >> > penalty if it > needs a > >> > reload. > >> > >> But I think Richard's suggestion was to disparage the one with a matching > >> constraint (not the earlyclobber), to reflect the increased cost of > >> reusing the register. 
> >> > >> We did take that approach for gathers, e.g.: > >> > >> [, Z, w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s] > >> [?w, Z, 0, Ui1, Ui1, Upl] ^ > >> > >> The (supposed) advantage is that, if register pressure is so tight > >> that using matching registers is the only alternative, we still > >> have the opportunity to do that, as a last resort. > >> > >> Providing only an earlyclobber version means that using the same > >> register is prohibited outright. If no other register is free, the RA > >> would need to spill something else to free up a temporary register. > >> And it might then do the equivalent of (pseudo-code): > >> > >> not p1.b, ..., p0.b > >> mov p0.d, p1.d > >> > >> after spilling what would otherwise have occupied p1. In that > >> situation it would be better use: > >> > >> not p0.b, ..., p0.b > >> > >> and not introduce the spill of p1. > > > > I think I understood what Richi meant, but I thought it was already working > > that > way. > > The suggestion was to use matching constraints (like "0") though, > whereas the patch doesn't. I think your argument is that you don't > need to use matching constraints. But that's different from the > suggestion (and from how we handle gathers). > > I was going to say in response to patch 3 (but got distracted, sorry): > I don't think we should have: > >, Upa, ... >Upa, Upa, ... > > (taken from the pure logic ops) enabled at the same time. Even though > it works for the testcases, I don't think it has well-defined semantics. > > The problem is that, taken on its own, the second alternative says that > matching operands are free. And fundamentally, I don't think the costs > *must* take the earlyclobber alternative over the non-earlyclobber one > (when costing during IRA, for instance). In principle, the cheapest > is best. > > The aim of the gather approach is to make each alternative correct in > isolation. 
In: > > [, Z, w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s] > [?w, Z, 0, Ui1, Ui1, Upl] ^ > > the second alternative says that it is possible to have operands 0 > and 2 be the same vector register, but using that version has th
RE: [PATCH v3] Match: Extract ternary_integer_types_match_p helper func [NFC]
> -Original Message- > From: pan2...@intel.com > Sent: Tuesday, May 21, 2024 2:13 AM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > ; richard.guent...@gmail.com; Pan Li > > Subject: [PATCH v3] Match: Extract ternary_integer_types_match_p helper func > [NFC] > > From: Pan Li > > There are sorts of match pattern for SAT related cases, there will be > some duplicated code to check the dest, op_0, op_1 are same tree types. > Aka ternary tree type matches. Thus, extract one helper function to > do this and avoid match code duplication. > > The below test suites are passed for this patch: > * The rv64gcv fully regression test. > * The x86 bootstrap test. > * The x86 regression test. > > gcc/ChangeLog: > > * match.pd: Leverage helper func for SAT_ADD match. > * tree.cc (ternary_integer_types_match_p): New func impl to > check if ternary tree types are all integer. > * tree.h (ternary_integer_types_match_p): New func decl. > Thanks, looks good to me! You still need approval from a maintainer.. Cheers, Tamar > Signed-off-by: Pan Li > --- > gcc/match.pd | 28 +++- > gcc/tree.cc | 16 > gcc/tree.h | 5 + > 3 files changed, 28 insertions(+), 21 deletions(-) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 0f9c34fa897..cff67c84498 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -39,7 +39,8 @@ along with GCC; see the file COPYING3. If not see > HONOR_NANS > uniform_vector_p > expand_vec_cmp_expr_p > - bitmask_inv_cst_vector_p) > + bitmask_inv_cst_vector_p > + ternary_integer_types_match_p) > > /* Operator lists. 
*/ > (define_operator_list tcc_comparison > @@ -3046,38 +3047,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned Saturation Add */ > (match (usadd_left_part_1 @0 @1) > (plus:c @0 @1) > - (if (INTEGRAL_TYPE_P (type) > - && TYPE_UNSIGNED (TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@1) > + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED > (type > > (match (usadd_left_part_2 @0 @1) > (realpart (IFN_ADD_OVERFLOW:c @0 @1)) > - (if (INTEGRAL_TYPE_P (type) > - && TYPE_UNSIGNED (TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@1) > + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED > (type > > (match (usadd_right_part_1 @0 @1) > (negate (convert (lt (plus:c @0 @1) @0))) > - (if (INTEGRAL_TYPE_P (type) > - && TYPE_UNSIGNED (TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@1) > + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED > (type > > (match (usadd_right_part_1 @0 @1) > (negate (convert (gt @0 (plus:c @0 @1 > - (if (INTEGRAL_TYPE_P (type) > - && TYPE_UNSIGNED (TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@1) > + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED > (type > > (match (usadd_right_part_2 @0 @1) > (negate (convert (ne (imagpart (IFN_ADD_OVERFLOW:c @0 @1)) > integer_zerop))) > - (if (INTEGRAL_TYPE_P (type) > - && TYPE_UNSIGNED (TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@1) > + (if (ternary_integer_types_match_p (type, @0, @1) && TYPE_UNSIGNED > (type > > /* We cannot merge or overload usadd_left_part_1 and usadd_left_part_2 > because the sub part of left_part_2 cannot work with right_part_1. 
> diff --git a/gcc/tree.cc b/gcc/tree.cc > index 6564b002dc1..b59d42c3e47 100644 > --- a/gcc/tree.cc > +++ b/gcc/tree.cc > @@ -10622,6 +10622,22 @@ uniform_integer_cst_p (tree t) >return NULL_TREE; > } > > +/* Check if the types T1, T2 and T3 are effectively the same integer type. > + If T1, T2 or T3 is not a type, the test applies to their TREE_TYPE. */ > + > +bool > +ternary_integer_types_match_p (tree t1, tree t2, tree t3) > +{ > + t1 = TYPE_P (t1) ? t1 : TREE_TYPE (t1); > + t2 = TYPE_P (t2) ? t2 : TREE_TYPE (t2); > + t3 = TYPE_P (t3) ? t3 : TREE_TYPE (t3); > + > + if (!INTEGRAL_TYPE_P (t1) || !INTEGRAL_TYPE_P (t2) || !INTEGRAL_TYPE_P > (t3)) > +return false; > + > + return types_compatible_p (t1, t2) && types_compatible_p (t2
RE: [PATCH v1 1/2] Match: Support branch form for unsigned SAT_ADD
Hi Pan, > -Original Message- > From: pan2...@intel.com > Sent: Monday, May 20, 2024 12:01 PM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > ; richard.guent...@gmail.com; Pan Li > > Subject: [PATCH v1 1/2] Match: Support branch form for unsigned SAT_ADD > > From: Pan Li > > This patch would like to support the branch form for unsigned > SAT_ADD. For example as below: > > uint64_t > sat_add (uint64_t x, uint64_t y) > { > return (uint64_t) (x + y) >= x ? (x + y) : -1; > } > > Different to the branchless version, we leverage the simplify to > convert the branch version of SAT_ADD into branchless if and only > if the backend has supported the IFN_SAT_ADD. Thus, the backend has > the ability to choose branch or branchless implementation of .SAT_ADD. > For example, some target can take care of branches code more optimally. > > When the target implement the IFN_SAT_ADD for unsigned and before this > patch: > uint64_t sat_add_u_1_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > uint64_t _2; > __complex__ long unsigned int _6; > long unsigned int _7; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _6 = .ADD_OVERFLOW (x_3(D), y_4(D)); > _1 = REALPART_EXPR <_6>; > _7 = IMAGPART_EXPR <_6>; > if (_7 == 0) > goto ; [65.00%] > else > goto ; [35.00%] > ;;succ: 4 > ;;3 > > ;; basic block 3, loop depth 0 > ;;pred: 2 > ;;succ: 4 > > ;; basic block 4, loop depth 0 > ;;pred: 3 > ;;2 > # _2 = PHI <18446744073709551615(3), _1(2)> > return _2; > ;;succ: EXIT > > } > > After this patch: > uint64_t sat_add (uint64_t x, uint64_t y) > { > long unsigned int _9; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _9 = .SAT_ADD (x_3(D), y_4(D)); [tail call] > return _9; > ;;succ: EXIT > } > > The below test suites are passed for this patch: > * The x86 bootstrap test. > * The x86 fully regression test. > * The riscv fully regression test. 
> > gcc/ChangeLog: > > * match.pd: Add new simplify to convert branch SAT_ADD into > branchless, if and only if backend implement the IFN. > > Signed-off-by: Pan Li > --- > gcc/match.pd | 18 ++ > 1 file changed, 18 insertions(+) > > diff --git a/gcc/match.pd b/gcc/match.pd > index 0f9c34fa897..0547b57b3a3 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3094,6 +3094,24 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > (match (unsigned_integer_sat_add @0 @1) > (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1))) > > +#if GIMPLE > + > +/* Simplify the branch version of SAT_ADD into branchless if and only if > + the backend has supported the IFN_SAT_ADD. Thus, the backend has the > + ability to choose branch or branchless implementation of .SAT_ADD. */ > + > +(simplify > + (cond (ge (plus:c@2 @0 @1) @0) @2 integer_minus_onep) > + (if (direct_internal_fn_supported_p (IFN_SAT_ADD, type, > OPTIMIZE_FOR_BOTH)) > + (bit_ior @2 (negate (convert (lt @2 @0)) > + > +(simplify > + (cond (le @0 (plus:c@2 @0 @1)) @2 integer_minus_onep) > + (if (direct_internal_fn_supported_p (IFN_SAT_ADD, type, > OPTIMIZE_FOR_BOTH)) > + (bit_ior @2 (negate (convert (lt @2 @0)) > + > +#endif Thanks, this looks good to me! I'll leave it up to Richard to approve, Richard: The reason for the direct_internal_fn_supported_p is because some targets said that they currently handle the branch version better due to the lack of some types. At the time I reason it's just a target expansion bug but didn't hear anything. To be honest, it feels to me like we should do this unconditionally, and just have the targets that get faster branch version to handle it during expand? Since the patch series provides a canonicalized version now. This means we can also better support targets that have the vector optab but not the scalar one as the above check would fail for these targets. What do you think? Thanks, Tamar > + > /* x > y && x != XXX_MIN --> x > y > x > y && x == XXX_MIN --> false . 
*/ > (for eqne (eq ne) > -- > 2.34.1
RE: [PATCH v1] Match: Extract integer_types_ternary_match helper to avoid code dup [NFC]
> -Original Message- > From: pan2...@intel.com > Sent: Sunday, May 19, 2024 5:17 AM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > ; richard.guent...@gmail.com; Pan Li > > Subject: [PATCH v1] Match: Extract integer_types_ternary_match helper to avoid > code dup [NFC] > > From: Pan Li > > There are sorts of match pattern for SAT related cases, there will be > some duplicated code to check the dest, op_0, op_1 are same tree types. > Aka ternary tree type matches. Thus, extract one helper function to > do this and avoid match code duplication. > > The below test suites are passed for this patch: > * The rv64gcv fully regression test. > * The x86 bootstrap test. > * The x86 regression test. > > gcc/ChangeLog: > > * generic-match-head.cc (integer_types_ternary_match): New helper > function to check tenary tree type matches or not. > * gimple-match-head.cc (integer_types_ternary_match): Ditto but > for match. > * match.pd: Leverage above helper function to avoid code dup. > Nice cleanup! This function isn't part of the machinery of match.pd and is instead part of a pattern. For these things we usually put them in tree.h/tree.cc and declare them at the top of match.pd in the "define_predicates" list. This will also allow you to get rid of the code duplication. In addition such functions which return a true/false we consider predicates, and name them ending with _p. See e.g. bitmask_inv_cst_vector_p which is also defined in tree.h/tree.cc. 
Cheers, Tamar > Signed-off-by: Pan Li > --- > gcc/generic-match-head.cc | 17 + > gcc/gimple-match-head.cc | 17 + > gcc/match.pd | 25 + > 3 files changed, 39 insertions(+), 20 deletions(-) > > diff --git a/gcc/generic-match-head.cc b/gcc/generic-match-head.cc > index 0d3f648fe8d..cdd48c7a5cc 100644 > --- a/gcc/generic-match-head.cc > +++ b/gcc/generic-match-head.cc > @@ -59,6 +59,23 @@ types_match (tree t1, tree t2) >return TYPE_MAIN_VARIANT (t1) == TYPE_MAIN_VARIANT (t2); > } > > +/* Routine to determine if the types T1, T2 and T3 are effectively > + the same integer type for GENERIC. If T1, T2 or T3 is not a type, > + the test applies to their TREE_TYPE. */ > + > +static inline bool > +integer_types_ternary_match (tree t1, tree t2, tree t3) > +{ > + t1 = TYPE_P (t1) ? t1 : TREE_TYPE (t1); > + t2 = TYPE_P (t2) ? t2 : TREE_TYPE (t2); > + t3 = TYPE_P (t3) ? t3 : TREE_TYPE (t3); > + > + if (!INTEGRAL_TYPE_P (t1) || !INTEGRAL_TYPE_P (t2) || !INTEGRAL_TYPE_P > (t3)) > +return false; > + > + return types_match (t1, t2) && types_match (t1, t3); > +} > + > /* Return if T has a single use. For GENERIC, we assume this is > always true. */ > > diff --git a/gcc/gimple-match-head.cc b/gcc/gimple-match-head.cc > index 5f8a1a1ad8e..91f2e56b8ef 100644 > --- a/gcc/gimple-match-head.cc > +++ b/gcc/gimple-match-head.cc > @@ -79,6 +79,23 @@ types_match (tree t1, tree t2) >return types_compatible_p (t1, t2); > } > > +/* Routine to determine if the types T1, T2 and T3 are effectively > + the same integer type for GIMPLE. If T1, T2 or T3 is not a type, > + the test applies to their TREE_TYPE. */ > + > +static inline bool > +integer_types_ternary_match (tree t1, tree t2, tree t3) > +{ > + t1 = TYPE_P (t1) ? t1 : TREE_TYPE (t1); > + t2 = TYPE_P (t2) ? t2 : TREE_TYPE (t2); > + t3 = TYPE_P (t3) ? 
t3 : TREE_TYPE (t3); > + > + if (!INTEGRAL_TYPE_P (t1) || !INTEGRAL_TYPE_P (t2) || !INTEGRAL_TYPE_P > (t3)) > +return false; > + > + return types_match (t1, t2) && types_match (t1, t3); > +} > + > /* Return if T has a single use. For GIMPLE, we also allow any > non-SSA_NAME (ie constants) and zero uses to cope with uses > that aren't linked up yet. */ > diff --git a/gcc/match.pd b/gcc/match.pd > index 0f9c34fa897..b291e34bbe4 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3046,38 +3046,23 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > /* Unsigned Saturation Add */ > (match (usadd_left_part_1 @0 @1) > (plus:c @0 @1) > - (if (INTEGRAL_TYPE_P (type) > - && TYPE_UNSIGNED (TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@0)) > - && types_match (type, TREE_TYPE (@1) > + (if (TYPE_UNSIGNED (type) && integer_types_ternary_match (type, @0, @1 > > (match (usadd_left_part_2 @0 @1) > (realpart (IFN_ADD_OVERFLOW:c @0 @1)) > - (if (INTEGRAL_TYPE_P (type) > - && TYPE_UNSIGNED (TREE_TYPE (@0)) > - &&
RE: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int
Hi Pan, > > Hi Tamar, > > I am trying to add more shape(s) like below branch version for SAT_ADD. I > suspect > that widening_mul may not be the best place to take care of this shape. > Because after_dom_children almost works on bb but we actually need to find the > def/use cross the bb. It actually already does this, see for example optimize_spaceship which optimizes across basic blocks. However... > > Thus, is there any suggestion for branch shape? Add new simplify to match.pd > works well but it is not recommended per previous discussion. The objection previously was not to introduce the IFNs at match.pd, it doesn't mean we can't use match.pd to force the versions with branches to banchless code so the existing patterns can deal with them as is. ...in this case something like this: #if GIMPLE (simplify (cond (ge (plus:c@3 @0 @1) @0) @3 integer_minus_onep) (if (direct_internal_fn_supported_p (...)) (bit_ior @3 (negate (...) #endif Works better I think. That is, for targets we know we can optimize it later on, or do something with it in the vectorizer we canonicalize it. The reason I have it guarded with the IFN is that some target maintainers objected to replacing the branch code with branchless code as their targets can more optimally deal with branches. Cheers, Tamar > > Thanks a lot for help! > > Pan > > ---Source code- > > #define SAT_ADD_U_1(T) \ > T sat_add_u_1_##T(T x, T y) \ > { \ > return (T)(x + y) >= x ? 
(x + y) : -1; \ > } > > SAT_ADD_U_1(uint16_t) > > ---Gimple- > > uint16_t sat_add_u_1_uint16_t (uint16_t x, uint16_t y) > { > short unsigned int _1; > uint16_t _2; > >[local count: 1073741824]: > _1 = x_3(D) + y_4(D); > if (_1 >= x_3(D)) > goto ; [65.00%] > else > goto ; [35.00%] > >[local count: 697932184]: > > [local count: 1073741824]: > # _2 = PHI <65535(2), _1(3)> > return _2; > } > > Pan > > -Original Message- > From: Tamar Christina > Sent: Wednesday, May 15, 2024 5:12 PM > To: Li, Pan2 ; gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; > Liu, Hongtao > Subject: RE: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar int > > Hi Pan, > > Thanks! > > > -Original Message- > > From: pan2...@intel.com > > Sent: Wednesday, May 15, 2024 3:14 AM > > To: gcc-patches@gcc.gnu.org > > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > > ; richard.guent...@gmail.com; > > hongtao@intel.com; Pan Li > > Subject: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar > > int > > > > From: Pan Li > > > > This patch would like to add the middle-end presentation for the > > saturation add. Aka set the result of add to the max when overflow. > > It will take the pattern similar as below. > > > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > > > Take uint8_t as example, we will have: > > > > * SAT_ADD (1, 254) => 255. > > * SAT_ADD (1, 255) => 255. > > * SAT_ADD (2, 255) => 255. > > * SAT_ADD (255, 255) => 255. 
> > > > Given below example for the unsigned scalar integer uint64_t: > > > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > > { > > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > > } > > > > Before this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > long unsigned int _1; > > _Bool _2; > > long unsigned int _3; > > long unsigned int _4; > > uint64_t _7; > > long unsigned int _10; > > __complex__ long unsigned int _11; > > > > ;; basic block 2, loop depth 0 > > ;;pred: ENTRY > > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > > _1 = REALPART_EXPR <_11>; > > _10 = IMAGPART_EXPR <_11>; > > _2 = _10 != 0; > > _3 = (long unsigned int) _2; > > _4 = -_3; > > _7 = _1 | _4; > > return _7; > > ;;succ: EXIT > > > > } > > > > After this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > uint64_t _7; > > > > ;; basic block 2, loop depth 0 > > ;;pred: ENTRY > > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > > return _7; > > ;;succ: EXIT > > } > > > > The below tests are passed for this patch: > > 1. The riscv fully regression tests. > > 3. The x86 bootstrap tests. > &g
RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer
> -Original Message- > From: Richard Biener > Sent: Friday, May 17, 2024 10:46 AM > To: Tamar Christina > Cc: Victor Do Nascimento ; gcc- > patc...@gcc.gnu.org; Richard Sandiford ; Richard > Earnshaw ; Victor Do Nascimento > > Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in > autovectorizer > > On Fri, May 17, 2024 at 11:05 AM Tamar Christina > wrote: > > > > > -Original Message- > > > From: Richard Biener > > > Sent: Friday, May 17, 2024 6:51 AM > > > To: Victor Do Nascimento > > > Cc: gcc-patches@gcc.gnu.org; Richard Sandiford > ; > > > Richard Earnshaw ; Victor Do Nascimento > > > > > > Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in > > > autovectorizer > > > > > > On Thu, May 16, 2024 at 4:40 PM Victor Do Nascimento > > > wrote: > > > > > > > > From: Victor Do Nascimento > > > > > > > > At present, the compiler offers the `{u|s|us}dot_prod_optab' direct > > > > optabs for dealing with vectorizable dot product code sequences. The > > > > consequence of using a direct optab for this is that backend-pattern > > > > selection is only ever able to match against one datatype - Either > > > > that of the operands or of the accumulated value, never both. > > > > > > > > With the introduction of the 2-way (un)signed dot-product insn [1][2] > > > > in AArch64 SVE2, the existing direct opcode approach is no longer > > > > sufficient for full specification of all the possible dot product > > > > machine instructions to be matched to the code sequence; a dot product > > > > resulting in VNx4SI may result from either dot products on VNx16QI or > > > > VNx8HI values for the 4- and 2-way dot product operations, respectively. 
> > > > > > > > This means that the following example fails autovectorization: > > > > > > > > uint32_t foo(int n, uint16_t* data) { > > > > uint32_t sum = 0; > > > > for (int i=0; i > > > sum += data[i] * data[i]; > > > > } > > > > return sum; > > > > } > > > > > > > > To remedy the issue a new optab is added, tentatively named > > > > `udot_prod_twoway_optab', whose selection is dependent upon checking > > > > of both input and output types involved in the operation. > > > > > > I don't like this too much. I'll note we document dot_prod as > > > > > > @cindex @code{sdot_prod@var{m}} instruction pattern > > > @item @samp{sdot_prod@var{m}} > > > > > > Compute the sum of the products of two signed elements. > > > Operand 1 and operand 2 are of the same mode. Their > > > product, which is of a wider mode, is computed and added to operand 3. > > > Operand 3 is of a mode equal or wider than the mode of the product. The > > > result is placed in operand 0, which is of the same mode as operand 3. > > > @var{m} is the mode of operand 1 and operand 2. > > > > > > with no restriction on the wider mode but we don't specify it which is > > > bad design. This should have been a convert optab with two modes > > > from the start - adding a _twoway variant is just a hack. > > > > We did discuss this at the time we started implementing it. There was two > > options, one was indeed to change it to a convert dot_prod optab, but doing > > this means we have to update every target that uses it. > > > > Now that means 3 ISAs for AArch64, Arm, Arc, c6x, 2 for x86, loongson and > altivec. > > > > Which sure could be possible, but there's also every use in the backends > > that > need > > to be updated, and tested, which for some targets we don't even know how to > begin. > > > > So it seems very hard to correct dotprod to a convert optab now. > > It's still the correct way to go. 
At _least_ your new pattern should > have been this, > otherwise what do you do when you have two-way, four-way and eight-way > variants? > Add yet another optab? I guess that's fair, but having the new optab only be convert resulted in messy code as everywhere you must check for both variants. Additionally that optab would then overlap with the existing optabs as, as you Say, the documentation only says it's of a wider type and doesn't indicate precision. So to avoid issues down the line then If the new optab isn't acceptable then we'll have to do a wholesale conversion then.. > > Another thing is that when you do it
RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer
> -Original Message- > From: Hongtao Liu > Sent: Friday, May 17, 2024 3:14 AM > To: Victor Do Nascimento > Cc: gcc-patches@gcc.gnu.org; Richard Sandiford ; > Richard Earnshaw ; Victor Do Nascimento > > Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in > autovectorizer > > > > > > Sorry to chime in, for x86 backend, we defined usdot_prodv16hi, and > > 2-way dot_prod operations can be generated > > > This is the link https://godbolt.org/z/hcWr64vx3, x86 define > udot_prodv16qi/udot_prod8hi and both 2-way and 4-way dot_prod > instructions are generated > That's not the same, the 2-way vs 4-way dot_prod here is that e.g. udot_prod8hi can reduce to either DImode or SImode. udot_prod8hi does not have enough information to distinguish the two and in RTL you can't overload the names. So this is about the ISA having instructions that overlap on the source mode of the instruction. Tamar > > -- > BR, > Hongtao
RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer
> -Original Message- > From: Richard Biener > Sent: Friday, May 17, 2024 6:51 AM > To: Victor Do Nascimento > Cc: gcc-patches@gcc.gnu.org; Richard Sandiford ; > Richard Earnshaw ; Victor Do Nascimento > > Subject: Re: [PATCH] middle-end: Expand {u|s}dot product support in > autovectorizer > > On Thu, May 16, 2024 at 4:40 PM Victor Do Nascimento > wrote: > > > > From: Victor Do Nascimento > > > > At present, the compiler offers the `{u|s|us}dot_prod_optab' direct > > optabs for dealing with vectorizable dot product code sequences. The > > consequence of using a direct optab for this is that backend-pattern > > selection is only ever able to match against one datatype - Either > > that of the operands or of the accumulated value, never both. > > > > With the introduction of the 2-way (un)signed dot-product insn [1][2] > > in AArch64 SVE2, the existing direct opcode approach is no longer > > sufficient for full specification of all the possible dot product > > machine instructions to be matched to the code sequence; a dot product > > resulting in VNx4SI may result from either dot products on VNx16QI or > > VNx8HI values for the 4- and 2-way dot product operations, respectively. > > > > This means that the following example fails autovectorization: > > > > uint32_t foo(int n, uint16_t* data) { > > uint32_t sum = 0; > > for (int i=0; i > sum += data[i] * data[i]; > > } > > return sum; > > } > > > > To remedy the issue a new optab is added, tentatively named > > `udot_prod_twoway_optab', whose selection is dependent upon checking > > of both input and output types involved in the operation. > > I don't like this too much. I'll note we document dot_prod as > > @cindex @code{sdot_prod@var{m}} instruction pattern > @item @samp{sdot_prod@var{m}} > > Compute the sum of the products of two signed elements. > Operand 1 and operand 2 are of the same mode. Their > product, which is of a wider mode, is computed and added to operand 3. 
> Operand 3 is of a mode equal or wider than the mode of the product. The > result is placed in operand 0, which is of the same mode as operand 3. > @var{m} is the mode of operand 1 and operand 2. > > with no restriction on the wider mode but we don't specify it which is > bad design. This should have been a convert optab with two modes > from the start - adding a _twoway variant is just a hack. We did discuss this at the time we started implementing it. There was two options, one was indeed to change it to a convert dot_prod optab, but doing this means we have to update every target that uses it. Now that means 3 ISAs for AArch64, Arm, Arc, c6x, 2 for x86, loongson and altivec. Which sure could be possible, but there's also every use in the backends that need to be updated, and tested, which for some targets we don't even know how to begin. So it seems very hard to correct dotprod to a convert optab now. Tamar > > Richard. > > > In order to minimize changes to the existing codebase, > > `optab_for_tree_code' is renamed `optab_for_tree_code_1' and a new > > argument is added to its signature - `const_tree otype', allowing type > > information to be specified for both input and output types. The > > existing nterface is retained by defining a new `optab_for_tree_code', > > which serves as a shim to `optab_for_tree_code_1', passing old > > parameters as-is and setting the new `optype' argument to `NULL_TREE'. > > > > For DOT_PROD_EXPR tree codes, we can call `optab_for_tree_code_1' > > directly, passing it both types, adding the internal logic to the > > function to distinguish between competing optabs. > > > > Finally, necessary changes are made to `expand_widen_pattern_expr' to > > ensure the new icode can be correctly selected, given the new optab. 
> > > > [1] https://developer.arm.com/documentation/ddi0602/2024-03/SVE- > Instructions/UDOT--2-way--vectors---Unsigned-integer-dot-product- > > [2] https://developer.arm.com/documentation/ddi0602/2024-03/SVE- > Instructions/SDOT--2-way--vectors---Signed-integer-dot-product- > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-sve2.md > > (@aarch64_sve_dotvnx4sivnx8hi): > > renamed to `dot_prod_twoway_vnx8hi'. > > * config/aarch64/aarch64-sve-builtins-base.cc (svdot_impl.expand): > > update icodes used in line with above rename. > > * optabs-tree.cc (optab_for_tree_code_1): Renamed > > `optab_for_tree_code' and added new argument. > > (optab_for_tree_code): Now a call to `optab_for_tree_code_1'. > > * optabs-tree.h (optab_for_tree_code_1): New. > > * optabs.cc (expand_widen_pattern_expr): Expand support for > > DOT_PROD_EXPR patterns. > > * optabs.def (udot_prod_twoway_optab): New. > > (sdot_prod_twoway_optab): Likewise. > > * tree-vect-patterns.cc (vect_supportable_direct_optab_p): Add > > support for misc optabs that use two modes. > > > > gcc/testsuite/ChangeLog: > > > > *
RE: [PATCH] middle-end: Drop __builtin_prefetch calls in autovectorization [PR114061]
Hi, > -Original Message- > From: Victor Do Nascimento > Sent: Thursday, May 16, 2024 2:57 PM > To: gcc-patches@gcc.gnu.org > Cc: Richard Sandiford ; Richard Earnshaw > ; Victor Do Nascimento > > Subject: [PATCH] middle-end: Drop __builtin_pretech calls in autovectorization > [PR114061]' > > At present the autovectorizer fails to vectorize simple loops > involving calls to `__builtin_prefetch'. A simple example of such > loop is given below: > > void foo(double * restrict a, double * restrict b, int n){ > int i; > for(i=0; i a[i] = a[i] + b[i]; > __builtin_prefetch(&(b[i+8])); > } > } > > The failure stems from two issues: > > 1. Given that it is typically not possible to fully reason about a >function call due to the possibility of side effects, the >autovectorizer does not attempt to vectorize loops which make such >calls. > >Given the memory reference passed to `__builtin_prefetch', in the >absence of assurances about its effect on the passed memory >location the compiler deems the function unsafe to vectorize, >marking it as clobbering memory in `vect_find_stmt_data_reference'. >This leads to the failure in autovectorization. > > 2. Notwithstanding the above issue, though the prefetch statement >would be classed as `vect_unused_in_scope', the loop invariant that >is used in the address of the prefetch is the scalar loop's and not >the vector loop's IV. That is, it still uses `i' and not `vec_iv' >because the instruction wasn't vectorized, causing DCE to think the >value is live, such that we now have both the vector and scalar loop >invariant actively used in the loop. > > This patch addresses both of these: > > 1. About the issue regarding the memory clobber, data prefetch does >not generate faults if its address argument is invalid and does not >write to memory. Therefore, it does not alter the internal state >of the program or its control flow under any circumstance. As >such, it is reasonable that the function be marked as not affecting >memory contents. 
> >To achieve this, we add the necessary logic to >`get_references_in_stmt' to ensure that builtin functions are given >given the same treatment as internal functions. If the gimple call >is to a builtin function and its function code is >`BUILT_IN_PREFETCH', we mark `clobbers_memory' as false. > > 2. Finding precedence in the way clobber statements are handled, >whereby the vectorizer drops these from both the scalar and >vectorized versions of a given loop, we choose to drop prefetch >hints in a similar fashion. This seems appropriate given how >software prefetch hints are typically ignored by processors across >architectures, as they seldom lead to performance gain over their >hardware counterparts. > >PR target/114061 > > gcc/ChangeLog: > > * tree-data-ref.cc (get_references_in_stmt): set > `clobbers_memory' to false for __builtin_prefetch. > * tree-vect-loop.cc (vect_transform_loop): Drop all > __builtin_prefetch calls from loops. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/vect-prefetch-drop.c: New test. > --- > gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c | 14 ++ > gcc/tree-data-ref.cc | 9 + > gcc/tree-vect-loop.cc | 7 ++- > 3 files changed, 29 insertions(+), 1 deletion(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c > b/gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c > new file mode 100644 > index 000..57723a8c972 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/vect-prefetch-drop.c > @@ -0,0 +1,14 @@ > +/* { dg-do compile { target { aarch64*-*-* } } } */ > +/* { dg-additional-options "-march=-O3 -march=armv9.2-a+sve -fdump-tree- > vect-details" { target { aarch64*-*-* } } } */ > + See the review about two-way dotprod for comments on this. However this specific test does not need to check for any assembly instructions. You're going from being unable to vectorize a function, to being able to vectorize It. 
So the `vectorized 1 loops` check is sufficient, then this will work for all targets. This requires a check on vect_double (see gcc/testsuite/lib/target-supports.exp) I'd also change the loop to just use int, as more targets will support vectorizing those, (and of course at a vect_int check instead) > +void foo(double * restrict a, double * restrict b, int n){ > + int i; > + for(i=0; i +a[i] = a[i] + b[i]; > +__builtin_prefetch(&(b[i+8])); > + } > +} > + > +/* { dg-final { scan-assembler-not "prfm" } } */ > +/* { dg-final { scan-assembler "fadd\tz\[0-9\]+.d, p\[0-9\]+/m, z\[0-9\]+.d, > z\[0- > 9\]+.d" } } */ > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc >
RE: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer
Hi Victor, > -Original Message- > From: Victor Do Nascimento > Sent: Thursday, May 16, 2024 3:39 PM > To: gcc-patches@gcc.gnu.org > Cc: Richard Sandiford ; Richard Earnshaw > ; Victor Do Nascimento > > Subject: [PATCH] middle-end: Expand {u|s}dot product support in autovectorizer > > From: Victor Do Nascimento > > At present, the compiler offers the `{u|s|us}dot_prod_optab' direct > optabs for dealing with vectorizable dot product code sequences. The > consequence of using a direct optab for this is that backend-pattern > selection is only ever able to match against one datatype - Either > that of the operands or of the accumulated value, never both. > > With the introduction of the 2-way (un)signed dot-product insn [1][2] > in AArch64 SVE2, the existing direct opcode approach is no longer > sufficient for full specification of all the possible dot product > machine instructions to be matched to the code sequence; a dot product > resulting in VNx4SI may result from either dot products on VNx16QI or > VNx8HI values for the 4- and 2-way dot product operations, respectively. > > This means that the following example fails autovectorization: > > uint32_t foo(int n, uint16_t* data) { > uint32_t sum = 0; > for (int i=0; i sum += data[i] * data[i]; > } > return sum; > } > > To remedy the issue a new optab is added, tentatively named > `udot_prod_twoway_optab', whose selection is dependent upon checking > of both input and output types involved in the operation. > > In order to minimize changes to the existing codebase, > `optab_for_tree_code' is renamed `optab_for_tree_code_1' and a new > argument is added to its signature - `const_tree otype', allowing type > information to be specified for both input and output types. The > existing nterface is retained by defining a new `optab_for_tree_code', > which serves as a shim to `optab_for_tree_code_1', passing old > parameters as-is and setting the new `optype' argument to `NULL_TREE'. 
> > For DOT_PROD_EXPR tree codes, we can call `optab_for_tree_code_1' > directly, passing it both types, adding the internal logic to the > function to distinguish between competing optabs. > > Finally, necessary changes are made to `expand_widen_pattern_expr' to > ensure the new icode can be correctly selected, given the new optab. > > [1] https://developer.arm.com/documentation/ddi0602/2024-03/SVE- > Instructions/UDOT--2-way--vectors---Unsigned-integer-dot-product- > [2] https://developer.arm.com/documentation/ddi0602/2024-03/SVE- > Instructions/SDOT--2-way--vectors---Signed-integer-dot-product- > > gcc/ChangeLog: > > * config/aarch64/aarch64-sve2.md > (@aarch64_sve_dotvnx4sivnx8hi): > renamed to `dot_prod_twoway_vnx8hi'. > * config/aarch64/aarch64-sve-builtins-base.cc (svdot_impl.expand): > update icodes used in line with above rename. Please split the target specific bits from the target agnostic parts. I.e. this patch series should be split in two. > * optabs-tree.cc (optab_for_tree_code_1): Renamed > `optab_for_tree_code' and added new argument. > (optab_for_tree_code): Now a call to `optab_for_tree_code_1'. > * optabs-tree.h (optab_for_tree_code_1): New. > * optabs.cc (expand_widen_pattern_expr): Expand support for > DOT_PROD_EXPR patterns. > * optabs.def (udot_prod_twoway_optab): New. > (sdot_prod_twoway_optab): Likewise. > * tree-vect-patterns.cc (vect_supportable_direct_optab_p): Add > support for misc optabs that use two modes. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/vect-dotprod-twoway.c: New. 
> --- > .../aarch64/aarch64-sve-builtins-base.cc | 4 ++-- > gcc/config/aarch64/aarch64-sve2.md| 2 +- > gcc/optabs-tree.cc| 23 -- > gcc/optabs-tree.h | 2 ++ > gcc/optabs.cc | 2 +- > gcc/optabs.def| 2 ++ > .../gcc.dg/vect/vect-dotprod-twoway.c | 24 +++ > gcc/tree-vect-patterns.cc | 2 +- > 8 files changed, 54 insertions(+), 7 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-dotprod-twoway.c > > diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > index 0d2edf3f19e..e457db09f66 100644 > --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc > +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc > @@ -764,8 +764,8 @@ public: >icode = (e.type_suffix (0).float_p > ? CODE_FOR_aarch64_sve_fdotvnx4sfvnx8hf > : e.type_suffix (0).unsigned_p > -? CODE_FOR_aarch64_sve_udotvnx4sivnx8hi > -: CODE_FOR_aarch64_sve_sdotvnx4sivnx8hi); > +? CODE_FOR_udot_prod_twoway_vnx8hi > +: CODE_FOR_sdot_prod_twoway_vnx8hi); > return e.use_unpred_insn (icode); >} > }; > diff --git
RE: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit
> -Original Message- > From: pan2...@intel.com > Sent: Thursday, May 16, 2024 5:06 AM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > ; richard.guent...@gmail.com; Richard Sandiford > ; Pan Li > Subject: [PATCH v2 1/3] Vect: Support loop len in vectorizable early exit > > From: Pan Li > > This patch adds early break auto-vectorization support for target which > use length on partial vectorization. Consider this following example: > > unsigned vect_a[802]; > unsigned vect_b[802]; > > void test (unsigned x, int n) > { > for (int i = 0; i < n; i++) > { > vect_b[i] = x + i; > > if (vect_a[i] > x) > break; > > vect_a[i] = x; > } > } > > We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias). > And then the IR of RVV looks like below: > > ... > _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]); > _55 = (int) _87; > ... > mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67; > vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \ > {0, ... }, _87, 0); > if (vec_len_mask_72 != { 0, ... }) > goto ; [5.50%] > else > goto ; [94.50%] > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 2. The x86 bootstrap tests. > 3. The x86 fully regression tests. > > gcc/ChangeLog: > > * tree-vect-stmts.cc (vectorizable_early_exit): Add loop len > handling for one or multiple stmt. > > gcc/ChangeLog: > > * tree-vect-loop.cc (vect_gen_loop_len_mask): New func to gen > the loop len mask. > * tree-vect-stmts.cc (vectorizable_early_exit): Invoke the > vect_gen_loop_len_mask for 1 or more stmt(s). > * tree-vectorizer.h (vect_gen_loop_len_mask): New func decl > for vect_gen_loop_len_mask. > Thanks, this version looks good to me! You'll need Richi's review still. 
Cheers, Tamar > Signed-off-by: Pan Li > --- > gcc/tree-vect-loop.cc | 27 +++ > gcc/tree-vect-stmts.cc | 17 +++-- > gcc/tree-vectorizer.h | 4 > 3 files changed, 46 insertions(+), 2 deletions(-) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 361aec06488..83c0544b6aa 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -11416,6 +11416,33 @@ vect_get_loop_len (loop_vec_info loop_vinfo, > gimple_stmt_iterator *gsi, >return loop_len; > } > > +/* Generate the tree for the loop len mask and return it. Given the lens, > + nvectors, vectype, index and factor to gen the len mask as below. > + > + tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias) > +*/ > +tree > +vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi, > + gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens, > + unsigned int nvectors, tree vectype, tree stmt, > + unsigned int index, unsigned int factor) > +{ > + tree all_one_mask = build_all_ones_cst (vectype); > + tree all_zero_mask = build_zero_cst (vectype); > + tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, > index, > + factor); > + tree bias = build_int_cst (intQI_type_node, > + LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS > (loop_vinfo)); > + tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, > "vec_len_mask"); > + gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt, > + all_one_mask, all_zero_mask, len, > + bias); > + gimple_call_set_lhs (call, len_mask); > + gsi_insert_before (cond_gsi, call, GSI_SAME_STMT); > + > + return len_mask; > +} > + > /* Scale profiling counters by estimation for LOOP which is vectorized > by factor VF. 
> If FLAT is true, the loop we started with had unrealistically flat > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index b8a71605f1b..672959501bb 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -12895,7 +12895,9 @@ vectorizable_early_exit (vec_info *vinfo, > stmt_vec_info stmt_info, > ncopies = vect_get_num_copies (loop_vinfo, vectype); > >vec_loop_masks *masks = _VINFO_MASKS (loop_vinfo); > + vec_loop_lens *lens = _VINFO_LENS (loop_vinfo); >bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); > + bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo); > >/* Now build the new conditional. Pattern gimple_conds get dropped
RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.
> -Original Message- > From: Richard Sandiford > Sent: Wednesday, May 15, 2024 10:31 PM > To: Tamar Christina > Cc: Richard Biener ; gcc-patches@gcc.gnu.org; nd > ; Richard Earnshaw ; Marcus > Shawcroft ; ktkac...@gcc.gnu.org > Subject: Re: [PATCH 0/4]AArch64: support conditional early clobbers on certain > operations. > > Tamar Christina writes: > >> >> On Wed, May 15, 2024 at 12:29 PM Tamar Christina > >> >> wrote: > >> >> > > >> >> > Hi All, > >> >> > > >> >> > Some Neoverse Software Optimization Guides (SWoG) have a clause that > state > >> >> > that for predicated operations that also produce a predicate it is > >> >> > preferred > >> >> > that the codegen should use a different register for the destination > >> >> > than > that > >> >> > of the input predicate in order to avoid a performance overhead. > >> >> > > >> >> > This of course has the problem that it increases register pressure > >> >> > and so > >> should > >> >> > be done with care. Additionally not all micro-architectures have this > >> >> > consideration and so it shouldn't be done as a default thing. > >> >> > > >> >> > The patch series adds support for doing conditional early clobbers > >> >> > through > a > >> >> > combination of new alternatives and attributes to control their > >> >> > availability. > >> >> > >> >> You could have two alternatives, one with early clobber and one with > >> >> a matching constraint where you'd disparage the matching constraint one? > >> >> > >> > > >> > Yeah, that's what I do, though there's no need to disparage the non-early > clobber > >> > alternative as the early clobber alternative will naturally get a > >> > penalty if it > needs a > >> > reload. > >> > >> But I think Richard's suggestion was to disparage the one with a matching > >> constraint (not the earlyclobber), to reflect the increased cost of > >> reusing the register. 
> >> > >> We did take that approach for gathers, e.g.: > >> > >> [, Z, w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s] > >> [?w, Z, 0, Ui1, Ui1, Upl] ^ > >> > >> The (supposed) advantage is that, if register pressure is so tight > >> that using matching registers is the only alternative, we still > >> have the opportunity to do that, as a last resort. > >> > >> Providing only an earlyclobber version means that using the same > >> register is prohibited outright. If no other register is free, the RA > >> would need to spill something else to free up a temporary register. > >> And it might then do the equivalent of (pseudo-code): > >> > >> not p1.b, ..., p0.b > >> mov p0.d, p1.d > >> > >> after spilling what would otherwise have occupied p1. In that > >> situation it would be better use: > >> > >> not p0.b, ..., p0.b > >> > >> and not introduce the spill of p1. > > > > I think I understood what Richi meant, but I thought it was already working > > that > way. > > The suggestion was to use matching constraints (like "0") though, > whereas the patch doesn't. I think your argument is that you don't > need to use matching constraints. But that's different from the > suggestion (and from how we handle gathers). > > I was going to say in response to patch 3 (but got distracted, sorry): > I don't think we should have: > >, Upa, ... >Upa, Upa, ... > > (taken from the pure logic ops) enabled at the same time. Even though > it works for the testcases, I don't think it has well-defined semantics. > > The problem is that, taken on its own, the second alternative says that > matching operands are free. And fundamentally, I don't think the costs > *must* take the earlyclobber alternative over the non-earlyclobber one > (when costing during IRA, for instance). In principle, the cheapest > is best. > > The aim of the gather approach is to make each alternative correct in > isolation. 
In: > > [, Z, w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s] > [?w, Z, 0, Ui1, Ui1, Upl] ^ > > the second alternative says that it is possible to have operands 0 > and 2 be the same vector register, but using that version has the
RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.
> >> On Wed, May 15, 2024 at 12:29 PM Tamar Christina > >> wrote: > >> > > >> > Hi All, > >> > > >> > Some Neoverse Software Optimization Guides (SWoG) have a clause that > >> > state > >> > that for predicated operations that also produce a predicate it is > >> > preferred > >> > that the codegen should use a different register for the destination > >> > than that > >> > of the input predicate in order to avoid a performance overhead. > >> > > >> > This of course has the problem that it increases register pressure and so > should > >> > be done with care. Additionally not all micro-architectures have this > >> > consideration and so it shouldn't be done as a default thing. > >> > > >> > The patch series adds support for doing conditional early clobbers > >> > through a > >> > combination of new alternatives and attributes to control their > >> > availability. > >> > >> You could have two alternatives, one with early clobber and one with > >> a matching constraint where you'd disparage the matching constraint one? > >> > > > > Yeah, that's what I do, though there's no need to disparage the non-early > > clobber > > alternative as the early clobber alternative will naturally get a penalty > > if it needs a > > reload. > > But I think Richard's suggestion was to disparage the one with a matching > constraint (not the earlyclobber), to reflect the increased cost of > reusing the register. > > We did take that approach for gathers, e.g.: > > [, Z, w, Ui1, Ui1, Upl] ld1\t%0.s, %5/z, [%2.s] > [?w, Z, 0, Ui1, Ui1, Upl] ^ > > The (supposed) advantage is that, if register pressure is so tight > that using matching registers is the only alternative, we still > have the opportunity to do that, as a last resort. > > Providing only an earlyclobber version means that using the same > register is prohibited outright. If no other register is free, the RA > would need to spill something else to free up a temporary register. 
> And it might then do the equivalent of (pseudo-code): > > not p1.b, ..., p0.b > mov p0.d, p1.d > > after spilling what would otherwise have occupied p1. In that > situation it would be better use: > > not p0.b, ..., p0.b > > and not introduce the spill of p1. I think I understood what Richi meant, but I thought it was already working that way. i.e. as one of the testcases I had: > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 > -ffixed-p[1-15] foo: mov z31.h, w0 ptrue p0.b, all cmplo p0.h, p0/z, z0.h, z31.h b use and reload did not force a spill. My understanding of how this works, and how it seems to be working is that since reload costs Alternative from front to back the cheapest one wins and it stops evaluating the rest. The early clobber case is first and preferred, however when it's not possible, i.e. requires a non-pseudo reload, the reload cost is added to the alternative. However you're right that in the following testcase: -mcpu=neoverse-n2 -ffixed-p1 -ffixed-p2 -ffixed-p3 -ffixed-p4 -ffixed-p5 -ffixed-p6 -ffixed-p7 -ffixed-p8 -ffixed-p9 -ffixed-p10 -ffixed-p11 -ffixed-p12 -ffixed-p12 -ffixed-p13 -ffixed-p14 -ffixed-p14 -fdump-rtl-reload i.e. giving it an extra free register inexplicably causes a spill: foo: addvl sp, sp, #-1 mov z31.h, w0 ptrue p0.b, all str p15, [sp] cmplo p15.h, p0/z, z0.h, z31.h mov p0.b, p15.b ldr p15, [sp] addvl sp, sp, #1 b use so that's unexpected and is very weird as p15 has no defined value.. Now adding the ? as suggested to the non-early clobber alternative does not fix it, and my mental model for how this is supposed to work does not quite line up.. Why would making the non-clobber alternative even more expensive help it during high register pressure?? 
But with that suggestion the above case does not get fixed and the following case -mcpu=neoverse-n2 -ffixed-p1 -ffixed-p2 -ffixed-p3 -ffixed-p4 -ffixed-p5 -ffixed-p6 -ffixed-p7 -ffixed-p8 -ffixed-p9 -ffixed-p10 -ffixed-p11 -ffixed-p12 -ffixed-p12 -ffixed-p13 -ffixed-p14 -ffixed-p15 -fdump-rtl-reload ICEs: pred-clobber.c: In function 'foo': pred-clobber.c:9:1: error: unable to find a register to spill 9 | } | ^ pred-clobber.c:9:1: error: this is the insn: (insn 10 22 19 2 (parallel [ (set (reg:VNx8BI 110 [104]) (unspec:VNx8BI [ (reg:VNx8BI 112) (const_int 1 [0x1])
RE: [PATCH 0/4]AArch64: support conditional early clobbers on certain operations.
> -Original Message- > From: Richard Biener > Sent: Wednesday, May 15, 2024 12:20 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; ktkac...@gcc.gnu.org; Richard Sandiford > > Subject: Re: [PATCH 0/4]AArch64: support conditional early clobbers on certain > operations. > > On Wed, May 15, 2024 at 12:29 PM Tamar Christina > wrote: > > > > Hi All, > > > > Some Neoverse Software Optimization Guides (SWoG) have a clause that state > > that for predicated operations that also produce a predicate it is preferred > > that the codegen should use a different register for the destination than > > that > > of the input predicate in order to avoid a performance overhead. > > > > This of course has the problem that it increases register pressure and so > > should > > be done with care. Additionally not all micro-architectures have this > > consideration and so it shouldn't be done as a default thing. > > > > The patch series adds support for doing conditional early clobbers through a > > combination of new alternatives and attributes to control their > > availability. > > You could have two alternatives, one with early clobber and one with > a matching constraint where you'd disparage the matching constraint one? > Yeah, that's what I do, though there's no need to disparage the non-early clobber alternative as the early clobber alternative will naturally get a penalty if it needs a reload. Cheers, Tamar > > On high register pressure we also use LRA's costing to prefer not to use the > > alternative and instead just use the tie as this is preferable to a reload. 
> > > > Concretely this patch series does: > > > > > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 > > > > foo: > > mov z31.h, w0 > > ptrue p3.b, all > > cmplo p0.h, p3/z, z0.h, z31.h > > b use > > > > > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n1+sve > > > > foo: > > mov z31.h, w0 > > ptrue p0.b, all > > cmplo p0.h, p0/z, z0.h, z31.h > > b use > > > > > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 - > ffixed-p[1-15] > > > > foo: > > mov z31.h, w0 > > ptrue p0.b, all > > cmplo p0.h, p0/z, z0.h, z31.h > > b use > > > > Testcases for the changes are in the last patch of the series. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Thanks, > > Tamar > > > > --- > > > > --
RE: [PATCH 2/4]AArch64: add new tuning param and attribute for enabling conditional early clobber
> -Original Message- > From: Richard Sandiford > Sent: Wednesday, May 15, 2024 11:56 AM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; ktkac...@gcc.gnu.org > Subject: Re: [PATCH 2/4]AArch64: add new tuning param and attribute for > enabling conditional early clobber > > Tamar Christina writes: > > Hi All, > > > > This adds a new tuning parameter EARLY_CLOBBER_SVE_PRED_DEST for AArch64 > to > > allow us to conditionally enable the early clobber alternatives based on the > > tuning models. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-tuning-flags.def > > (EARLY_CLOBBER_SVE_PRED_DEST): New. > > * config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New. > > * config/aarch64/aarch64.md (pred_clobber): New. > > (arch_enabled): Use it. > > > > --- > > diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def > b/gcc/config/aarch64/aarch64-tuning-flags.def > > index > d5bcaebce770f0b217aac783063d39135f754c77..49fbad3ff28bc82b25c61ac50 > 1ccf533ec4b4c3f 100644 > > --- a/gcc/config/aarch64/aarch64-tuning-flags.def > > +++ b/gcc/config/aarch64/aarch64-tuning-flags.def > > @@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION > ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA) > > > > AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", > FULLY_PIPELINED_FMA) > > > > +/* Enable is the target prefers to use a fresh register for predicate > > outputs > > + rather than re-use an input predicate register. */ > > +AARCH64_EXTRA_TUNING_OPTION ("early_clobber_sve_pred_dest", > EARLY_CLOBBER_SVE_PRED_DEST) > > Sorry for the bike-shedding, but how about something like "avoid_pred_rmw"? > (I'm open to other suggestions.) Just looking for something that describes > either the architecture or the end result that we want to achieve. 
> And preferable something fairly short :) > > avoid_* would be consistent with the existing "avoid_cross_loop_fma". Sure, happy to, it's something we initially struggled with naming internally as well. It sounds there's precedence so the avoid_ naming, so happy to use this naming. Will respin with it. Thanks, Tamar > > > + > > #undef AARCH64_EXTRA_TUNING_OPTION > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > > index > bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d5 > 6b46c74084ba7c3c 100644 > > --- a/gcc/config/aarch64/aarch64.h > > +++ b/gcc/config/aarch64/aarch64.h > > @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = > AARCH64_FL_SM_OFF; > > enabled through +gcs. */ > > #define TARGET_GCS (AARCH64_ISA_GCS) > > > > +/* Prefer different predicate registers for the output of a predicated > > operation > over > > +re-using an existing input predicate. */ > > +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ > > +&& (aarch64_tune_params.extra_tuning_flags \ > > +& > AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST)) > > > > /* Standard register usage. */ > > > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > > index > dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a > 53473b478c5ddba82 100644 > > --- a/gcc/config/aarch64/aarch64.md > > +++ b/gcc/config/aarch64/aarch64.md > > @@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string > "any")) > > ;; target-independent code. > > (define_attr "is_call" "no,yes" (const_string "no")) > > > > +;; Indicates whether we want to enable the pattern with an optional early > > +;; clobber for SVE predicates. > > +(define_attr "pred_clobber" "no,yes" (const_string "no")) > > + > > ;; [For compatibility with Arm in pipeline models] > > ;; Attribute that specifies whether or not the instruction touches fp > > ;; registers. 
> > @@ -461,7 +465,8 @@ (define_attr "fp" "no,yes" > > (define_attr "arch_enabled" "no,yes" > >(if_then_else > > (ior > > - (eq_attr "arch" "any") > > + (and (eq_attr "arch" "any") > > +(eq_attr "pred_clobber" "no")) > > > > (and (eq_attr "arch" "rcpc8_4") > > (match_test "AARCH64_ISA_RCPC8_4")) > > @@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes" > > (match_test "TARGET_SVE")) > > > > (and (eq_attr "arch" "sme") > > -(match_test "TARGET_SME"))) > > +(match_test "TARGET_SME")) > > + > > + (and (eq_attr "pred_clobber" "yes") > > +(match_test "TARGET_SVE_PRED_CLOBBER"))) > > IMO it'd be better to handle pred_clobber separately from arch, as a new > top-level AND: > > (and > (ior > (eq_attr "pred_clobber" "no") > (match_test "!TARGET_...")) > (ior > ...existing arch tests...)) > > Thanks, > Richard
[PATCH 3/4]AArch64: add new alternative with early clobber to patterns
Hi All, This patch adds new alternatives to the patterns which are affected. The new alternatives with the conditional early clobbers are added before the normal ones in order for LRA to prefer them in the event that we have enough free registers to accommodate them. In case register pressure is too high the normal alternatives will be preferred before a reload is considered, as we would rather have the tie than a spill. Tests are in the next patch. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-sve.md (and3, @aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, @aarch64_pred_cmp, *cmp_cc, *cmp_ptest, @aarch64_pred_cmp_wide, *aarch64_pred_cmp_wide_cc, *aarch64_pred_cmp_wide_ptest, @aarch64_brk, *aarch64_brk_cc, *aarch64_brk_ptest, @aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest, *aarch64_brk_cc, *aarch64_brk_ptest, aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Add new early clobber alternative. * config/aarch64/aarch64-sve2.md (@aarch64_pred_): Likewise. 
--- diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 839ab0627747d7a49bef7b0192ee9e7a42587ca0..93ec59e58afee260b85082c472db2abfea7386b6 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1161,8 +1161,9 @@ (define_insn "aarch64_rdffr_z" (reg:VNx16BI FFRT_REGNUM) (match_operand:VNx16BI 1 "register_operand")))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffr\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa; yes ] rdffr\t%0.b, %1/z + [ Upa , Upa; * ] ^ } ) @@ -1179,8 +1180,9 @@ (define_insn "*aarch64_rdffr_z_ptest" UNSPEC_PTEST)) (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 , 2 ] - [ Upa , Upa, ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 , 2; attrs: pred_clobber ] + [ , Upa, ; yes ] rdffrs\t%0.b, %1/z + [ Upa , Upa, ; * ] ^ } ) @@ -1195,8 +1197,9 @@ (define_insn "*aarch64_rdffr_ptest" UNSPEC_PTEST)) (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 ] - [ Upa , Upa ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 ; attrs: pred_clobber ] + [ , Upa; yes ] rdffrs\t%0.b, %1/z + [ Upa , Upa; * ] ^ } ) @@ -1216,8 +1219,9 @@ (define_insn "*aarch64_rdffr_z_cc" (reg:VNx16BI FFRT_REGNUM) (match_dup 1)))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 , 2 ] - [ Upa , Upa, ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 , 2; attrs: pred_clobber ] + [ , Upa, ; yes ] rdffrs\t%0.b, %1/z + [ Upa , Upa, ; * ] ^ } ) @@ -1233,8 +1237,9 @@ (define_insn "*aarch64_rdffr_cc" (set (match_operand:VNx16BI 0 "register_operand") (reg:VNx16BI FFRT_REGNUM))] "TARGET_SVE && TARGET_NON_STREAMING" - {@ [ cons: =0, 1 , 2 ] - [ Upa , Upa, ] rdffrs\t%0.b, %1/z + {@ [ cons: =0, 1 , 2; attrs: pred_clobber ] + [ , Upa, ; yes ] rdffrs\t%0.b, %1/z + [ Upa , Upa, ; * ] ^ } ) @@ -6651,8 +6656,9 @@ (define_insn "and3" (and:PRED_ALL (match_operand:PRED_ALL 1 "register_operand") (match_operand:PRED_ALL 2 
"register_operand")))] "TARGET_SVE" - {@ [ cons: =0, 1 , 2 ] - [ Upa , Upa, Upa ] and\t%0.b, %1/z, %2.b, %2.b + {@ [ cons: =0, 1 , 2 ; attrs: pred_clobber ] + [ , Upa, Upa; yes ] and\t%0.b, %1/z, %2.b, %2.b + [ Upa , Upa, Upa; * ] ^ } ) @@ -6679,8 +6685,9 @@ (define_insn "@aarch64_pred__z" (match_operand:PRED_ALL 3 "register_operand")) (match_operand:PRED_ALL 1 "register_operand")))] "TARGET_SVE" - {@ [ cons: =0, 1 , 2 , 3 ] - [ Upa , Upa, Upa, Upa ] \t%0.b, %1/z, %2.b, %3.b + {@ [ cons: =0, 1 , 2 , 3 ; attrs: pred_clobber ] + [ , Upa, Upa, Upa; yes ] \t%0.b, %1/z, %2.b, %3.b + [ Upa , Upa, Upa, Upa; * ] ^ } ) @@ -6703,8 +6710,9 @@ (define_insn "*3_cc" (and:PRED_ALL (LOGICAL:PRED_ALL (match_dup 2) (match_dup 3)) (match_dup 4)))] "TARGET_SVE" - {@ [ cons: =0, 1 , 2 , 3 , 4, 5 ] - [ Upa , Upa, Upa, Upa, , ] s\t%0.b, %1/z, %2.b, %3.b + {@ [ cons: =0, 1 , 2 , 3 , 4, 5; attrs: pred_clobber ] + [ , Upa, Upa, Upa, , ; yes ] s\t%0.b, %1/z, %2.b, %3.b +
[PATCH 4/4]AArch64: enable new predicate tuning for Neoverse cores.
Hi All, This enables the new tuning flag for Neoverse V1, Neoverse V2 and Neoverse N2. It is kept off for generic codegen. Note the reason for the +sve even though they are in aarch64-sve.exp is if the testsuite is ran with a forced SVE off option, e.g. -march=armv8-a+nosve then the intrinsics end up being disabled because the -march is preferred over the -mcpu even though the -mcpu comes later. This prevents the tests from failing in such runs. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/tuning_models/neoversen2.h (neoversen2_tunings): Add AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST. * config/aarch64/tuning_models/neoversev1.h (neoversev1_tunings): Add AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST. * config/aarch64/tuning_models/neoversev2.h (neoversev2_tunings): Add AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST. gcc/testsuite/ChangeLog: * gcc.target/aarch64/sve/pred_clobber_1.c: New test. * gcc.target/aarch64/sve/pred_clobber_2.c: New test. * gcc.target/aarch64/sve/pred_clobber_3.c: New test. * gcc.target/aarch64/sve/pred_clobber_4.c: New test. * gcc.target/aarch64/sve/pred_clobber_5.c: New test. --- diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h index 7e799bbe762fe862e31befed50e54040a7fd1f2f..0d8f3f6be67f3583b00473bef97ea3ae4fcea4ec 100644 --- a/gcc/config/aarch64/tuning_models/neoversen2.h +++ b/gcc/config/aarch64/tuning_models/neoversen2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversen2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. 
*/ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h index 9363f2ad98a5279cc99f2f9b1509ba921d582e84..d28d0b1c0498ed250b0a93ca69720fe10c65c93d 100644 --- a/gcc/config/aarch64/tuning_models/neoversev1.h +++ b/gcc/config/aarch64/tuning_models/neoversev1.h @@ -227,7 +227,8 @@ static const struct tune_params neoversev1_tunings = (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT - | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ + | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND + | AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS/* stp_policy_model. */ diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h index bc01ed767c9b690504eb98456402df5d9d64eee3..3b2f9797bd777e73ca9c21501fa97448d96cb65e 100644 --- a/gcc/config/aarch64/tuning_models/neoversev2.h +++ b/gcc/config/aarch64/tuning_models/neoversev2.h @@ -236,7 +236,8 @@ static const struct tune_params neoversev2_tunings = (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS - | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),/* tune_flags. */ + | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT + | AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST), /* tune_flags. */ _prefetch_tune, AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */ AARCH64_LDP_STP_POLICY_ALWAYS /* stp_policy_model. 
*/ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c new file mode 100644 index ..934a00a38531c5fd4139d99ff33414904b2c104f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_1.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=neoverse-n2" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#pragma GCC target "+sve" + +#include + +extern void use(svbool_t); + +/* +** foo: +** ... +** ptrue p([1-9][0-9]?).b, all +** cmplo p0.h, p\1/z, z0.h, z[0-9]+.h +** ... +*/ +void foo (svuint16_t a, uint16_t b) +{ +svbool_t p0 = svcmplt_n_u16 (svptrue_b16 (), a, b); +use (p0); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c new file mode 100644 index ..58badb66a43b1ac50eeec153b9cac44fc831b145 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred_clobber_2.c @@ -0,0 +1,22 @@ +/* { dg-do compile
[PATCH 1/4]AArch64: convert several predicate patterns to new compact syntax
Hi All, This converts the single alternative patterns to the new compact syntax such that when I add the new alternatives it's clearer what's being changed. Note that this will spew out a bunch of warnings from geninsn as it'll warn that @ is useless for a single alternative pattern. These are not fatal so won't break the build and are only temporary. No change in functionality is expected with this patch. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-sve.md (and3, @aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, aarch64_pred__z, *3_cc, *3_ptest, *cmp_ptest, @aarch64_pred_cmp_wide, *aarch64_pred_cmp_wide_cc, *aarch64_pred_cmp_wide_ptest, *aarch64_brk_cc, *aarch64_brk_ptest, @aarch64_brk, *aarch64_brkn_cc, *aarch64_brkn_ptest, *aarch64_brk_cc, *aarch64_brk_ptest, aarch64_rdffr_z, *aarch64_rdffr_z_ptest, *aarch64_rdffr_ptest, *aarch64_rdffr_z_cc, *aarch64_rdffr_cc): Convert to compact syntax. * config/aarch64/aarch64-sve2.md (@aarch64_pred_): Likewise. --- diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 0434358122d2fde71bd0e0f850338e739e9be02c..839ab0627747d7a49bef7b0192ee9e7a42587ca0 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -1156,76 +1156,86 @@ (define_insn "aarch64_rdffr" ;; Likewise with zero predication. (define_insn "aarch64_rdffr_z" - [(set (match_operand:VNx16BI 0 "register_operand" "=Upa") + [(set (match_operand:VNx16BI 0 "register_operand") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) - (match_operand:VNx16BI 1 "register_operand" "Upa")))] + (match_operand:VNx16BI 1 "register_operand")))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffr\t%0.b, %1/z" + {@ [ cons: =0, 1 ] + [ Upa , Upa ] rdffr\t%0.b, %1/z + } ) ;; Read the FFR to test for a fault, without using the predicate result. 
(define_insn "*aarch64_rdffr_z_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 2 "aarch64_sve_ptrue_flag") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1))] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 , 2 ] + [ Upa , Upa, ] rdffrs\t%0.b, %1/z + } ) ;; Same for unpredicated RDFFR when tested with a known PTRUE. (define_insn "*aarch64_rdffr_ptest" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (const_int SVE_KNOWN_PTRUE) (reg:VNx16BI FFRT_REGNUM)] UNSPEC_PTEST)) - (clobber (match_scratch:VNx16BI 0 "=Upa"))] + (clobber (match_scratch:VNx16BI 0))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 ] + [ Upa , Upa ] rdffrs\t%0.b, %1/z + } ) ;; Read the FFR with zero predication and test the result. (define_insn "*aarch64_rdffr_z_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (match_operand:SI 2 "aarch64_sve_ptrue_flag") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1))] UNSPEC_PTEST)) - (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (set (match_operand:VNx16BI 0 "register_operand") (and:VNx16BI (reg:VNx16BI FFRT_REGNUM) (match_dup 1)))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 , 2 ] + [ Upa , Upa, ] rdffrs\t%0.b, %1/z + } ) ;; Same for unpredicated RDFFR when tested with a known PTRUE. 
(define_insn "*aarch64_rdffr_cc" [(set (reg:CC_NZC CC_REGNUM) (unspec:CC_NZC - [(match_operand:VNx16BI 1 "register_operand" "Upa") + [(match_operand:VNx16BI 1 "register_operand") (match_dup 1) (const_int SVE_KNOWN_PTRUE) (reg:VNx16BI FFRT_REGNUM)] UNSPEC_PTEST)) - (set (match_operand:VNx16BI 0 "register_operand" "=Upa") + (set (match_operand:VNx16BI 0 "register_operand") (reg:VNx16BI FFRT_REGNUM))] "TARGET_SVE && TARGET_NON_STREAMING" - "rdffrs\t%0.b, %1/z" + {@ [ cons: =0, 1 , 2 ] + [ Upa , Upa, ] rdffrs\t%0.b, %1/z + } ) ;; [R3 in the block comment above about FFR handling] @@ -6637,11 +6647,13 @@ (define_insn
[PATCH 2/4] AArch64: add new tuning param and attribute for enabling conditional early clobber
Hi All, This adds a new tuning parameter EARLY_CLOBBER_SVE_PRED_DEST for AArch64 to allow us to conditionally enable the early clobber alternatives based on the tuning models. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-tuning-flags.def (EARLY_CLOBBER_SVE_PRED_DEST): New. * config/aarch64/aarch64.h (TARGET_SVE_PRED_CLOBBER): New. * config/aarch64/aarch64.md (pred_clobber): New. (arch_enabled): Use it. --- diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index d5bcaebce770f0b217aac783063d39135f754c77..49fbad3ff28bc82b25c61ac501ccf533ec4b4c3f 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA) AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA) +/* Enable is the target prefers to use a fresh register for predicate outputs + rather than re-use an input predicate register. */ +AARCH64_EXTRA_TUNING_OPTION ("early_clobber_sve_pred_dest", EARLY_CLOBBER_SVE_PRED_DEST) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d56b46c74084ba7c3c 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF; enabled through +gcs. */ #define TARGET_GCS (AARCH64_ISA_GCS) +/* Prefer different predicate registers for the output of a predicated operation over +re-using an existing input predicate. */ +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ +&& (aarch64_tune_params.extra_tuning_flags \ +& AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST)) /* Standard register usage. 
*/ diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index dbde066f7478bec51a8703b017ea553aa98be309..1ecd1a2812969504bd5114a53473b478c5ddba82 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -445,6 +445,10 @@ (define_enum_attr "arch" "arches" (const_string "any")) ;; target-independent code. (define_attr "is_call" "no,yes" (const_string "no")) +;; Indicates whether we want to enable the pattern with an optional early +;; clobber for SVE predicates. +(define_attr "pred_clobber" "no,yes" (const_string "no")) + ;; [For compatibility with Arm in pipeline models] ;; Attribute that specifies whether or not the instruction touches fp ;; registers. @@ -461,7 +465,8 @@ (define_attr "fp" "no,yes" (define_attr "arch_enabled" "no,yes" (if_then_else (ior - (eq_attr "arch" "any") + (and (eq_attr "arch" "any") +(eq_attr "pred_clobber" "no")) (and (eq_attr "arch" "rcpc8_4") (match_test "AARCH64_ISA_RCPC8_4")) @@ -488,7 +493,10 @@ (define_attr "arch_enabled" "no,yes" (match_test "TARGET_SVE")) (and (eq_attr "arch" "sme") -(match_test "TARGET_SME"))) +(match_test "TARGET_SME")) + + (and (eq_attr "pred_clobber" "yes") +(match_test "TARGET_SVE_PRED_CLOBBER"))) (const_string "yes") (const_string "no"))) -- diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index d5bcaebce770f0b217aac783063d39135f754c77..49fbad3ff28bc82b25c61ac501ccf533ec4b4c3f 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -48,4 +48,8 @@ AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA) AARCH64_EXTRA_TUNING_OPTION ("fully_pipelined_fma", FULLY_PIPELINED_FMA) +/* Enable is the target prefers to use a fresh register for predicate outputs + rather than re-use an input predicate register. 
*/ +AARCH64_EXTRA_TUNING_OPTION ("early_clobber_sve_pred_dest", EARLY_CLOBBER_SVE_PRED_DEST) + #undef AARCH64_EXTRA_TUNING_OPTION diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index bbf11faaf4b4340956094a983f8b0dc2649b2d27..76a18dd511f40ebb58ed12d56b46c74084ba7c3c 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -495,6 +495,11 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE = AARCH64_FL_SM_OFF; enabled through +gcs. */ #define TARGET_GCS (AARCH64_ISA_GCS) +/* Prefer different predicate registers for the output of a predicated operation over +re-using an existing input predicate. */ +#define TARGET_SVE_PRED_CLOBBER (TARGET_SVE \ + && (aarch64_tune_params.extra_tuning_flags \ + & AARCH64_EXTRA_TUNE_EARLY_CLOBBER_SVE_PRED_DEST)) /* Standard register usage. */ diff --git a/gcc/config/aarch64/aarch64.md
[PATCH 0/4] AArch64: support conditional early clobbers on certain operations.
Hi All, Some Neoverse Software Optimization Guides (SWoG) have a clause that states that for predicated operations that also produce a predicate it is preferred that the codegen should use a different register for the destination than that of the input predicate in order to avoid a performance overhead. This of course has the problem that it increases register pressure and so should be done with care. Additionally not all micro-architectures have this consideration and so it shouldn't be done as a default thing. The patch series adds support for doing conditional early clobbers through a combination of new alternatives and attributes to control their availability. On high register pressure we also use LRA's costing to prefer not to use the alternative and instead just use the tie as this is preferable to a reload. Concretely this patch series does: > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 foo: mov z31.h, w0 ptrue p3.b, all cmplo p0.h, p3/z, z0.h, z31.h b use > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n1+sve foo: mov z31.h, w0 ptrue p0.b, all cmplo p0.h, p0/z, z0.h, z31.h b use > aarch64-none-elf-gcc -O3 -g0 -S -o - pred-clobber.c -mcpu=neoverse-n2 > -ffixed-p[1-15] foo: mov z31.h, w0 ptrue p0.b, all cmplo p0.h, p0/z, z0.h, z31.h b use Testcases for the changes are in the last patch of the series. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Thanks, Tamar --- --
RE: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int
Hi Pan, Thanks! > -Original Message- > From: pan2...@intel.com > Sent: Wednesday, May 15, 2024 3:14 AM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > ; richard.guent...@gmail.com; > hongtao@intel.com; Pan Li > Subject: [PATCH v5 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar > int > > From: Pan Li > > This patch would like to add the middle-end presentation for the > saturation add. Aka set the result of add to the max when overflow. > It will take the pattern similar as below. > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Take uint8_t as example, we will have: > > * SAT_ADD (1, 254) => 255. > * SAT_ADD (1, 255) => 255. > * SAT_ADD (2, 255) => 255. > * SAT_ADD (255, 255) => 255. > > Given below example for the unsigned scalar integer uint64_t: > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > { > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > } > > Before this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > _Bool _2; > long unsigned int _3; > long unsigned int _4; > uint64_t _7; > long unsigned int _10; > __complex__ long unsigned int _11; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > _1 = REALPART_EXPR <_11>; > _10 = IMAGPART_EXPR <_11>; > _2 = _10 != 0; > _3 = (long unsigned int) _2; > _4 = -_3; > _7 = _1 | _4; > return _7; > ;;succ: EXIT > > } > > After this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > uint64_t _7; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > return _7; > ;;succ: EXIT > } > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > to the return true switch case(s). 
> * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > * match.pd: Add unsigned SAT_ADD match(es). > * optabs.def (OPTAB_NL): Remove fixed-point limitation for > us/ssadd. > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New > extern func decl generated in match.pd match. > (match_saturation_arith): New func impl to match the saturation arith. > (math_opts_dom_walker::after_dom_children): Try match saturation > arith when IOR expr. > LGTM but you'll need an OK from Richard, Thanks for working on this! Tamar > Signed-off-by: Pan Li > --- > gcc/internal-fn.cc| 1 + > gcc/internal-fn.def | 2 ++ > gcc/match.pd | 51 +++ > gcc/optabs.def| 4 +-- > gcc/tree-ssa-math-opts.cc | 32 > 5 files changed, 88 insertions(+), 2 deletions(-) > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 0a7053c2286..73045ca8c8c 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > case IFN_UBSAN_CHECK_MUL: > case IFN_ADD_OVERFLOW: > case IFN_MUL_OVERFLOW: > +case IFN_SAT_ADD: > case IFN_VEC_WIDEN_PLUS: > case IFN_VEC_WIDEN_PLUS_LO: > case IFN_VEC_WIDEN_PLUS_HI: > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 848bb9dbff3..25badbb86e5 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST > | ECF_NOTHROW, first, > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, > first, > smulhrs, umulhrs, binary) > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, > binary) > + > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > diff --git a/gcc/match.pd b/gcc/match.pd > index 07e743ae464..0f9c34fa897 100644 > --- a/gcc/match.pd > +++ b/gcc/match.pd > @@ -3043,6 +3043,57 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) > || POINTER_TYPE_P (itype)) >&& wi::eq_p (wi::to_wide 
(int_cst), wi::max_value (itype)) > > +/* Unsigned Saturation Add */ > +(match (usadd_left_part_1 @0 @1) > + (plus:c @0 @1) > + (if (INTEG
RE: [PATCH v1 1/3] Vect: Support loop len in vectorizable early exit
> -Original Message- > From: pan2...@intel.com > Sent: Monday, May 13, 2024 3:54 PM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; > Tamar Christina ; Richard Sandiford > ; Pan Li > Subject: [PATCH v1 1/3] Vect: Support loop len in vectorizable early exit > > From: Pan Li > > This patch adds early break auto-vectorization support for target which > use length on partial vectorization. Consider this following example: > > unsigned vect_a[802]; > unsigned vect_b[802]; > > void test (unsigned x, int n) > { > for (int i = 0; i < n; i++) > { > vect_b[i] = x + i; > > if (vect_a[i] > x) > break; > > vect_a[i] = x; > } > } > > We use VCOND_MASK_LEN to simulate the generate (mask && i < len + bias). > And then the IR of RVV looks like below: > > ... > _87 = .SELECT_VL (ivtmp_85, POLY_INT_CST [32, 32]); > _55 = (int) _87; > ... > mask_patt_6.13_69 = vect_cst__62 < vect__3.12_67; > vec_len_mask_72 = .VCOND_MASK_LEN (mask_patt_6.13_69, { -1, ... }, \ > {0, ... }, _87, 0); > if (vec_len_mask_72 != { 0, ... }) > goto ; [5.50%] > else > goto ; [94.50%] > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 2. The aarch64 fully regression tests. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > gcc/ChangeLog: > > * tree-vect-stmts.cc (vectorizable_early_exit): Add loop len > handling for one or multiple stmt. 
> > Signed-off-by: Pan Li > --- > gcc/tree-vect-stmts.cc | 47 - > - > 1 file changed, 45 insertions(+), 2 deletions(-) > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 21e8fe98e44..bfd9d66568f 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -12896,7 +12896,9 @@ vectorizable_early_exit (vec_info *vinfo, > stmt_vec_info stmt_info, > ncopies = vect_get_num_copies (loop_vinfo, vectype); > >vec_loop_masks *masks = _VINFO_MASKS (loop_vinfo); > + vec_loop_lens *lens = _VINFO_LENS (loop_vinfo); >bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); > + bool len_loop_p = LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo); > >/* Now build the new conditional. Pattern gimple_conds get dropped during > codegen so we must replace the original insn. */ > @@ -12960,12 +12962,11 @@ vectorizable_early_exit (vec_info *vinfo, > stmt_vec_info stmt_info, > { > if (direct_internal_fn_supported_p (IFN_VCOND_MASK_LEN, vectype, > OPTIMIZE_FOR_SPEED)) > - return false; > + vect_record_loop_len (loop_vinfo, lens, ncopies, vectype, 1); > else > vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, NULL); > } > > - >return true; > } > > @@ -13018,6 +13019,25 @@ vectorizable_early_exit (vec_info *vinfo, > stmt_vec_info stmt_info, > stmts[i], _gsi); > workset.quick_push (stmt_mask); > } > + else if (len_loop_p) > + for (unsigned i = 0; i < stmts.length (); i++) > + { > + tree all_ones_mask = build_all_ones_cst (vectype); > + tree all_zero_mask = build_zero_cst (vectype); > + tree len = vect_get_loop_len (loop_vinfo, gsi, lens, ncopies, > + vectype, i, 1); > + signed char cst = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS > (loop_vinfo); > + tree bias = build_int_cst (intQI_type_node, cst); > + tree len_mask > + = make_temp_ssa_name (TREE_TYPE (stmts[i]), NULL, > "vec_len_mask"); > + gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, > + stmts[i], all_ones_mask, > + all_zero_mask, len, bias); > + gimple_call_set_lhs (call, len_mask); > + 
gsi_insert_before (_gsi, call, GSI_SAME_STMT); > + > + workset.quick_push (len_mask); > + } >else > workset.splice (stmts); > > @@ -13042,6 +13062,29 @@ vectorizable_early_exit (vec_info *vinfo, > stmt_vec_info stmt_info, > new_temp = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, > new_temp, _gsi); > } > + else if (len_loop_p) > + { > + /* len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias) > + > + which is equivalent to: > + > + len_mask
RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int
> > Thanks Tamer for comments. > > > I think OPTIMIZE_FOR_BOTH is better here, since this is a win also when > optimizing for size. > > Sure thing, let me update it in v5. > > > Hmm why do you iterate independently over the statements? The block below > already visits > > Every statement doesn't it? > > Because it will hit .ADD_OVERFLOW first, then it will never hit SAT_ADD as the > shape changed, or shall we put it to the previous pass ? > That's just a matter of matching the overflow as an additional case no? i.e. you can add an overload for unsigned_integer_sat_add matching the IFN_ ADD_OVERFLOW and using the realpart and imagpart helpers. I think that would be better as it avoid visiting all the statements twice but also extends the matching to some __builtin_add_overflow uses and should be fairly simple. > > The root of your match is a BIT_IOR_EXPR expression, so I think you just > > need to > change the entry below to: > > > > case BIT_IOR_EXPR: > > match_saturation_arith (, stmt, m_cfg_changed_p); > > /* fall-through */ > > case BIT_XOR_EXPR: > > match_uaddc_usubc (, stmt, code); > > break; > > There are other shapes (not covered in this patch) of SAT_ADD like below > branch > version, the IOR should be one of the ROOT. Thus doesn't > add case here. Then, shall we take case for each shape here ? Both works for > me. > Yeah, I think that's better than iterating over the statements twice. It also fits better In the existing code. Tamar. > #define SAT_ADD_U_1(T) \ > T sat_add_u_1_##T(T x, T y) \ > { \ > return (T)(x + y) >= x ? 
(x + y) : -1; \ > } > > SAT_ADD_U_1(uint32_t) > > Pan > > > -Original Message- > From: Tamar Christina > Sent: Monday, May 13, 2024 5:10 PM > To: Li, Pan2 ; gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; > Liu, Hongtao > Subject: RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar int > > Hi Pan, > > > -Original Message- > > From: pan2...@intel.com > > Sent: Monday, May 6, 2024 3:48 PM > > To: gcc-patches@gcc.gnu.org > > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > > ; richard.guent...@gmail.com; > > hongtao@intel.com; Pan Li > > Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar > > int > > > > From: Pan Li > > > > This patch would like to add the middle-end presentation for the > > saturation add. Aka set the result of add to the max when overflow. > > It will take the pattern similar as below. > > > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > > > Take uint8_t as example, we will have: > > > > * SAT_ADD (1, 254) => 255. > > * SAT_ADD (1, 255) => 255. > > * SAT_ADD (2, 255) => 255. > > * SAT_ADD (255, 255) => 255. 
> > > > Given below example for the unsigned scalar integer uint64_t: > > > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > > { > > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > > } > > > > Before this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > long unsigned int _1; > > _Bool _2; > > long unsigned int _3; > > long unsigned int _4; > > uint64_t _7; > > long unsigned int _10; > > __complex__ long unsigned int _11; > > > > ;; basic block 2, loop depth 0 > > ;;pred: ENTRY > > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > > _1 = REALPART_EXPR <_11>; > > _10 = IMAGPART_EXPR <_11>; > > _2 = _10 != 0; > > _3 = (long unsigned int) _2; > > _4 = -_3; > > _7 = _1 | _4; > > return _7; > > ;;succ: EXIT > > > > } > > > > After this patch: > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > uint64_t _7; > > > > ;; basic block 2, loop depth 0 > > ;;pred: ENTRY > > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > > return _7; > > ;;succ: EXIT > > } > > > > We perform the tranform during widen_mult because that the sub-expr of > > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > > .SAT_ADD. Meanwhile, the isel pass is after widen_mult and then we > > cannot perform the .SAT_ADD pattern match as the
RE: [PATCH] Allow patterns in SLP reductions
> -Original Message- > From: Richard Biener > Sent: Friday, May 10, 2024 2:07 PM > To: Richard Biener > Cc: gcc-patches@gcc.gnu.org > Subject: Re: [PATCH] Allow patterns in SLP reductions > > On Fri, Mar 1, 2024 at 10:21 AM Richard Biener wrote: > > > > The following removes the over-broad rejection of patterns for SLP > > reductions which is done by removing them from LOOP_VINFO_REDUCTIONS > > during pattern detection. That's also insufficient in case the > > pattern only appears on the reduction path. Instead this implements > > the proper correctness check in vectorizable_reduction and guides > > SLP discovery to heuristically avoid forming later invalid groups. > > > > I also couldn't find any testcase that FAILs when allowing the SLP > > reductions to form so I've added one. > > > > I came across this for single-lane SLP reductions with the all-SLP > > work where we rely on patterns to properly vectorize COND_EXPR > > reductions. > > > > Bootstrapped and tested on x86_64-unknown-linux-gnu, queued for stage1. > > Re-bootstrapped/tested, r15-361-g52d4691294c847 Awesome! Does this now allow us to write new reductions using patterns? i.e. widening reductions? Cheers, Tamar > > Richard. > > > Richard. > > > > * tree-vect-patterns.cc (vect_pattern_recog_1): Do not > > remove reductions involving patterns. > > * tree-vect-loop.cc (vectorizable_reduction): Reject SLP > > reduction groups with multiple lane-reducing reductions. > > * tree-vect-slp.cc (vect_analyze_slp_instance): When discovering > > SLP reduction groups avoid including lane-reducing ones. > > > > * gcc.dg/vect/vect-reduc-sad-9.c: New testcase. 
> > --- > > gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c | 68 > > gcc/tree-vect-loop.cc| 15 + > > gcc/tree-vect-patterns.cc| 13 > > gcc/tree-vect-slp.cc | 26 +--- > > 4 files changed, 101 insertions(+), 21 deletions(-) > > create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c > > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c > b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c > > new file mode 100644 > > index 000..3c6af4510f4 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-sad-9.c > > @@ -0,0 +1,68 @@ > > +/* Disabling epilogues until we find a better way to deal with scans. */ > > +/* { dg-additional-options "--param vect-epilogues-nomask=0" } */ > > +/* { dg-additional-options "-msse4.2" { target { x86_64-*-* i?86-*-* } } } > > */ > > +/* { dg-require-effective-target vect_usad_char } */ > > + > > +#include > > +#include "tree-vect.h" > > + > > +#define N 64 > > + > > +unsigned char X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); > > +unsigned char Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__))); > > +int abs (int); > > + > > +/* Sum of absolute differences between arrays of unsigned char types. > > + Detected as a sad pattern. > > + Vectorized on targets that support sad for unsigned chars. */ > > + > > +__attribute__ ((noinline)) int > > +foo (int len, int *res2) > > +{ > > + int i; > > + int result = 0; > > + int result2 = 0; > > + > > + for (i = 0; i < len; i++) > > +{ > > + /* Make sure we are not using an SLP reduction for this. 
*/ > > + result += abs (X[2*i] - Y[2*i]); > > + result2 += abs (X[2*i + 1] - Y[2*i + 1]); > > +} > > + > > + *res2 = result2; > > + return result; > > +} > > + > > + > > +int > > +main (void) > > +{ > > + int i; > > + int sad; > > + > > + check_vect (); > > + > > + for (i = 0; i < N/2; i++) > > +{ > > + X[2*i] = i; > > + Y[2*i] = N/2 - i; > > + X[2*i+1] = i; > > + Y[2*i+1] = 0; > > + __asm__ volatile (""); > > +} > > + > > + > > + int sad2; > > + sad = foo (N/2, ); > > + if (sad != (N/2)*(N/4)) > > +abort (); > > + if (sad2 != (N/2-1)*(N/2)/2) > > +abort (); > > + > > + return 0; > > +} > > + > > +/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } > > } */ > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > > + > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index 35f1f8c7d42..13dcdba403a 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -7703,6 +7703,21 @@ vectorizable_reduction (loop_vec_info loop_vinfo, > >return false; > > } > > > > + /* Lane-reducing ops also never can be used in a SLP reduction group > > + since we'll mix lanes belonging to different reductions. But it's > > + OK to use them in a reduction chain or when the reduction group > > + has just one element. */ > > + if (lane_reduc_code_p > > + && slp_node > > + && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) > > + && SLP_TREE_LANES (slp_node) > 1) > > +{ > > + if (dump_enabled_p ()) > > + dump_printf_loc
RE: [PATCH v4 2/3] VECT: Support new IFN SAT_ADD for unsigned vector int
Hi Pan, > -Original Message- > From: pan2...@intel.com > Sent: Monday, May 6, 2024 3:49 PM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > ; richard.guent...@gmail.com; > hongtao@intel.com; Pan Li > Subject: [PATCH v4 2/3] VECT: Support new IFN SAT_ADD for unsigned vector int > > From: Pan Li > > This patch depends on below scalar enabling patch: > > https://gcc.gnu.org/pipermail/gcc-patches/2024-May/650822.html > > For vectorize, we leverage the existing vect pattern recog to find > the pattern similar to scalar and let the vectorizer to perform > the rest part for standard name usadd3 in vector mode. > The riscv vector backend have insn "Vector Single-Width Saturating > Add and Subtract" which can be leveraged when expand the usadd3 > in vector mode. For example: > > void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n) > { > unsigned i; > > for (i = 0; i < n; i++) > out[i] = (x[i] + y[i]) | (- (uint64_t)((uint64_t)(x[i] + y[i]) < x[i])); > } > > Before this patch: > void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n) > { > ... > _80 = .SELECT_VL (ivtmp_78, POLY_INT_CST [2, 2]); > ivtmp_58 = _80 * 8; > vect__4.7_61 = .MASK_LEN_LOAD (vectp_x.5_59, 64B, { -1, ... }, _80, 0); > vect__6.10_65 = .MASK_LEN_LOAD (vectp_y.8_63, 64B, { -1, ... }, _80, 0); > vect__7.11_66 = vect__4.7_61 + vect__6.10_65; > mask__8.12_67 = vect__4.7_61 > vect__7.11_66; > vect__12.15_72 = .VCOND_MASK (mask__8.12_67, { 18446744073709551615, > ... }, vect__7.11_66); > .MASK_LEN_STORE (vectp_out.16_74, 64B, { -1, ... }, _80, 0, vect__12.15_72); > vectp_x.5_60 = vectp_x.5_59 + ivtmp_58; > vectp_y.8_64 = vectp_y.8_63 + ivtmp_58; > vectp_out.16_75 = vectp_out.16_74 + ivtmp_58; > ivtmp_79 = ivtmp_78 - _80; > ... > } > > After this patch: > void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n) > { > ... 
> _62 = .SELECT_VL (ivtmp_60, POLY_INT_CST [2, 2]); > ivtmp_46 = _62 * 8; > vect__4.7_49 = .MASK_LEN_LOAD (vectp_x.5_47, 64B, { -1, ... }, _62, 0); > vect__6.10_53 = .MASK_LEN_LOAD (vectp_y.8_51, 64B, { -1, ... }, _62, 0); > vect__12.11_54 = .SAT_ADD (vect__4.7_49, vect__6.10_53); > .MASK_LEN_STORE (vectp_out.12_56, 64B, { -1, ... }, _62, 0, vect__12.11_54); > ... > } > > The below test suites are passed for this patch. > * The riscv fully regression tests. > * The aarch64 fully regression tests. > * The x86 bootstrap tests. > * The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * tree-vect-patterns.cc (gimple_unsigned_integer_sat_add): New func > decl generated by match.pd match. > (vect_recog_sat_add_pattern): New func impl to recog the pattern > for unsigned SAT_ADD. > > Signed-off-by: Pan Li Patch looks good to me, but I cannot approve so I'll pass it on to Richi. Cheers, Tamar > --- > gcc/tree-vect-patterns.cc | 51 +++ > 1 file changed, 51 insertions(+) > > diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc > index 87c2acff386..8ffcaf71d5c 100644 > --- a/gcc/tree-vect-patterns.cc > +++ b/gcc/tree-vect-patterns.cc > @@ -4487,6 +4487,56 @@ vect_recog_mult_pattern (vec_info *vinfo, >return pattern_stmt; > } > > +extern bool gimple_unsigned_integer_sat_add (tree, tree*, tree (*)(tree)); > + > +/* > + * Try to detect saturation add pattern (SAT_ADD), aka below gimple: > + * _7 = _4 + _6; > + * _8 = _4 > _7; > + * _9 = (long unsigned int) _8; > + * _10 = -_9; > + * _12 = _7 | _10; > + * > + * And then simplied to > + * _12 = .SAT_ADD (_4, _6); > + */ > + > +static gimple * > +vect_recog_sat_add_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo, > + tree *type_out) > +{ > + gimple *last_stmt = STMT_VINFO_STMT (stmt_vinfo); > + > + if (!is_gimple_assign (last_stmt)) > +return NULL; > + > + tree res_ops[2]; > + tree lhs = gimple_assign_lhs (last_stmt); > + > + if (gimple_unsigned_integer_sat_add (lhs, res_ops, 
NULL)) > +{ > + tree itype = TREE_TYPE (res_ops[0]); > + tree vtype = get_vectype_for_scalar_type (vinfo, itype); > + > + if (vtype != NULL_TREE && direct_internal_fn_supported_p ( > + IFN_SAT_ADD, vtype, OPTIMIZE_FOR_SPEED)) > + { > + *type_out = vtype; > + gcall *call = gimple_build_call_internal (IFN_SAT_ADD, 2, res_ops[0], > + res_ops[1]); > +
RE: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned scalar int
Hi Pan, > -Original Message- > From: pan2...@intel.com > Sent: Monday, May 6, 2024 3:48 PM > To: gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; Tamar Christina > ; richard.guent...@gmail.com; > hongtao@intel.com; Pan Li > Subject: [PATCH v4 1/3] Internal-fn: Support new IFN SAT_ADD for unsigned > scalar > int > > From: Pan Li > > This patch would like to add the middle-end presentation for the > saturation add. Aka set the result of add to the max when overflow. > It will take the pattern similar as below. > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Take uint8_t as example, we will have: > > * SAT_ADD (1, 254) => 255. > * SAT_ADD (1, 255) => 255. > * SAT_ADD (2, 255) => 255. > * SAT_ADD (255, 255) => 255. > > Given below example for the unsigned scalar integer uint64_t: > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > { > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > } > > Before this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > _Bool _2; > long unsigned int _3; > long unsigned int _4; > uint64_t _7; > long unsigned int _10; > __complex__ long unsigned int _11; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > _1 = REALPART_EXPR <_11>; > _10 = IMAGPART_EXPR <_11>; > _2 = _10 != 0; > _3 = (long unsigned int) _2; > _4 = -_3; > _7 = _1 | _4; > return _7; > ;;succ: EXIT > > } > > After this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > uint64_t _7; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > return _7; > ;;succ: EXIT > } > > We perform the tranform during widen_mult because that the sub-expr of > SAT_ADD will be optimized to .ADD_OVERFLOW. We need to try the .SAT_ADD > pattern first and then .ADD_OVERFLOW, or we may never catch the pattern > .SAT_ADD. 
Meanwhile, the isel pass is after widen_mult and then we > cannot perform the .SAT_ADD pattern match as the sub-expr will be > optmized to .ADD_OVERFLOW first. > > The below tests are passed for this patch: > 1. The riscv fully regression tests. > 2. The aarch64 fully regression tests. > 3. The x86 bootstrap tests. > 4. The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * internal-fn.cc (commutative_binary_fn_p): Add type IFN_SAT_ADD > to the return true switch case(s). > * internal-fn.def (SAT_ADD): Add new signed optab SAT_ADD. > * match.pd: Add unsigned SAT_ADD match. > * optabs.def (OPTAB_NL): Remove fixed-point limitation for us/ssadd. > * tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_add): New extern > func decl generated in match.pd match. > (match_saturation_arith): New func impl to match the saturation arith. > (math_opts_dom_walker::after_dom_children): Try match saturation > arith. > > Signed-off-by: Pan Li > --- > gcc/internal-fn.cc| 1 + > gcc/internal-fn.def | 2 ++ > gcc/match.pd | 28 > gcc/optabs.def| 4 ++-- > gcc/tree-ssa-math-opts.cc | 46 > +++ > 5 files changed, 79 insertions(+), 2 deletions(-) > > diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc > index 0a7053c2286..73045ca8c8c 100644 > --- a/gcc/internal-fn.cc > +++ b/gcc/internal-fn.cc > @@ -4202,6 +4202,7 @@ commutative_binary_fn_p (internal_fn fn) > case IFN_UBSAN_CHECK_MUL: > case IFN_ADD_OVERFLOW: > case IFN_MUL_OVERFLOW: > +case IFN_SAT_ADD: > case IFN_VEC_WIDEN_PLUS: > case IFN_VEC_WIDEN_PLUS_LO: > case IFN_VEC_WIDEN_PLUS_HI: > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 848bb9dbff3..25badbb86e5 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -275,6 +275,8 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHS, ECF_CONST > | ECF_NOTHROW, first, > DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | ECF_NOTHROW, > first, > smulhrs, umulhrs, binary) > > +DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, 
usadd, > binary) > + > DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary) > DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary) > DEF_INTERNAL_COND_FN (MUL, ECF_CONST, smul, binary) > diff --git a/gcc/match.pd b/gcc/match.pd > index d401e7503e6..7058e4cbe29 100644 >
RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD
> > So he was responding for how to do it for the vectorizer and scalar parts. > > Remember that the goal is not to introduce new gimple IL that can block > > other > optimizations. > > The vectorizer already introduces new IL (various IFN) but this is fine as > > we don't > track things like ranges for > > vector instructions. So we don't loose any information here. > > > Now for the scalar, if we do an early replacement like in match.pd we > > prevent a > lot of other optimizations > > because they don't know what IFN_SAT_ADD does. gimple-isel runs pretty late, > and so at this point we don't > > expect many more optimizations to happen, so it's a safe spot to insert > > more IL > with "unknown semantics". > > > Was that your intention Richi? > > Thanks Tamar for clear explanation, does that mean both the scalar and vector > will > go isel approach? If so I may > misunderstand in previous that it is only for vectorize. No, The isel would only be for the scalar, The vectorizer will still use the vect_pattern. It needs to so we can cost the operation correctly, and in some cases depending on how the saturation is described you are unable the vectorize. The pattern allows us to catch these cases and still vectorize. But you should be able to use the same match.pd predicate for both the vectorizer pattern and isel. > > Understand the point that we would like to put the pattern match late but I > may > have a question here. > Given SAT_ADD related pattern is sort of complicated, it is possible that the > sub- > expression of SAT_ADD is optimized > In early pass by others and we can hardly catch the shapes later. > > For example, there is a plus expression in SAT_ADD, and in early pass it may > be > optimized to .ADD_OVERFLOW, and > then the pattern is quite different to aware of that in later pass. > Yeah, it looks like this transformation is done in widening_mul, which is the other place richi suggested to recognize SAT_ADD. 
widening_mul already runs quite late as well so it's also ok. If you put it there before the code that transforms the sequence to overflow it should work. Eventually we do need to recognize this variant since: uint64_t add_sat(uint64_t x, uint64_t y) noexcept { uint64_t z; if (!__builtin_add_overflow(x, y, )) return z; return -1u; } Is a valid and common way to do saturation too. But for now, it's fine. Cheers, Tamar > Sorry not sure if my understanding is correct, feel free to correct me. > > Pan > > -Original Message- > From: Tamar Christina > Sent: Thursday, May 2, 2024 11:26 AM > To: Li, Pan2 ; gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; > Liu, Hongtao > Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD > > > -Original Message- > > From: Li, Pan2 > > Sent: Thursday, May 2, 2024 4:11 AM > > To: Tamar Christina ; gcc-patches@gcc.gnu.org > > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; > > Liu, Hongtao > > Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD > > > > Thanks Tamar > > > > > Could you also split off the vectorizer change from scalar recog one? > > > Typically I > > would structure a change like this as: > > > > > 1. create types/structures + scalar recogn > > > 2. Vector recog code > > > 3. Backend changes > > > > Sure thing, will rearrange the patch like this. > > > > > Is ECF_NOTHROW correct here? At least on most targets I believe the scalar > > version > > > can set flags/throw exceptions if the saturation happens? > > > > I see, will remove that. > > > > > Hmm I believe Richi mentioned that he wanted the recognition done in isel? > > > > > The problem with doing it in match.pd is that it replaces the operations > > > quite > > > early the pipeline. Did I miss an email perhaps? The early replacement > > > means > we > > > lose optimizations and things such as range calculations etc, since e.g. 
> > > ranger > > > doesn't know these internal functions. > > > > > I think Richi will want this in islet or mult widening but I'll continue > > > with > match.pd > > > review just in case. > > > > If I understand is correct, Richard suggested try vectorizer patterns first > > and then > > possible isel. > > Thus, I don't have a try for SAT_ADD in ISEL as vectorizer patterns works > > well for > > SAT_ADD. > > Let's wait
RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD
> -Original Message- > From: Li, Pan2 > Sent: Thursday, May 2, 2024 4:11 AM > To: Tamar Christina ; gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; > Liu, Hongtao > Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD > > Thanks Tamar > > > Could you also split off the vectorizer change from scalar recog one? > > Typically I > would structure a change like this as: > > > 1. create types/structures + scalar recogn > > 2. Vector recog code > > 3. Backend changes > > Sure thing, will rearrange the patch like this. > > > Is ECF_NOTHROW correct here? At least on most targets I believe the scalar > version > > can set flags/throw exceptions if the saturation happens? > > I see, will remove that. > > > Hmm I believe Richi mentioned that he wanted the recognition done in isel? > > > The problem with doing it in match.pd is that it replaces the operations > > quite > > early the pipeline. Did I miss an email perhaps? The early replacement > > means we > > lose optimizations and things such as range calculations etc, since e.g. > > ranger > > doesn't know these internal functions. > > > I think Richi will want this in islet or mult widening but I'll continue > > with match.pd > > review just in case. > > If I understand is correct, Richard suggested try vectorizer patterns first > and then > possible isel. > Thus, I don't have a try for SAT_ADD in ISEL as vectorizer patterns works > well for > SAT_ADD. > Let's wait the confirmation from Richard. Below are the original words from > previous mail for reference. > I think the comment he made was this > > Given we have saturating integer alu like below, could you help to coach me > > the most reasonable way to represent > > It in scalar as well as vectorize part? Sorry not familiar with this part > > and still dig into how it works... > > As in your v2, .SAT_ADD for both sat_uadd and sat_sadd, similar for > the other cases. 
> > As I said, use vectorizer patterns and possibly do instruction > selection at ISEL/widen_mult time. So he was responding for how to do it for the vectorizer and scalar parts. Remember that the goal is not to introduce new gimple IL that can block other optimizations. The vectorizer already introduces new IL (various IFN) but this is fine as we don't track things like ranges for vector instructions. So we don't loose any information here. Now for the scalar, if we do an early replacement like in match.pd we prevent a lot of other optimizations because they don't know what IFN_SAT_ADD does. gimple-isel runs pretty late, and so at this point we don't expect many more optimizations to happen, so it's a safe spot to insert more IL with "unknown semantics". Was that your intention Richi? Thanks, Tamar > >> As I said, use vectorizer patterns and possibly do instruction > >> selection at ISEL/widen_mult time. > > > The optimize checks in the match.pd file are weird as it seems to check if > > we have > > optimizations enabled? > > > We don't typically need to do this. > > Sure, will remove this. > > > The function has only one caller, you should just inline it into the > > pattern. > > Sure thing. > > > Once you inline vect_sat_add_build_call you can do the check for > > vtype here, which is the cheaper check so perform it early. > > Sure thing. > > Thanks again and will send the v4 with all comments addressed, as well as the > test > results. > > Pan > > -Original Message- > From: Tamar Christina > Sent: Thursday, May 2, 2024 1:06 AM > To: Li, Pan2 ; gcc-patches@gcc.gnu.org > Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; > Liu, Hongtao > Subject: RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD > > Hi, > > > From: Pan Li > > > > Update in v3: > > * Rebase upstream for conflict. > > > > Update in v2: > > * Fix one failure for x86 bootstrap. 
> > > > Original log: > > > > This patch would like to add the middle-end presentation for the > > saturation add. Aka set the result of add to the max when overflow. > > It will take the pattern similar as below. > > > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > > > Take uint8_t as example, we will have: > > > > * SAT_ADD (1, 254) => 255. > > * SAT_ADD (1, 255) => 255. > > * SAT_ADD (2, 255) => 255. > > * SAT_ADD (255, 255) => 255. > > > > The p
RE: [PATCH v3] Internal-fn: Introduce new internal function SAT_ADD
Hi, > From: Pan Li > > Update in v3: > * Rebase upstream for conflict. > > Update in v2: > * Fix one failure for x86 bootstrap. > > Original log: > > This patch would like to add the middle-end presentation for the > saturation add. Aka set the result of add to the max when overflow. > It will take the pattern similar as below. > > SAT_ADD (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Take uint8_t as example, we will have: > > * SAT_ADD (1, 254) => 255. > * SAT_ADD (1, 255) => 255. > * SAT_ADD (2, 255) => 255. > * SAT_ADD (255, 255) => 255. > > The patch also implement the SAT_ADD in the riscv backend as > the sample for both the scalar and vector. Given below example: > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > { > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > } > > Before this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > long unsigned int _1; > _Bool _2; > long unsigned int _3; > long unsigned int _4; > uint64_t _7; > long unsigned int _10; > __complex__ long unsigned int _11; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > _1 = REALPART_EXPR <_11>; > _10 = IMAGPART_EXPR <_11>; > _2 = _10 != 0; > _3 = (long unsigned int) _2; > _4 = -_3; > _7 = _1 | _4; > return _7; > ;;succ: EXIT > > } > > After this patch: > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > { > uint64_t _7; > > ;; basic block 2, loop depth 0 > ;;pred: ENTRY > _7 = .SAT_ADD (x_5(D), y_6(D)); [tail call] > return _7; > ;;succ: EXIT > } > > For vectorize, we leverage the existing vect pattern recog to find > the pattern similar to scalar and let the vectorizer to perform > the rest part for standard name usadd3 in vector mode. > The riscv vector backend have insn "Vector Single-Width Saturating > Add and Subtract" which can be leveraged when expand the usadd3 > in vector mode. 
For example: > > void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n) > { > unsigned i; > > for (i = 0; i < n; i++) > out[i] = (x[i] + y[i]) | (- (uint64_t)((uint64_t)(x[i] + y[i]) < x[i])); > } > > Before this patch: > void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n) > { > ... > _80 = .SELECT_VL (ivtmp_78, POLY_INT_CST [2, 2]); > ivtmp_58 = _80 * 8; > vect__4.7_61 = .MASK_LEN_LOAD (vectp_x.5_59, 64B, { -1, ... }, _80, 0); > vect__6.10_65 = .MASK_LEN_LOAD (vectp_y.8_63, 64B, { -1, ... }, _80, 0); > vect__7.11_66 = vect__4.7_61 + vect__6.10_65; > mask__8.12_67 = vect__4.7_61 > vect__7.11_66; > vect__12.15_72 = .VCOND_MASK (mask__8.12_67, { 18446744073709551615, > ... }, vect__7.11_66); > .MASK_LEN_STORE (vectp_out.16_74, 64B, { -1, ... }, _80, 0, vect__12.15_72); > vectp_x.5_60 = vectp_x.5_59 + ivtmp_58; > vectp_y.8_64 = vectp_y.8_63 + ivtmp_58; > vectp_out.16_75 = vectp_out.16_74 + ivtmp_58; > ivtmp_79 = ivtmp_78 - _80; > ... > } > > vec_sat_add_u64: > ... > vsetvli a5,a3,e64,m1,ta,ma > vle64.v v0,0(a1) > vle64.v v1,0(a2) > sllia4,a5,3 > sub a3,a3,a5 > add a1,a1,a4 > add a2,a2,a4 > vadd.vv v1,v0,v1 > vmsgtu.vv v0,v0,v1 > vmerge.vim v1,v1,-1,v0 > vse64.v v1,0(a0) > ... > > After this patch: > void vec_sat_add_u64 (uint64_t *out, uint64_t *x, uint64_t *y, unsigned n) > { > ... > _62 = .SELECT_VL (ivtmp_60, POLY_INT_CST [2, 2]); > ivtmp_46 = _62 * 8; > vect__4.7_49 = .MASK_LEN_LOAD (vectp_x.5_47, 64B, { -1, ... }, _62, 0); > vect__6.10_53 = .MASK_LEN_LOAD (vectp_y.8_51, 64B, { -1, ... }, _62, 0); > vect__12.11_54 = .SAT_ADD (vect__4.7_49, vect__6.10_53); > .MASK_LEN_STORE (vectp_out.12_56, 64B, { -1, ... }, _62, 0, vect__12.11_54); > ... > } > > vec_sat_add_u64: > ... > vsetvli a5,a3,e64,m1,ta,ma > vle64.v v1,0(a1) > vle64.v v2,0(a2) > sllia4,a5,3 > sub a3,a3,a5 > add a1,a1,a4 > add a2,a2,a4 > vsaddu.vv v1,v1,v2 > vse64.v v1,0(a0) > ... 
> > To limit the patch size for review, only unsigned version of > usadd3 are involved here. The signed version will be covered > in the underlying patch(es). > > The below test suites are passed for this patch. > * The riscv fully regression tests. > * The aarch64 fully regression tests. > * The x86 bootstrap tests. > * The x86 fully regression tests. > > PR target/51492 > PR target/112600 > > gcc/ChangeLog: > > * config/riscv/autovec.md (usadd3): New pattern expand > for unsigned SAT_ADD vector. > * config/riscv/riscv-protos.h (riscv_expand_usadd): New func > decl to expand usadd3 pattern. > (expand_vec_usadd): Ditto but for vector. > * config/riscv/riscv-v.cc (emit_vec_saddu): New func impl to > emit the vsadd insn. > (expand_vec_usadd): New func impl to expand usadd3 for > vector. > * config/riscv/riscv.cc (riscv_expand_usadd): New func impl > to
[gcc r14-10040] middle-end: refactor vect_recog_absolute_difference to simplify flow [PR114769]
https://gcc.gnu.org/g:1216460e7023cd8ec49933866107417c70e933c9 commit r14-10040-g1216460e7023cd8ec49933866107417c70e933c9 Author: Tamar Christina Date: Fri Apr 19 15:22:13 2024 +0100 middle-end: refactory vect_recog_absolute_difference to simplify flow [PR114769] Hi All, As the reporter in PR114769 points out the control flow for the abd detection is hard to follow. This is because vect_recog_absolute_difference has two different ways it can return true. 1. It can return true when the widening operation is matched, in which case unprom is set, half_type is not NULL and diff_stmt is not set. 2. It can return true when the widening operation is not matched, but the stmt being checked is a minus. In this case unprom is not set, half_type is set to NULL and diff_stmt is set. This because to get to diff_stmt you have to dig through the abs statement and any possible promotions. This however leads to complicated uses of the function at the call sites as the exact semantic needs to be known to use it safely. vect_recog_absolute_difference has two callers: 1. vect_recog_sad_pattern where if you return true with unprom not set, then *half_type will be NULL. The call to vect_supportable_direct_optab_p will always reject it since there's no vector mode for NULL. Note that if looking at the dump files, the convention in the dump files have always been that we first indicate that a pattern could possibly be recognize and then check that it's supported. This change somewhat incorrectly makes the diagnostic message get printed for "invalid" patterns. 2. vect_recog_abd_pattern, where if half_type is NULL, it then uses diff_stmt to set them. This refactors the code, it now only has 1 success condition, and diff_stmt is always set to the minus statement in the abs if there is one. The function now only returns success if the widening minus is found, in which case unprom and half_type set. This then leaves it up to the caller to decide if they want to do anything with diff_stmt. 
Thanks, Tamar gcc/ChangeLog: PR tree-optimization/114769 * tree-vect-patterns.cc: (vect_recog_absolute_difference): Have only one success condition. (vect_recog_abd_pattern): Handle further checks if vect_recog_absolute_difference fails. Diff: --- gcc/tree-vect-patterns.cc | 43 --- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index 4f491c6b833..87c2acff386 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -797,8 +797,7 @@ vect_split_statement (vec_info *vinfo, stmt_vec_info stmt2_info, tree new_rhs, HALF_TYPE and UNPROM will be set should the statement be found to be a widened operation. DIFF_STMT will be set to the MINUS_EXPR - statement that precedes the ABS_STMT unless vect_widened_op_tree - succeeds. + statement that precedes the ABS_STMT if it is a MINUS_EXPR.. */ static bool vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt, @@ -843,6 +842,12 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt, if (!diff_stmt_vinfo) return false; + gassign *diff = dyn_cast (STMT_VINFO_STMT (diff_stmt_vinfo)); + if (diff_stmt && diff + && gimple_assign_rhs_code (diff) == MINUS_EXPR + && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd))) +*diff_stmt = diff; + /* FORNOW. Can continue analyzing the def-use chain when this stmt in a phi inside the loop (in case we are analyzing an outer-loop). */ if (vect_widened_op_tree (vinfo, diff_stmt_vinfo, @@ -850,17 +855,6 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt, false, 2, unprom, half_type)) return true; - /* Failed to find a widen operation so we check for a regular MINUS_EXPR. 
*/ - gassign *diff = dyn_cast (STMT_VINFO_STMT (diff_stmt_vinfo)); - if (diff_stmt && diff - && gimple_assign_rhs_code (diff) == MINUS_EXPR - && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd))) -{ - *diff_stmt = diff; - *half_type = NULL_TREE; - return true; -} - return false; } @@ -1499,27 +1493,22 @@ vect_recog_abd_pattern (vec_info *vinfo, tree out_type = TREE_TYPE (gimple_assign_lhs (last_stmt)); vect_unpromoted_value unprom[2]; - gassign *diff_stmt; - tree half_type; - if (!vect_recog_absolute_difference (vinfo, last_stmt, _type, + gassign *diff_stmt = NULL; + tree abd_in_type; + if (!vect_recog_absolute_difference (vinfo, last_stmt, _in_type, unprom, _st
[PATCH] middle-end: refactor vect_recog_absolute_difference to simplify flow [PR114769]
Hi All, As the reporter in PR114769 points out the control flow for the abd detection is hard to follow. This is because vect_recog_absolute_difference has two different ways it can return true. 1. It can return true when the widening operation is matched, in which case unprom is set, half_type is not NULL and diff_stmt is not set. 2. It can return true when the widening operation is not matched, but the stmt being checked is a minus. In this case unprom is not set, half_type is set to NULL and diff_stmt is set. This because to get to diff_stmt you have to dig through the abs statement and any possible promotions. This however leads to complicated uses of the function at the call sites as the exact semantic needs to be known to use it safely. vect_recog_absolute_difference has two callers: 1. vect_recog_sad_pattern where if you return true with unprom not set, then *half_type will be NULL. The call to vect_supportable_direct_optab_p will always reject it since there's no vector mode for NULL. Note that if looking at the dump files, the convention in the dump files have always been that we first indicate that a pattern could possibly be recognize and then check that it's supported. This change somewhat incorrectly makes the diagnostic message get printed for "invalid" patterns. 2. vect_recog_abd_pattern, where if half_type is NULL, it then uses diff_stmt to set them. So while the note in the dump file is misleading, the code is safe. This refactors the code, it now only has 1 success condition, and diff_stmt is always set to the minus statement in the abs if there is one. The function now only returns success if the widening minus is found, in which case unprom and half_type set. This then leaves it up to the caller to decide if they want to do anything with diff_stmt. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? 
Thanks, Tamar gcc/ChangeLog: PR tree-optimization/114769 * tree-vect-patterns.cc: (vect_recog_absolute_difference): Have only one success condition. (vect_recog_abd_pattern): Handle further checks if vect_recog_absolute_difference fails. --- diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc index 4f491c6b8336f8710c3519dec1fa7e0f49387d2b..87c2acff386d91d22a3b2d6e6443d1f2f2326ea6 100644 --- a/gcc/tree-vect-patterns.cc +++ b/gcc/tree-vect-patterns.cc @@ -797,8 +797,7 @@ vect_split_statement (vec_info *vinfo, stmt_vec_info stmt2_info, tree new_rhs, HALF_TYPE and UNPROM will be set should the statement be found to be a widened operation. DIFF_STMT will be set to the MINUS_EXPR - statement that precedes the ABS_STMT unless vect_widened_op_tree - succeeds. + statement that precedes the ABS_STMT if it is a MINUS_EXPR.. */ static bool vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt, @@ -843,6 +842,12 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt, if (!diff_stmt_vinfo) return false; + gassign *diff = dyn_cast (STMT_VINFO_STMT (diff_stmt_vinfo)); + if (diff_stmt && diff + && gimple_assign_rhs_code (diff) == MINUS_EXPR + && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd))) +*diff_stmt = diff; + /* FORNOW. Can continue analyzing the def-use chain when this stmt in a phi inside the loop (in case we are analyzing an outer-loop). */ if (vect_widened_op_tree (vinfo, diff_stmt_vinfo, @@ -850,17 +855,6 @@ vect_recog_absolute_difference (vec_info *vinfo, gassign *abs_stmt, false, 2, unprom, half_type)) return true; - /* Failed to find a widen operation so we check for a regular MINUS_EXPR. 
*/ - gassign *diff = dyn_cast (STMT_VINFO_STMT (diff_stmt_vinfo)); - if (diff_stmt && diff - && gimple_assign_rhs_code (diff) == MINUS_EXPR - && TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (abs_oprnd))) -{ - *diff_stmt = diff; - *half_type = NULL_TREE; - return true; -} - return false; } @@ -1499,27 +1493,22 @@ vect_recog_abd_pattern (vec_info *vinfo, tree out_type = TREE_TYPE (gimple_assign_lhs (last_stmt)); vect_unpromoted_value unprom[2]; - gassign *diff_stmt; - tree half_type; - if (!vect_recog_absolute_difference (vinfo, last_stmt, _type, + gassign *diff_stmt = NULL; + tree abd_in_type; + if (!vect_recog_absolute_difference (vinfo, last_stmt, _in_type, unprom, _stmt)) -return NULL; - - tree abd_in_type, abd_out_type; - - if (half_type) -{ - abd_in_type = half_type; - abd_out_type = abd_in_type; -} - else { + /* We cannot try further without having a non-widening MINUS. */ + if (!diff_stmt) + return NULL; + unprom[0].op = gimple_assign_rhs1 (diff_stmt); unprom[1].op = gimple_assign_rhs2 (diff_stmt); abd_in_type = signed_type_for (out_type); - abd_out_type = abd_in_type; } +
[gcc r14-10014] AArch64: remove reliance on register allocator for simd/gpreg costing. [PR114741]
https://gcc.gnu.org/g:a2f4be3dae04fa8606d1cc8451f0b9d450f7e6e6 commit r14-10014-ga2f4be3dae04fa8606d1cc8451f0b9d450f7e6e6 Author: Tamar Christina Date: Thu Apr 18 11:47:42 2024 +0100 AArch64: remove reliance on register allocator for simd/gpreg costing. [PR114741] In PR114741 we see that we have a regression in codegen when SVE is enable where the simple testcase: void foo(unsigned v, unsigned *p) { *p = v & 1; } generates foo: fmovs31, w0 and z31.s, z31.s, #1 str s31, [x1] ret instead of: foo: and w0, w0, 1 str w0, [x1] ret This causes an impact it not just codesize but also performance. This is caused by the use of the ^ constraint modifier in the pattern 3. The documentation states that this modifier should only have an effect on the alternative costing in that a particular alternative is to be preferred unless a non-psuedo reload is needed. The pattern was trying to convey that whenever both r and w are required, that it should prefer r unless a reload is needed. This is because if a reload is needed then we can construct the constants more flexibly on the SIMD side. We were using this so simplify the implementation and to get generic cases such as: double negabs (double x) { unsigned long long y; memcpy (, , sizeof(double)); y = y | (1UL << 63); memcpy (, , sizeof(double)); return x; } which don't go through an expander. However the implementation of ^ in the register allocator is not according to the documentation in that it also has an effect during coloring. During initial register class selection it applies a penalty to a class, similar to how ? does. In this example the penalty makes the use of GP regs expensive enough that it no longer considers them: r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS ;;3--> b 0: i 9 r106=r105&0x1 :cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0) PR_HI_REGS+0(0):model 4 which is not the expected behavior. For GCC 14 this is a conservative fix. 1. 
we remove the ^ modifier from the logical optabs. 2. In order not to regress copysign we then move the copysign expansion to directly use the SIMD variant. Since copysign only supports floating point modes this is fine and no longer relies on the register allocator to select the right alternative. It once again regresses the general case, but this case wasn't optimized in earlier GCCs either so it's not a regression in GCC 14. This change gives strict better codegen than earlier GCCs and still optimizes the important cases. gcc/ChangeLog: PR target/114741 * config/aarch64/aarch64.md (3): Remove ^ from alt 2. (copysign3): Use SIMD version of IOR directly. gcc/testsuite/ChangeLog: PR target/114741 * gcc.target/aarch64/fneg-abs_2.c: Update codegen. * gcc.target/aarch64/fneg-abs_4.c: xfail for now. * gcc.target/aarch64/pr114741.c: New test. Diff: --- gcc/config/aarch64/aarch64.md | 23 + gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c | 5 ++--- gcc/testsuite/gcc.target/aarch64/fneg-abs_4.c | 4 ++-- gcc/testsuite/gcc.target/aarch64/pr114741.c | 29 +++ 4 files changed, 48 insertions(+), 13 deletions(-) diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 385a669b9b3..dbde066f747 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4811,7 +4811,7 @@ "" {@ [ cons: =0 , 1 , 2; attrs: type , arch ] [ r, %r , r; logic_reg , * ] \t%0, %1, %2 - [ rk , ^r , ; logic_imm , * ] \t%0, %1, %2 + [ rk , r , ; logic_imm , * ] \t%0, %1, %2 [ w, 0 , ; * , sve ] \t%Z0., %Z0., #%2 [ w, w , w; neon_logic , simd ] \t%0., %1., %2. } @@ -7192,22 +7192,29 @@ (match_operand:GPF 2 "nonmemory_operand")] "TARGET_SIMD" { - machine_mode int_mode = mode; - rtx bitmask = gen_reg_rtx (int_mode); - emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U - << (GET_MODE_BITSIZE (mode) - 1))); + rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U + << (GET_MODE_BITSIZE (mode) - 1)); /* copysign (x, -1) should instead be expanded as orr with the sign bit. 
*/ rtx op2_elt = unwrap_const_vec_duplicate (operands[2]); if (GET_CODE (op2_elt) == CONST_DO
[PATCH]AArch64: remove reliance on register allocator for simd/gpreg costing. [PR114741]
Hi All, In PR114741 we see that we have a regression in codegen when SVE is enable where the simple testcase: void foo(unsigned v, unsigned *p) { *p = v & 1; } generates foo: fmovs31, w0 and z31.s, z31.s, #1 str s31, [x1] ret instead of: foo: and w0, w0, 1 str w0, [x1] ret This causes an impact it not just codesize but also performance. This is caused by the use of the ^ constraint modifier in the pattern 3. The documentation states that this modifier should only have an effect on the alternative costing in that a particular alternative is to be preferred unless a non-psuedo reload is needed. The pattern was trying to convey that whenever both r and w are required, that it should prefer r unless a reload is needed. This is because if a reload is needed then we can construct the constants more flexibly on the SIMD side. We were using this so simplify the implementation and to get generic cases such as: double negabs (double x) { unsigned long long y; memcpy (, , sizeof(double)); y = y | (1UL << 63); memcpy (, , sizeof(double)); return x; } which don't go through an expander. However the implementation of ^ in the register allocator is not according to the documentation in that it also has an effect during coloring. During initial register class selection it applies a penalty to a class, similar to how ? does. In this example the penalty makes the use of GP regs expensive enough that it no longer considers them: r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS ;;3--> b 0: i 9 r106=r105&0x1 :cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0) PR_HI_REGS+0(0):model 4 which is not the expected behavior. For GCC 14 this is a conservative fix. 1. we remove the ^ modifier from the logical optabs. 2. In order not to regress copysign we then move the copysign expansion to directly use the SIMD variant. Since copysign only supports floating point modes this is fine and no longer relies on the register allocator to select the right alternative. 
It once again regresses the general case, but this case wasn't optimized in earlier GCCs either so it's not a regression in GCC 14. This change gives strict better codegen than earlier GCCs and still optimizes the important cases. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR target/114741 * config/aarch64/aarch64.md (3): Remove ^ from alt 2. (copysign3): Use SIMD version of IOR directly. gcc/testsuite/ChangeLog: PR target/114741 * gcc.target/aarch64/fneg-abs_2.c: Update codegen. * gcc.target/aarch64/fneg-abs_4.c: xfail for now. * gcc.target/aarch64/pr114741.c: New test. --- diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 385a669b9b3c31cc9108a660e881b9091c71fc7c..dbde066f7478bec51a8703b017ea553aa98be309 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -4811,7 +4811,7 @@ (define_insn "3" "" {@ [ cons: =0 , 1 , 2; attrs: type , arch ] [ r, %r , r; logic_reg , * ] \t%0, %1, %2 - [ rk , ^r , ; logic_imm , * ] \t%0, %1, %2 + [ rk , r , ; logic_imm , * ] \t%0, %1, %2 [ w, 0 , ; * , sve ] \t%Z0., %Z0., #%2 [ w, w , w; neon_logic , simd ] \t%0., %1., %2. } @@ -7192,22 +7192,29 @@ (define_expand "copysign3" (match_operand:GPF 2 "nonmemory_operand")] "TARGET_SIMD" { - machine_mode int_mode = mode; - rtx bitmask = gen_reg_rtx (int_mode); - emit_move_insn (bitmask, GEN_INT (HOST_WIDE_INT_M1U - << (GET_MODE_BITSIZE (mode) - 1))); + rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U + << (GET_MODE_BITSIZE (mode) - 1)); /* copysign (x, -1) should instead be expanded as orr with the sign bit. 
*/ rtx op2_elt = unwrap_const_vec_duplicate (operands[2]); if (GET_CODE (op2_elt) == CONST_DOUBLE && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt))) { - emit_insn (gen_ior3 ( - lowpart_subreg (int_mode, operands[0], mode), - lowpart_subreg (int_mode, operands[1], mode), bitmask)); + rtx v_bitmask + = force_reg (V2mode, +gen_const_vec_duplicate (V2mode, + signbit_const)); + + emit_insn (gen_iorv23 ( + lowpart_subreg (V2mode, operands[0], mode), + lowpart_subreg (V2mode, operands[1], mode), + v_bitmask)); DONE; } + machine_mode int_mode = mode; + rtx bitmask = gen_reg_rtx (int_mode); + emit_move_insn (bitmask, signbit_const); operands[2] = force_reg (mode, operands[2]); emit_insn (gen_copysign3_insn (operands[0], operands[1], operands[2],
gcc-wwwdocs branch master updated. 3530b8d820658fb3add4b06def91672a0053f2b2
This is an automated email from the git hooks/post-receive script. It was generated because a ref change was pushed to the repository containing the project "gcc-wwwdocs". The branch, master has been updated via 3530b8d820658fb3add4b06def91672a0053f2b2 (commit) from 794555052d5c1d9a92298aba1fc4b645042946dd (commit) Those revisions listed above that are new to this repository have not appeared on any other notification email; so we list those revisions in full, below. - Log - commit 3530b8d820658fb3add4b06def91672a0053f2b2 Author: Tamar Christina Date: Mon Apr 15 16:00:21 2024 +0100 gcc-14/docs: document early break support and pragma novector diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html index 6035ae37..c98ebe5a 100644 --- a/htdocs/gcc-14/changes.html +++ b/htdocs/gcc-14/changes.html @@ -124,6 +124,34 @@ a work-in-progress. for indicating parameters that are expected to be null-terminated strings. + + The vectorizer now supports vectorizing loops which contain any number of early breaks. + This means loops such as: + + int z[100], y[100], x[100]; + int foo (int n) + { + int res = 0; + for (int i = 0; i < n; i++) + { + y[i] = x[i] * 2; + res += x[i] + y[i]; + + if (x[i] > 5) +break; + + if (z[i] > 5) +break; + + } + return res; + } + + can now be vectorized on a number of targets. In this first version any + input data sources must either have a statically known size at compile time + or the vectorizer must be able to determine based on auxillary information + that the accesses are aligned. + New Languages and Language specific improvements @@ -234,6 +262,9 @@ a work-in-progress. previous options -std=c2x, -std=gnu2x and -Wc11-c2x-compat, which are deprecated but remain supported. + GCC supports a new pragma pragma GCC novector to + indicate to the vectorizer not to vectorize the loop annotated with the + pragma. C++ @@ -403,6 +434,9 @@ a work-in-progress. 
warnings are enabled for C++ as well The DR 2237 code no longer gives an error, it emits a -Wtemplate-id-cdtor warning instead + GCC supports a new pragma pragma GCC novector to + indicate to the vectorizer not to vectorize the loop annotated with the + pragma. Runtime Library (libstdc++) --- Summary of changes: htdocs/gcc-14/changes.html | 34 ++ 1 file changed, 34 insertions(+) hooks/post-receive -- gcc-wwwdocs
[gcc r14-9997] testsuite: Fix data check loop on vect-early-break_124-pr114403.c
https://gcc.gnu.org/g:f438acf7ce2e6cb862cf62f2543c36639e2af233 commit r14-9997-gf438acf7ce2e6cb862cf62f2543c36639e2af233 Author: Tamar Christina Date: Tue Apr 16 20:56:26 2024 +0100 testsuite: Fix data check loop on vect-early-break_124-pr114403.c The testcase had the wrong indices in the buffer check loop. gcc/testsuite/ChangeLog: PR tree-optimization/114403 * gcc.dg/vect/vect-early-break_124-pr114403.c: Fix check loop. Diff: --- gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c index 1751296ab81..51abf245ccb 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c @@ -68,8 +68,8 @@ int main () int store_size = sizeof(PV); #pragma GCC novector - for (int i = 0; i < NUM - 1; i+=store_size) -if (0 != __builtin_memcmp (buffer+i, (char*)[i].Val, store_size)) + for (int i = 0; i < NUM - 1; i++) +if (0 != __builtin_memcmp (buffer+(i*store_size), (char*)[i].Val, store_size)) __builtin_abort (); return 0;
RE: [PATCH]middle-end: skip vectorization check on ilp32 on vect-early-break_124-pr114403.c
> On Tue, Apr 16, 2024 at 09:00:53AM +0200, Richard Biener wrote: > > > PR tree-optimization/114403 > > > * gcc.dg/vect/vect-early-break_124-pr114403.c: Skip in ilp32. > > > > > > --- > > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c > b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c > > > index > 1751296ab813fe85eaab1f58dc674bac10f6eb7a..db8e00556f116ca81c5a6558e > c6ecd3b222ec93d 100644 > > > --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c > > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c > > > @@ -2,11 +2,11 @@ > > > /* { dg-require-effective-target vect_early_break_hw } */ > > > /* { dg-require-effective-target vect_long_long } */ > > > > > > -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! > > > ilp32 } } } > } */ > > > > > > #include "tree-vect.h" > > > > > > -typedef unsigned long PV; > > > +typedef unsigned long long PV; > > > typedef struct _buff_t { > > > int foo; > > > PV Val; > > As discussed on IRC, I think we want > --- gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c.jj 2024-04- > 16 08:43:36.001729192 +0200 > +++ gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c 2024-04- > 16 08:55:11.296214959 +0200 > @@ -64,8 +64,8 @@ int main () > >int store_size = sizeof(PV); > #pragma GCC novector > - for (int i = 0; i < NUM - 1; i+=store_size) > -if (0 != __builtin_memcmp (buffer+i, (char*)[i].Val, store_size)) > + for (int i = 0; i < NUM - 1; i++) > +if (0 != __builtin_memcmp (buffer+i*store_size, (char*)[i].Val, > store_size)) >__builtin_abort (); > >return 0; > > instead (and then I think there is no need to switch PV from unsigned long > to unsigned long long, nor disabling on ilp32. > Regtested on x86_64-pc-linux-gnu with -m32,-m64 and no issues. Ok for master? 
Thanks, Tamar gcc/testsuite/ChangeLog: PR tree-optimization/114403 * gcc.dg/vect/vect-early-break_124-pr114403.c: Fix check loop. -- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c index 1751296ab813fe85eaab1f58dc674bac10f6eb7a..51abf245ccb51b85f06916a8a0238698911ab551 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c @@ -68,8 +68,8 @@ int main () int store_size = sizeof(PV); #pragma GCC novector - for (int i = 0; i < NUM - 1; i+=store_size) -if (0 != __builtin_memcmp (buffer+i, (char*)[i].Val, store_size)) + for (int i = 0; i < NUM - 1; i++) +if (0 != __builtin_memcmp (buffer+(i*store_size), (char*)[i].Val, store_size)) __builtin_abort (); return 0; rb18418.patch Description: rb18418.patch
[PATCH]middle-end: skip vectorization check on ilp32 on vect-early-break_124-pr114403.c
Hi all, The testcase seems to fail vectorization on -m32 since the access pattern is determined as too complex. This skips the vectorization check on ilp32 systems as I couldn't find a better proxy for being able to do strided 64-bit loads and I suspect it would fail on all 32-bit targets. Regtested on x86_64-pc-linux-gnu with -m32 and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: PR tree-optimization/114403 * gcc.dg/vect/vect-early-break_124-pr114403.c: Skip in ilp32. --- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c index 1751296ab813fe85eaab1f58dc674bac10f6eb7a..db8e00556f116ca81c5a6558ec6ecd3b222ec93d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c @@ -2,11 +2,11 @@ /* { dg-require-effective-target vect_early_break_hw } */ /* { dg-require-effective-target vect_long_long } */ -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! ilp32 } } } } */ #include "tree-vect.h" -typedef unsigned long PV; +typedef unsigned long long PV; typedef struct _buff_t { int foo; PV Val; -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c index 1751296ab813fe85eaab1f58dc674bac10f6eb7a..db8e00556f116ca81c5a6558ec6ecd3b222ec93d 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_124-pr114403.c @@ -2,11 +2,11 @@ /* { dg-require-effective-target vect_early_break_hw } */ /* { dg-require-effective-target vect_long_long } */ -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! 
ilp32 } } } } */ #include "tree-vect.h" -typedef unsigned long PV; +typedef unsigned long long PV; typedef struct _buff_t { int foo; PV Val;
docs: document early break support and pragma novector
docs: document early break support and pragma novector --- diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html index b4c602a523717c1d64333e44aefb60ba0ed02e7a..aceecb86f17443cfae637e90987427b98c42f6eb 100644 --- a/htdocs/gcc-14/changes.html +++ b/htdocs/gcc-14/changes.html @@ -200,6 +200,34 @@ a work-in-progress. for indicating parameters that are expected to be null-terminated strings. + +The vectorizer now supports vectorizing loops which contain any number of early breaks. +This means loops such as: + + int z[100], y[100], x[100]; + int foo (int n) + { + int res = 0; + for (int i = 0; i < n; i++) + { + y[i] = x[i] * 2; + res += x[i] + y[i]; + + if (x[i] > 5) +break; + + if (z[i] > 5) +break; + + } + return res; + } + +can now be vectorized on a number of targets. In this first version any +input data sources must either have a statically known size at compile time +or the vectorizer must be able to determine based on auxillary information +that the accesses are aligned. + New Languages and Language specific improvements @@ -231,6 +259,9 @@ a work-in-progress. previous options -std=c2x, -std=gnu2x and -Wc11-c2x-compat, which are deprecated but remain supported. + GCC supports a new pragma pragma GCC novector to + indicate to the vectorizer not to vectorize the loop annotated with the + pragma. C++ @@ -400,6 +431,9 @@ a work-in-progress. warnings are enabled for C++ as well The DR 2237 code no longer gives an error, it emits a -Wtemplate-id-cdtor warning instead + GCC supports a new pragma pragma GCC novector to + indicate to the vectorizer not to vectorize the loop annotated with the + pragma. Runtime Library (libstdc++) -- diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html index b4c602a523717c1d64333e44aefb60ba0ed02e7a..aceecb86f17443cfae637e90987427b98c42f6eb 100644 --- a/htdocs/gcc-14/changes.html +++ b/htdocs/gcc-14/changes.html @@ -200,6 +200,34 @@ a work-in-progress. 
for indicating parameters that are expected to be null-terminated strings. + +The vectorizer now supports vectorizing loops which contain any number of early breaks. +This means loops such as: + + int z[100], y[100], x[100]; + int foo (int n) + { + int res = 0; + for (int i = 0; i < n; i++) + { + y[i] = x[i] * 2; + res += x[i] + y[i]; + + if (x[i] > 5) + break; + + if (z[i] > 5) + break; + + } + return res; + } + +can now be vectorized on a number of targets. In this first version any +input data sources must either have a statically known size at compile time +or the vectorizer must be able to determine based on auxillary information +that the accesses are aligned. + New Languages and Language specific improvements @@ -231,6 +259,9 @@ a work-in-progress. previous options -std=c2x, -std=gnu2x and -Wc11-c2x-compat, which are deprecated but remain supported. + GCC supports a new pragma pragma GCC novector to + indicate to the vectorizer not to vectorize the loop annotated with the + pragma. C++ @@ -400,6 +431,9 @@ a work-in-progress. warnings are enabled for C++ as well The DR 2237 code no longer gives an error, it emits a -Wtemplate-id-cdtor warning instead + GCC supports a new pragma pragma GCC novector to + indicate to the vectorizer not to vectorize the loop annotated with the + pragma. Runtime Library (libstdc++)
[gcc r11-11323] [AArch64]: Do not allow SIMD clones with simdlen 1 [PR113552]
https://gcc.gnu.org/g:0c2fcf3ddfe93d1f403962c4bacbb5d55ab7d19d commit r11-11323-g0c2fcf3ddfe93d1f403962c4bacbb5d55ab7d19d Author: Tamar Christina Date: Mon Apr 15 12:32:24 2024 +0100 [AArch64]: Do not allow SIMD clones with simdlen 1 [PR113552] This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07. The AArch64 vector PCS does not allow simd calls with simdlen 1, however due to a bug we currently do allow it for num == 0. This causes us to emit a symbol that doesn't exist and we fail to link. gcc/ChangeLog: PR tree-optimization/113552 * config/aarch64/aarch64.c (aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1. gcc/testsuite/ChangeLog: PR tree-optimization/113552 * gcc.target/aarch64/pr113552.c: New test. * gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check. Diff: --- gcc/config/aarch64/aarch64.c | 18 ++ gcc/testsuite/gcc.target/aarch64/pr113552.c| 17 + .../gcc.target/aarch64/simd_pcs_attribute-3.c | 4 ++-- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 9bbbc5043af..4df72339952 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -25556,7 +25556,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, tree base_type, int num) { tree t, ret_type; - unsigned int elt_bits, count; + unsigned int elt_bits, count = 0; unsigned HOST_WIDE_INT const_simdlen; poly_uint64 vec_bits; @@ -25624,11 +25624,20 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)); if (known_eq (clonei->simdlen, 0U)) { - count = 2; - vec_bits = (num == 0 ? 64 : 128); + /* We don't support simdlen == 1. */ + if (known_eq (elt_bits, 64)) + { + count = 1; + vec_bits = 128; + } + else + { + count = 2; + vec_bits = (num == 0 ? 
64 : 128); + } clonei->simdlen = exact_div (vec_bits, elt_bits); } - else + else if (maybe_ne (clonei->simdlen, 1U)) { count = 1; vec_bits = clonei->simdlen * elt_bits; @@ -25643,6 +25652,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, return 0; } } + clonei->vecsize_int = vec_bits; clonei->vecsize_float = vec_bits; return count; diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c b/gcc/testsuite/gcc.target/aarch64/pr113552.c new file mode 100644 index 000..9c96b061ed2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=armv8-a" } */ + +__attribute__ ((__simd__ ("notinbranch"), const)) +double cos (double); + +void foo (float *a, double *b) +{ +for (int i = 0; i < 12; i+=3) + { +b[i] = cos (5.0 * a[i]); +b[i+1] = cos (5.0 * a[i+1]); +b[i+2] = cos (5.0 * a[i+2]); + } +} + +/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c index 95f6a6803e8..c6dac6b104c 100644 --- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c +++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c @@ -18,7 +18,7 @@ double foo(double x) } /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */
[gcc r12-10329] AArch64: Do not allow SIMD clones with simdlen 1 [PR113552]
https://gcc.gnu.org/g:642cfd049780f03335da9fe0a51415f130232334 commit r12-10329-g642cfd049780f03335da9fe0a51415f130232334 Author: Tamar Christina Date: Mon Apr 15 12:16:53 2024 +0100 AArch64: Do not allow SIMD clones with simdlen 1 [PR113552] This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07. The AArch64 vector PCS does not allow simd calls with simdlen 1, however due to a bug we currently do allow it for num == 0. This causes us to emit a symbol that doesn't exist and we fail to link. gcc/ChangeLog: PR tree-optimization/113552 * config/aarch64/aarch64.cc (aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1. gcc/testsuite/ChangeLog: PR tree-optimization/113552 * gcc.target/aarch64/pr113552.c: New test. * gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check. Diff: --- gcc/config/aarch64/aarch64.cc | 16 +--- gcc/testsuite/gcc.target/aarch64/pr113552.c | 17 + gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c | 4 ++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index 2bbba323770..96976abdbf4 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -26898,7 +26898,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, tree base_type, int num) { tree t, ret_type; - unsigned int elt_bits, count; + unsigned int elt_bits, count = 0; unsigned HOST_WIDE_INT const_simdlen; poly_uint64 vec_bits; @@ -26966,8 +26966,17 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)); if (known_eq (clonei->simdlen, 0U)) { - count = 2; - vec_bits = (num == 0 ? 64 : 128); + /* We don't support simdlen == 1. */ + if (known_eq (elt_bits, 64)) + { + count = 1; + vec_bits = 128; + } + else + { + count = 2; + vec_bits = (num == 0 ? 
64 : 128); + } clonei->simdlen = exact_div (vec_bits, elt_bits); } else @@ -26985,6 +26994,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, return 0; } } + clonei->vecsize_int = vec_bits; clonei->vecsize_float = vec_bits; return count; diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c b/gcc/testsuite/gcc.target/aarch64/pr113552.c new file mode 100644 index 000..9c96b061ed2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=armv8-a" } */ + +__attribute__ ((__simd__ ("notinbranch"), const)) +double cos (double); + +void foo (float *a, double *b) +{ +for (int i = 0; i < 12; i+=3) + { +b[i] = cos (5.0 * a[i]); +b[i+1] = cos (5.0 * a[i+1]); +b[i+2] = cos (5.0 * a[i+2]); + } +} + +/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c index 95f6a6803e8..c6dac6b104c 100644 --- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c +++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c @@ -18,7 +18,7 @@ double foo(double x) } /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */
[gcc r13-8604] AArch64: Do not allow SIMD clones with simdlen 1 [PR113552]
https://gcc.gnu.org/g:1e08e39c743692afdd5d3546b2223474beac1dbc commit r13-8604-g1e08e39c743692afdd5d3546b2223474beac1dbc Author: Tamar Christina Date: Mon Apr 15 12:11:48 2024 +0100 AArch64: Do not allow SIMD clones with simdlen 1 [PR113552] This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07. The AArch64 vector PCS does not allow simd calls with simdlen 1, however due to a bug we currently do allow it for num == 0. This causes us to emit a symbol that doesn't exist and we fail to link. gcc/ChangeLog: PR tree-optimization/113552 * config/aarch64/aarch64.cc (aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1. gcc/testsuite/ChangeLog: PR tree-optimization/113552 * gcc.target/aarch64/pr113552.c: New test. * gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check. Diff: --- gcc/config/aarch64/aarch64.cc | 16 +--- gcc/testsuite/gcc.target/aarch64/pr113552.c | 17 + gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c | 4 ++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f6d14cd791a..b8a4ab1b980 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -27029,7 +27029,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, bool explicit_p) { tree t, ret_type; - unsigned int elt_bits, count; + unsigned int elt_bits, count = 0; unsigned HOST_WIDE_INT const_simdlen; poly_uint64 vec_bits; @@ -27102,8 +27102,17 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)); if (known_eq (clonei->simdlen, 0U)) { - count = 2; - vec_bits = (num == 0 ? 64 : 128); + /* We don't support simdlen == 1. */ + if (known_eq (elt_bits, 64)) + { + count = 1; + vec_bits = 128; + } + else + { + count = 2; + vec_bits = (num == 0 ? 
64 : 128); + } clonei->simdlen = exact_div (vec_bits, elt_bits); } else @@ -27123,6 +27132,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, return 0; } } + clonei->vecsize_int = vec_bits; clonei->vecsize_float = vec_bits; return count; diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c b/gcc/testsuite/gcc.target/aarch64/pr113552.c new file mode 100644 index 000..9c96b061ed2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=armv8-a" } */ + +__attribute__ ((__simd__ ("notinbranch"), const)) +double cos (double); + +void foo (float *a, double *b) +{ +for (int i = 0; i < 12; i+=3) + { +b[i] = cos (5.0 * a[i]); +b[i+1] = cos (5.0 * a[i+1]); +b[i+2] = cos (5.0 * a[i+2]); + } +} + +/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c index 95f6a6803e8..c6dac6b104c 100644 --- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c +++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c @@ -18,7 +18,7 @@ double foo(double x) } /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */
[gcc r14-9969] middle-end: adjust loop upper bounds when peeling for gaps and early break [PR114403].
https://gcc.gnu.org/g:85002f8085c25bb3e74ab013581a74e7c7ae006b commit r14-9969-g85002f8085c25bb3e74ab013581a74e7c7ae006b Author: Tamar Christina Date: Mon Apr 15 12:06:21 2024 +0100 middle-end: adjust loop upper bounds when peeling for gaps and early break [PR114403]. This fixes a bug with the interaction between peeling for gaps and early break. Before I go further, I'll first explain how I understand this to work for loops with a single exit. When peeling for gaps we peel N < VF iterations to scalar. This happens by removing N iterations from the calculation of niters such that vect_iters * VF == niters is always false. In other words, when we exit the vector loop we always fall to the scalar loop. The loop bounds adjustment guarantees this. Because of this we potentially execute a vector loop iteration less. That is, if you're at the boundary condition where niters % VF by peeling one or more scalar iterations the vector loop executes one less. This is accounted for by the adjustments in vect_transform_loops. This adjustment happens differently based on whether the the vector loop can be partial or not: Peeling for gaps sets the bias to 0 and then: when not partial: we take the floor of (scalar_upper_bound / VF) - 1 to get the vector latch iteration count. when loop is partial: For a single exit this means the loop is masked, we take the ceil to account for the fact that the loop can handle the final partial iteration using masking. Note that there's no difference between ceil an floor on the boundary condition. There is a difference however when you're slightly above it. i.e. if scalar iterates 14 times and VF = 4 and we peel 1 iteration for gaps. The partial loop does ((13 + 0) / 4) - 1 == 2 vector iterations. and in effect the partial iteration is ignored and it's done as scalar. This is fine because the niters modification has capped the vector iteration at 2. 
So that when we reduce the induction values you end up entering the scalar code with ind_var.2 = ind_var.1 + 2 * VF. Now lets look at early breaks. To make it esier I'll focus on the specific testcase: char buffer[64]; __attribute__ ((noipa)) buff_t *copy (buff_t *first, buff_t *last) { char *buffer_ptr = buffer; char *const buffer_end = [SZ-1]; int store_size = sizeof(first->Val); while (first != last && (buffer_ptr + store_size) <= buffer_end) { const char *value_data = (const char *)(>Val); __builtin_memcpy(buffer_ptr, value_data, store_size); buffer_ptr += store_size; ++first; } if (first == last) return 0; return first; } Here the first, early exit is on the condition: (buffer_ptr + store_size) <= buffer_end and the main exit is on condition: first != last This is important, as this bug only manifests itself when the first exit has a known constant iteration count that's lower than the latch exit count. because buffer holds 64 bytes, and VF = 4, unroll = 2, we end up processing 16 bytes per iteration. So the exit has a known bounds of 8 + 1. The vectorizer correctly analizes this: Statement (exit)if (ivtmp_21 != 0) is executed at most 8 (bounded by 8) + 1 times in loop 1. and as a consequence the IV is bound by 9: # vect_vec_iv_.14_117 = PHI <_118(9), { 9, 8, 7, 6 }(20)> ... vect_ivtmp_21.16_124 = vect_vec_iv_.14_117 + { 18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615 }; mask_patt_22.17_126 = vect_ivtmp_21.16_124 != { 0, 0, 0, 0 }; if (mask_patt_22.17_126 == { -1, -1, -1, -1 }) goto ; [88.89%] else goto ; [11.11%] The imporant bits are this: In this example the value of last - first = 416. the calculated vector iteration count, is: x = (((ptr2 - ptr1) - 16) / 16) + 1 = 27 the bounds generated, adjusting for gaps: x == (((x - 1) >> 2) << 2) which means we'll always fall through to the scalar code. as intended. Here are two key things to note: 1. In this loop, the early exit will always be the one taken. 
When it's taken we enter the scalar loop with the correct induction value to apply the gap peeling. 2. If the main exit is taken, the induction value assumes you've finished all vector iterations. i.e. it assumes you have completed 24 iterations, as we treat the main exit the same for normal loop vect and early break when not PEELED. This means the
[PATCH]middle-end: adjust loop upper bounds when peeling for gaps and early break [PR114403].
Hi All, This is a story all about how the peeling for gaps introduces a bug in the upper bounds. Before I go further, I'll first explain how I understand this to work for loops with a single exit. When peeling for gaps we peel N < VF iterations to scalar. This happens by removing N iterations from the calculation of niters such that vect_iters * VF == niters is always false. In other words, when we exit the vector loop we always fall to the scalar loop. The loop bounds adjustment guarantees this. Because of this we potentially execute a vector loop iteration less. That is, if you're at the boundary condition where niters % VF by peeling one or more scalar iterations the vector loop executes one less. This is accounted for by the adjustments in vect_transform_loops. This adjustment happens differently based on whether the vector loop can be partial or not: Peeling for gaps sets the bias to 0 and then: when not partial: we take the floor of (scalar_upper_bound / VF) - 1 to get the vector latch iteration count. when loop is partial: For a single exit this means the loop is masked, we take the ceil to account for the fact that the loop can handle the final partial iteration using masking. Note that there's no difference between ceil and floor on the boundary condition. There is a difference however when you're slightly above it. i.e. if scalar iterates 14 times and VF = 4 and we peel 1 iteration for gaps. The partial loop does ((13 + 0) / 4) - 1 == 2 vector iterations. and in effect the partial iteration is ignored and it's done as scalar. This is fine because the niters modification has capped the vector iteration at 2. So that when we reduce the induction values you end up entering the scalar code with ind_var.2 = ind_var.1 + 2 * VF. Now let's look at early breaks. 
To make it esier I'll focus on the specific testcase: char buffer[64]; __attribute__ ((noipa)) buff_t *copy (buff_t *first, buff_t *last) { char *buffer_ptr = buffer; char *const buffer_end = [SZ-1]; int store_size = sizeof(first->Val); while (first != last && (buffer_ptr + store_size) <= buffer_end) { const char *value_data = (const char *)(>Val); __builtin_memcpy(buffer_ptr, value_data, store_size); buffer_ptr += store_size; ++first; } if (first == last) return 0; return first; } Here the first, early exit is on the condition: (buffer_ptr + store_size) <= buffer_end and the main exit is on condition: first != last This is important, as this bug only manifests itself when the first exit has a known constant iteration count that's lower than the latch exit count. because buffer holds 64 bytes, and VF = 4, unroll = 2, we end up processing 16 bytes per iteration. So the exit has a known bounds of 8 + 1. The vectorizer correctly analizes this: Statement (exit)if (ivtmp_21 != 0) is executed at most 8 (bounded by 8) + 1 times in loop 1. and as a consequence the IV is bound by 9: # vect_vec_iv_.14_117 = PHI <_118(9), { 9, 8, 7, 6 }(20)> ... vect_ivtmp_21.16_124 = vect_vec_iv_.14_117 + { 18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615 }; mask_patt_22.17_126 = vect_ivtmp_21.16_124 != { 0, 0, 0, 0 }; if (mask_patt_22.17_126 == { -1, -1, -1, -1 }) goto ; [88.89%] else goto ; [11.11%] The imporant bits are this: In this example the value of last - first = 416. the calculated vector iteration count, is: x = (((ptr2 - ptr1) - 16) / 16) + 1 = 27 the bounds generated, adjusting for gaps: x == (((x - 1) >> 2) << 2) which means we'll always fall through to the scalar code. as intended. Here are two key things to note: 1. In this loop, the early exit will always be the one taken. When it's taken we enter the scalar loop with the correct induction value to apply the gap peeling. 2. 
If the main exit is taken, the induction value assumes you've finished all vector iterations. i.e. it assumes you have completed 24 iterations, as we treat the main exit the same for normal loop vect and early break when not PEELED. This means the induction value is adjusted to ind_var.2 = ind_var.1 + 24 * VF; So what's going wrong. The vectorizer's codegen is correct and efficient, however when we adjust the upper bounds, that code knows that the loop's upper bound is based on the early exit. i.e. 8 latch iterations. or in other words. It thinks the loop iterates once. This is incorrect as the vector loop iterates twice, as it has set up the induction value such that it exits at the early exit. So it in effect iterates 2.5x. Because the upper bound is incorrect, when we unroll it now exits from the main exit which uses the incorrect induction value. So there are three ways to fix this: 1. If we take the position that the main exit should support both premature exits and final exits then vect_update_ivs_after_vectorizer
[PATCH]middle-end vect: adjust loop upper bounds when peeling for gaps and early break [PR114403]
Hi All, The report shows that we end up in a situation where the code has been peeled for gaps and we have an early break. The code for peeling for gaps assume that a scalar loop needs to perform at least one iteration. However this doesn't take into account early break where the scalar loop may not need to be executed. That the early break loop can be partial is not accounted for in this scenario. loop partiality is normally handled by setting bias_for_lowest to 1, but when peeling for gaps we end up with 0, which when the loop upper bounds are calculated means that a partial loop iteration loses the final partial iter: Analyzing # of iterations of loop 1 exit condition [8, + , 18446744073709551615] != 0 bounds on difference of bases: -8 ... -8 result: # of iterations 8, bounded by 8 and a VF=4 calculating: Loop 1 iterates at most 1 times. Loop 1 likely iterates at most 1 times. Analyzing # of iterations of loop 1 exit condition [1, + , 1](no_overflow) < bnd.5505_39 bounds on difference of bases: 0 ... 4611686018427387902 Matching expression match.pd:2011, generic-match-8.cc:27 Applying pattern match.pd:2067, generic-match-1.cc:4813 result: # of iterations bnd.5505_39 + 18446744073709551615, bounded by 4611686018427387902 Estimating sizes for loop 1 ... Induction variable computation will be folded away. size: 2 if (ivtmp_312 < bnd.5505_39) Exit condition will be eliminated in last copy. size: 24-3, last_iteration: 24-5 Loop size: 24 Estimated size after unrolling: 26 ;; Guessed iterations of loop 1 is 0.858446. New upper bound 1. upper bound should be 2 not 1. This patch forced the bias_for_lowest to be 1 even when peeling for gaps. I have however not been able to write a standalone reproducer for this so I have no tests but bootstrap and LLVM build fine now. 
The testcase: #define COUNT 9 #define SIZE COUNT * 4 #define TYPE unsigned long TYPE x[SIZE], y[SIZE]; void __attribute__((noipa)) loop (TYPE val) { for (int i = 0; i < COUNT; ++i) { if (x[i * 4] > val || x[i * 4 + 1] > val) return; x[i * 4] = y[i * 2] + 1; x[i * 4 + 1] = y[i * 2] + 2; x[i * 4 + 2] = y[i * 2 + 1] + 3; x[i * 4 + 3] = y[i * 2 + 1] + 4; } } does perform the peeling for gaps and early beak, however it creates a hybrid loop which works fine. adjusting the indices to non linear also works. So I'd like to submit the fix and work on a testcase separately if needed. Bootstrapped Regtested on x86_64-pc-linux-gnu no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/114403 * tree-vect-loop.cc (vect_transform_loop): Adjust upper bounds for when peeling for gaps and early break. --- diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 4375ebdcb493a90fd0501cbb4b07466077b525c3..bf1bb9b005c68fbb13ee1b1279424865b237245a 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -12139,7 +12139,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* The minimum number of iterations performed by the epilogue. This is 1 when peeling for gaps because we always need a final scalar iteration. */ - int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; + int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo) ? 1 : 0; /* +1 to convert latch counts to loop iteration counts, -min_epilogue_iters to remove iterations that cannot be performed by the vector code. */ -- diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 4375ebdcb493a90fd0501cbb4b07466077b525c3..bf1bb9b005c68fbb13ee1b1279424865b237245a 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -12139,7 +12139,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* The minimum number of iterations performed by the epilogue. 
This is 1 when peeling for gaps because we always need a final scalar iteration. */ - int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0; + int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) + && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo) ? 1 : 0; /* +1 to convert latch counts to loop iteration counts, -min_epilogue_iters to remove iterations that cannot be performed by the vector code. */
[gcc r14-9493] match.pd: Only merge truncation with conversion for -fno-signed-zeros
https://gcc.gnu.org/g:7dd3b2b09cbeb6712ec680a0445cb0ad41070423 commit r14-9493-g7dd3b2b09cbeb6712ec680a0445cb0ad41070423 Author: Joe Ramsay Date: Fri Mar 15 09:20:45 2024 + match.pd: Only merge truncation with conversion for -fno-signed-zeros This optimisation does not honour signed zeros, so should not be enabled except with -fno-signed-zeros. gcc/ChangeLog: * match.pd: Fix truncation pattern for -fno-signed-zeroes gcc/testsuite/ChangeLog: * gcc.target/aarch64/no_merge_trunc_signed_zero.c: New test. Diff: --- gcc/match.pd | 1 + .../aarch64/no_merge_trunc_signed_zero.c | 24 ++ 2 files changed, 25 insertions(+) diff --git a/gcc/match.pd b/gcc/match.pd index 9ce313323a3..15a1e7350d4 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -4858,6 +4858,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (simplify (float (fix_trunc @0)) (if (!flag_trapping_math + && !HONOR_SIGNED_ZEROS (type) && types_match (type, TREE_TYPE (@0)) && direct_internal_fn_supported_p (IFN_TRUNC, type, OPTIMIZE_FOR_BOTH)) diff --git a/gcc/testsuite/gcc.target/aarch64/no_merge_trunc_signed_zero.c b/gcc/testsuite/gcc.target/aarch64/no_merge_trunc_signed_zero.c new file mode 100644 index 000..b2c93e55567 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/no_merge_trunc_signed_zero.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-trapping-math -fsigned-zeros" } */ + +#include + +float +f1 (float x) +{ + return (int) rintf(x); +} + +double +f2 (double x) +{ + return (long) rint(x); +} + +/* { dg-final { scan-assembler "frintx\\ts\[0-9\]+, s\[0-9\]+" } } */ +/* { dg-final { scan-assembler "cvtzs\\ts\[0-9\]+, s\[0-9\]+" } } */ +/* { dg-final { scan-assembler "scvtf\\ts\[0-9\]+, s\[0-9\]+" } } */ +/* { dg-final { scan-assembler "frintx\\td\[0-9\]+, d\[0-9\]+" } } */ +/* { dg-final { scan-assembler "cvtzs\\td\[0-9\]+, d\[0-9\]+" } } */ +/* { dg-final { scan-assembler "scvtf\\td\[0-9\]+, d\[0-9\]+" } } */ +
Summary: [PATCH][committed]AArch64: Do not allow SIMD clones with simdlen 1 [PR113552][GCC 13/12/11 backport]
Hi All, This is a backport of g:306713c953d509720dc394c43c0890548bb0ae07. The AArch64 vector PCS does not allow simd calls with simdlen 1, however due to a bug we currently do allow it for num == 0. This causes us to emit a symbol that doesn't exist and we fail to link. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Committed to GCC 13,12,11 branches as previously approved. Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113552 * config/aarch64/aarch64.cc (aarch64_simd_clone_compute_vecsize_and_simdlen): Block simdlen 1. gcc/testsuite/ChangeLog: PR tree-optimization/113552 * gcc.target/aarch64/pr113552.c: New test. * gcc.target/aarch64/simd_pcs_attribute-3.c: Remove bogus check. --- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f546c48ae2d2bad2e34c6b72e5e3e30aba3c3bd6..d19a9c16cc97ae75afd4e29f4339d65d39cfb73a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -27027,7 +27027,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, bool explicit_p) { tree t, ret_type; - unsigned int elt_bits, count; + unsigned int elt_bits, count = 0; unsigned HOST_WIDE_INT const_simdlen; poly_uint64 vec_bits; @@ -27104,7 +27104,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, vec_bits = (num == 0 ? 
64 : 128); clonei->simdlen = exact_div (vec_bits, elt_bits); } - else + else if (maybe_ne (clonei->simdlen, 1U)) { count = 1; vec_bits = clonei->simdlen * elt_bits; diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c b/gcc/testsuite/gcc.target/aarch64/pr113552.c new file mode 100644 index ..9c96b061ed2b4fcc57e58925277f74d14f79c51f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=armv8-a" } */ + +__attribute__ ((__simd__ ("notinbranch"), const)) +double cos (double); + +void foo (float *a, double *b) +{ +for (int i = 0; i < 12; i+=3) + { +b[i] = cos (5.0 * a[i]); +b[i+1] = cos (5.0 * a[i+1]); +b[i+2] = cos (5.0 * a[i+2]); + } +} + +/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c index 95f6a6803e889c02177ef10972962ed62d2095eb..c6dac6b104c94c9de89ed88dc5a73e185d2be125 100644 --- a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c +++ b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c @@ -18,7 +18,7 @@ double foo(double x) } /* { dg-final { scan-assembler-not {\.variant_pcs\tfoo} } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnM1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnM2v_foo} 1 } } */ -/* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN1v_foo} 1 } } */ +/* { dg-final { scan-assembler-not {\.variant_pcs\t_ZGVnN1v_foo} } } */ /* { dg-final { scan-assembler-times {\.variant_pcs\t_ZGVnN2v_foo} 1 } } */ -- diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f546c48ae2d2bad2e34c6b72e5e3e30aba3c3bd6..d19a9c16cc97ae75afd4e29f4339d65d39cfb73a 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -27027,7 +27027,7 @@ 
aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, bool explicit_p) { tree t, ret_type; - unsigned int elt_bits, count; + unsigned int elt_bits, count = 0; unsigned HOST_WIDE_INT const_simdlen; poly_uint64 vec_bits; @@ -27104,7 +27104,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, vec_bits = (num == 0 ? 64 : 128); clonei->simdlen = exact_div (vec_bits, elt_bits); } - else + else if (maybe_ne (clonei->simdlen, 1U)) { count = 1; vec_bits = clonei->simdlen * elt_bits; diff --git a/gcc/testsuite/gcc.target/aarch64/pr113552.c b/gcc/testsuite/gcc.target/aarch64/pr113552.c new file mode 100644 index ..9c96b061ed2b4fcc57e58925277f74d14f79c51f --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr113552.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -march=armv8-a" } */ + +__attribute__ ((__simd__ ("notinbranch"), const)) +double cos (double); + +void foo (float *a, double *b) +{ +for (int i = 0; i < 12; i+=3) + { +b[i] = cos (5.0 * a[i]); +b[i+1] = cos (5.0 * a[i+1]); +b[i+2] = cos (5.0 * a[i+2]); + } +} + +/* { dg-final { scan-assembler-times {bl\t_ZGVnN2v_cos} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c b/gcc/testsuite/gcc.target/aarch64/simd_pcs_attribute-3.c index
RE: [PATCH] vect: Do not peel epilogue for partial vectors [PR114196].
> -Original Message- > From: Richard Biener > Sent: Thursday, March 7, 2024 8:47 AM > To: Robin Dapp > Cc: gcc-patches ; Tamar Christina > > Subject: Re: [PATCH] vect: Do not peel epilogue for partial vectors > [PR114196]. > > On Wed, Mar 6, 2024 at 9:21 PM Robin Dapp wrote: > > > > Hi, > > > > r14-7036-gcbf569486b2dec added an epilogue vectorization guard for early > > break but PR114196 shows that we also run into the problem without early > > break. Therefore remove early break from the conditions. > > > > gcc/ChangeLog: > > > > PR middle-end/114196 > > > > * tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p): Remove > > early break check from guards. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/pr114196.c: New test. > > * gcc.target/riscv/rvv/autovec/pr114196.c: New test. > > --- > > gcc/testsuite/gcc.target/aarch64/pr114196.c | 19 +++ > > .../gcc.target/riscv/rvv/autovec/pr114196.c | 19 +++ > > gcc/tree-vect-loop-manip.cc | 6 +++--- > > 3 files changed, 41 insertions(+), 3 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/aarch64/pr114196.c > > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr114196.c > b/gcc/testsuite/gcc.target/aarch64/pr114196.c > > new file mode 100644 > > index 000..15e4b0e31b8 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/pr114196.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-options { -O3 -fno-vect-cost-model -march=armv9-a -msve-vector- > bits=256 } } */ > > + > > +unsigned a; > > +int b; > > +long *c; > > + > > +int > > +main () > > +{ > > + for (int d = 0; d < 22; d += 4) { > > + b = ({ > > + int e = c[d]; > > + e; > > + }) > > + ? 
0 : -c[d]; > > + a *= 3; > > + } > > +} > > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c > > new file mode 100644 > > index 000..7ba9cbbed70 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr114196.c > > @@ -0,0 +1,19 @@ > > +/* { dg-do compile } */ > > +/* { dg-options { -O3 -fno-vect-cost-model -march=rv64gcv_zvl256b - > mabi=lp64d -mrvv-vector-bits=zvl } } */ > > + > > +unsigned a; > > +int b; > > +long *c; > > + > > +int > > +main () > > +{ > > + for (int d = 0; d < 22; d += 4) { > > + b = ({ > > + int e = c[d]; > > + e; > > + }) > > + ? 0 : -c[d]; > > + a *= 3; > > + } > > +} > > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc > > index f72da915103..c3cd20eef70 100644 > > --- a/gcc/tree-vect-loop-manip.cc > > +++ b/gcc/tree-vect-loop-manip.cc > > @@ -2183,9 +2183,9 @@ vect_can_peel_nonlinear_iv_p (loop_vec_info > loop_vinfo, > > perform the peeling. The below condition mirrors that of > > vect_gen_vector_loop_niters where niters_vector_mult_vf_var then sets > > step_vector to VF rather than 1. This is what creates the nonlinear > > - IV. PR113163. */ > > - if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) > > - && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () > > + IV. PR113163. > > + This also happens without early breaks, see PR114196. */ > > Can you instead reword to not mention early breaks, maybe instead > say PR113163 (with early breaks), PR114196 (without)? > > The dump message also needs adjustments, it mentions early breaks as > well. > > The comment says it matches a condition in vect_gen_vector_loop_niters > but I can't see what that means ... Tamar? 
> The comment was trying to say that this case is when you manage to get here: https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2847 because that makes you fall into https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L3528 which creates the nonlinear IV variable. The vect_step_op_neg exception is because vect_update_ivs_after_vectorizer can deal with that case specifically https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2398 which is what the previous check is also explaining https://github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2133 If this also happens for non-early breaks it's just better to merge the check into the earlier one at github.com/gcc-mirror/gcc/blob/95b6ee96348041eaee9133f082b57f3e57ef0b11/gcc/tree-vect-loop-manip.cc#L2133 Tamar > > + if (LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () > >&& LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) > >&& induction_type != vect_step_op_neg) > > { > > -- > > 2.43.2
RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> Thanks Tamar. > > > Those two cases also *completely* stop vectorization because of either the > > control flow or the fact the vectorizer can't handle complex types. > > Yes, we eventually would like to vectorize the SAT ALU but we start with > scalar part > first. > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works > well with some additions as below. > Feel free to correct me if any misunderstandings. > > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to > usadd$a3(as well as gen_int_libfunc) for int. > 2. We need to implement a default implementation of SAT_ADD if > direct_binary_optab_supported_p is false. > It looks like the default implementation is difficult to make every > backend happy. > That is why you suggest just normal > DEF_INTERNAL_SIGNED_OPTAB_FN in another thread. > > Thanks Richard. > > > But what I'd like to see is that we do more instruction selection on GIMPLE > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel > > passes doing what I'd call instruction selection). But that means not > > adding > > match.pd patterns for that or at least have a separate isel-match.pd > > machinery for that. > > > So as a start I would go for a direct optab and see to recognize it during > > ISEL? > > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good > to know isel and I am happy to > try that once we have conclusion. > So after a lively discussion on IRC, the conclusion is that before we proceed Richi would like to see some examples of various operations. The problem is that unsigned saturating addition is the simplest example and it may lead to an implementation strategy that doesn't scale. So I'd suggest writing some example of both signed and unsigned saturating add and multiply Because signed addition, will likely require a branch and signed multiplication would require a larger type. 
This would allow us to better understand what kind of gimple would have to to deal with in ISEL and VECT if we decide not to lower early. Thanks, Tamar > Pan > > -Original Message- > From: Tamar Christina > Sent: Tuesday, February 27, 2024 5:57 PM > To: Richard Biener > Cc: Li, Pan2 ; gcc-patches@gcc.gnu.org; > juzhe.zh...@rivai.ai; > Wang, Yanzhang ; kito.ch...@gmail.com; > richard.sandiford@arm.com2; jeffreya...@gmail.com > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation > US_PLUS > > > -Original Message- > > From: Richard Biener > > Sent: Tuesday, February 27, 2024 9:44 AM > > To: Tamar Christina > > Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; > > yanzhang.w...@intel.com; kito.ch...@gmail.com; > > richard.sandiford@arm.com2; jeffreya...@gmail.com > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation > > US_PLUS > > > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina > > wrote: > > > > > > Hi Pan, > > > > > > > From: Pan Li > > > > > > > > Hi Richard & Tamar, > > > > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion. By mapping > > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def. > > > > And then expand_US_PLUS in internal-fn.cc. Not very sure if my > > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN. > > > > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given > > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already. > > > > > > > > > > I think a couple of things are being confused here. So lets break it > > > down: > > > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE > > > we only want one internal function for both signed and unsigned SAT_ADD. > > > with this definition we don't need SAT_UADD and SAT_SADD but instead > > > we will only have SAT_ADD, which will expand to us_plus or ss_plus. > > > > > > Now the downside of this is that this is a direct internal optab. 
This > > > means > > > that for the representation to be used the target *must* have the optab > > > implemented. This is a bit annoying because it doesn't allow us to > > > generically > > > assume that all targets use SAT_ADD for saturating add and thus only have > > > to > > > write optimization for this representation. > > > > > > This is why Richi said we may need to use a new tree_code because we c
RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
> Am 19.02.24 um 08:36 schrieb Richard Biener: > > On Sat, Feb 17, 2024 at 11:30 AM wrote: > >> > >> From: Pan Li > >> > >> This patch would like to add the middle-end presentation for the > >> unsigned saturation add. Aka set the result of add to the max > >> when overflow. It will take the pattern similar as below. > >> > >> SAT_ADDU (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > Does this even try to wort out the costs? > > For example, with the following example > > > #define T __UINT16_TYPE__ > > T sat_add1 (T x, T y) > { >return (x + y) | (- (T)((T)(x + y) < x)); > } > > T sat_add2 (T x, T y) > { > T z = x + y; > if (z < x) > z = (T) -1; > return z; > } > > And then "avr-gcc -S -Os -dp" the code is > > > sat_add1: > add r22,r24 ; 7 [c=8 l=2] *addhi3/0 > adc r23,r25 > ldi r18,lo8(1) ; 8 [c=4 l=2] *movhi/4 > ldi r19,0 > cp r22,r24 ; 9 [c=8 l=2] cmphi3/2 > cpc r23,r25 > brlo .L2 ; 10 [c=16 l=1] branch > ldi r19,0; 31 [c=4 l=1] movqi_insn/0 > ldi r18,0; 32 [c=4 l=1] movqi_insn/0 > .L2: > clr r24 ; 13 [c=12 l=4] neghi2/1 > clr r25 > sub r24,r18 > sbc r25,r19 > or r24,r22 ; 29 [c=4 l=1] iorqi3/0 > or r25,r23 ; 30 [c=4 l=1] iorqi3/0 > ret ; 35 [c=0 l=1] return > > sat_add2: > add r22,r24 ; 8 [c=8 l=2] *addhi3/0 > adc r23,r25 > cp r22,r24 ; 9 [c=8 l=2] cmphi3/2 > cpc r23,r25 > brsh .L3 ; 10 [c=16 l=1] branch > ldi r22,lo8(-1) ; 5 [c=4 l=2] *movhi/4 > ldi r23,lo8(-1) > .L3: > mov r25,r23 ; 21 [c=4 l=1] movqi_insn/0 > mov r24,r22 ; 22 [c=4 l=1] movqi_insn/0 > ret ; 25 [c=0 l=1] return > > i.e. the conditional jump is better than overly smart arithmetic > (smaller and faster code with less register pressure). > With larger dypes the difference is even more pronounced- > *on AVR. https://godbolt.org/z/7jaExbTa8 shows the branchless code is better. 
And the branchy code will vectorize worse if at all https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51492 But looking at that output it just seems like it's your expansion that's inefficient. But fair point, perhaps it should be just a normal DEF_INTERNAL_SIGNED_OPTAB_FN so that we provide the additional optimization only for targets that want it. Tamar > >> Take uint8_t as example, we will have: > >> > >> * SAT_ADDU (1, 254) => 255. > >> * SAT_ADDU (1, 255) => 255. > >> * SAT_ADDU (2, 255) => 255. > >> * SAT_ADDU (255, 255) => 255. > >> > >> The patch also implement the SAT_ADDU in the riscv backend as > >> the sample. Given below example: > >> > >> uint64_t sat_add_u64 (uint64_t x, uint64_t y) > >> { > >>return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > >> } > >> > >> Before this patch: > >> > >> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > >> { > >>long unsigned int _1; > >>_Bool _2; > >>long unsigned int _3; > >>long unsigned int _4; > >>uint64_t _7; > >>long unsigned int _10; > >>__complex__ long unsigned int _11; > >> > >> ;; basic block 2, loop depth 0 > >> ;;pred: ENTRY > >>_11 = .ADD_OVERFLOW (x_5(D), y_6(D)); > >>_1 = REALPART_EXPR <_11>; > >>_10 = IMAGPART_EXPR <_11>; > >>_2 = _10 != 0; > >>_3 = (long unsigned int) _2; > >>_4 = -_3; > >>_7 = _1 | _4; > >>return _7; > >> ;;succ: EXIT > >> > >> } > >> > >> After this patch: > >> > >> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > >> { > >>uint64_t _7; > >> > >> ;; basic block 2, loop depth 0 > >> ;;pred: ENTRY > >>_7 = .SAT_ADDU (x_5(D), y_6(D)); [tail call] > >>return _7; > >> ;;succ: EXIT > >> > >> } > >> > >> Then we will have the middle-end representation like .SAT_ADDU after > >> this patch. > > > > I'll note that on RTL we already have SS_PLUS/US_PLUS and friends and > > the corresponding ssadd/usadd optabs. 
There's not much documentation > > unfortunately besides the use of gen_*_fixed_libfunc usage where the comment > > suggests this is used for fixed-point operations. It looks like arm uses > > fractional/accumulator modes for this but for example bfin has ssaddsi3. > > > > So the question is whether the fixed-point case can be distinguished from > > the integer case based on mode. > > > > There's also FIXED_POINT_TYPE on the GENERIC/GIMPLE side and > > no special tree operator codes for them. So compared to what appears > > to be the case on RTL we'd need a way to represent saturating integer > > operations on GIMPLE. > > > > The natural thing is to use direct optab internal functions (that's what you > > basically did, but you added a new optab, IMO without good reason). > > More
RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
> -Original Message- > From: Richard Biener > Sent: Tuesday, February 27, 2024 9:44 AM > To: Tamar Christina > Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; > yanzhang.w...@intel.com; kito.ch...@gmail.com; > richard.sandiford@arm.com2; jeffreya...@gmail.com > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation > US_PLUS > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina > wrote: > > > > Hi Pan, > > > > > From: Pan Li > > > > > > Hi Richard & Tamar, > > > > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion. By mapping > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def. > > > And then expand_US_PLUS in internal-fn.cc. Not very sure if my > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN. > > > > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already. > > > > > > > I think a couple of things are being confused here. So lets break it down: > > > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE > > we only want one internal function for both signed and unsigned SAT_ADD. > > with this definition we don't need SAT_UADD and SAT_SADD but instead > > we will only have SAT_ADD, which will expand to us_plus or ss_plus. > > > > Now the downside of this is that this is a direct internal optab. This > > means > > that for the representation to be used the target *must* have the optab > > implemented. This is a bit annoying because it doesn't allow us to > > generically > > assume that all targets use SAT_ADD for saturating add and thus only have to > > write optimization for this representation. > > > > This is why Richi said we may need to use a new tree_code because we can > > override tree code expansions. However the same can be done with the > > _EXT_FN > > internal functions. > > > > So what I meant was that we want to have a combination of the two. i.e. 
a > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN. > > Whether we want/need _EXT or only direct depends mainly on how we want to > leverage support. If it's only during vectorization and possibly instruction > selection a direct optab is IMO the way to go. Generic optimization only > marginally improves when you explode the number of basic operations you > expose - in fact it gets quite unwieldly to support all of them in > simplifications > and/or canonicalization and you possibly need to translate them back to what > the target CPU supports. > > We already do have too many (IMO) "special" operations exposed "early" > in the GIMPLE pipeline. > > But what I'd like to see is that we do more instruction selection on GIMPLE > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel > passes doing what I'd call instruction selection). But that means not adding > match.pd patterns for that or at least have a separate isel-match.pd > machinery for that. > > So as a start I would go for a direct optab and see to recognize it during > ISEL? > The problem with ISEL and the reason I suggested an indirect IFN is that there Are benefit to be had from recognizing it early. Saturating arithmetic can be optimized Differently from non-saturating ones. But additionally a common way of specifying them decomposes to branches and/or using COMPLEX_EXPR (see the various PRs on saturating arithmetic). These two representation can be detected in PHI-opts and it's beneficial to all targets to canonicalize them to the branchless code. Those two cases also *completely* stop vectorization because of either the control flow or the fact the vectorizer can't handle complex types. So really, gimple ISEL would fix just 1 of the 3 very common cases, and then We'd still need to hack the vectorizer cost models for targets with saturating vector instructions. I of course defer to you, but it seems quite suboptimal to do it this way and doesn't get us first class saturation support. 
Additionally there have been discussions whether both clang and gcc should provide __builtin_saturate_* methods, which the non-direct IFN would help support. Tamar. > > If Richi agrees, the below is what I meant. It creates the infrastructure > > for this > > and for now only allows a default fallback for unsigned saturating add and > > makes > > it easier for us to add the rest later > > > > Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus > > are > the > > RTL expressi
RE: [PATCH]middle-end: delay updating of dominators until later during vectorization. [PR114081]
> > The testcase shows an interesting case where we have multiple loops sharing > > a > > live value and have an early exit that go to the same location. The > > additional > > complication is that on x86_64 with -mavx we seem to also do prologue > > peeling > > on the loops. > > > > We correctly identify which BB we need their dominators updated for, but we > > do > > so too early. > > > > Instead of adding more dominator update we can solve this by for the cases > > with > > multiple exits not to verify dominators at the end of peeling if peeling for > > vectorization. > > > > We can then perform the final dominator updates just before vectorization > > when > > all loop transformations are done. > > What's the actual CFG transform that happens between the old and the new > place? I see a possible edge splitting but where is the one that makes > this patch work? It's not one but two. 1. loop 1 is prologue peeled. This ICEs because the dominator update is only happening for epilogue peeling. Note that loop 1 here dominates 21 and the ICE is: ice.c: In function 'void php_zval_filter(int, int)': ice.c:7:6: error: dominator of 14 should be 21, not 3 7 | void php_zval_filter(int filter, int id1) { | ^~~ ice.c:7:6: error: dominator of 10 should be 21, not 3 during GIMPLE pass: vect dump file: a-ice.c.179t.vect This can be simply fixed by just moving the dom update code down: diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index a5202f32e27..e88948370c6 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -1845,13 +1845,7 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit, to the original function exit we recorded. Other exits are already correct. 
*/ if (multiple_exits_p) - { - update_loop = new_loop; - doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header); - for (unsigned i = 0; i < doms.length (); ++i) - if (flow_bb_inside_loop_p (loop, doms[i])) - doms.unordered_remove (i); - } + update_loop = new_loop; } else /* Add the copy at entry. */ { @@ -1906,6 +1900,11 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit, if (multiple_exits_p) { + doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header); + for (unsigned i = 0; i < doms.length (); ++i) + if (flow_bb_inside_loop_p (loop, doms[i])) + doms.unordered_remove (i); + for (edge e : get_loop_exit_edges (update_loop)) { edge ex; with that done, the next ICE comes along. Loop 1 is peeled again, but this time for epilogue. however loop 1 no longer dominates the exits as the prologue peeled loop does. So we don't find anything to update and ice with the second ICE: ice.c: In function 'void php_zval_filter(int, int)': ice.c:7:6: error: dominator of 14 should be 2, not 21 7 | void php_zval_filter(int filter, int id1) { | ^~~ ice.c:7:6: error: dominator of 10 should be 2, not 21 during GIMPLE pass: vect dump file: a-ice.c.179t.vect because the prologue loop no longer dominates them due to the skip edge. This is why delaying works because we know we have to update the dominators of 14 and 10, but to what we don't know yet. Tamar > > > This also means we reduce the number of dominator updates needed by at least > > 50% and fixes the ICE. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and > > x86_64-pc-linux-gnu no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > PR tree-optimization/114081 > > PR tree-optimization/113290 > > * tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg): > > Skip dominator update when multiple exit. > > (vect_do_peeling): Remove multiple exit dominator update. 
> > * tree-vect-loop.cc (vect_transform_loop): Update dominators when > > multiple exits. > > * tree-vectorizer.h (LOOP_VINFO_DOMS_NEED_UPDATE, > > dominators_needing_update): New. > > > > gcc/testsuite/ChangeLog: > > > > PR tree-optimization/114081 > > PR tree-optimization/113290 > > * gcc.dg/vect/vect-early-break_120-pr114081.c: New test. > > * gcc.dg/vect/vect-early-break_121-pr114081.c: New test. > > > > --- inline copy of patch -- > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c > b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c > > new file mode 100644 > > index > ..2cd4ce1e4ac573ba6e4173 > 0fd2216f0ec8061376 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c > > @@ -0,0 +1,38 @@ > > +/* { dg-do compile } */ > > +/* { dg-add-options vect_early_break } */ > > +/* { dg-require-effective-target vect_early_break } */ > > +/* { dg-require-effective-target vect_int } */ > > +/* {
[PATCH]middle-end: delay updating of dominators until later during vectorization. [PR114081]
Hi All, The testcase shows an interesting case where we have multiple loops sharing a live value and have an early exit that goes to the same location. The additional complication is that on x86_64 with -mavx we seem to also do prologue peeling on the loops. We correctly identify which BBs need their dominators updated, but we do so too early. Instead of adding more dominator updates we can solve this by, for the cases with multiple exits, not verifying dominators at the end of peeling if peeling for vectorization. We can then perform the final dominator updates just before vectorization when all loop transformations are done. This also means we reduce the number of dominator updates needed by at least 50% and fixes the ICE. Bootstrapped Regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/114081 PR tree-optimization/113290 * tree-vect-loop-manip.cc (slpeel_tree_duplicate_loop_to_edge_cfg): Skip dominator update when multiple exits. (vect_do_peeling): Remove multiple exit dominator update. * tree-vect-loop.cc (vect_transform_loop): Update dominators when multiple exits. * tree-vectorizer.h (LOOP_VINFO_DOMS_NEED_UPDATE, dominators_needing_update): New. gcc/testsuite/ChangeLog: PR tree-optimization/114081 PR tree-optimization/113290 * gcc.dg/vect/vect-early-break_120-pr114081.c: New test. * gcc.dg/vect/vect-early-break_121-pr114081.c: New test.
--- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c new file mode 100644 index ..2cd4ce1e4ac573ba6e41730fd2216f0ec8061376 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_120-pr114081.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +typedef struct filter_list_entry { + const char *name; + int id; + void (*function)(); +} filter_list_entry; + +static const filter_list_entry filter_list[9] = {0}; + +void php_zval_filter(int filter, int id1) { + filter_list_entry filter_func; + + int size = 9; + for (int i = 0; i < size; ++i) { +if (filter_list[i].id == filter) { + filter_func = filter_list[i]; + goto done; +} + } + +#pragma GCC novector + for (int i = 0; i < size; ++i) { +if (filter_list[i].id == 0x0204) { + filter_func = filter_list[i]; + goto done; +} + } +done: + if (!filter_func.id) +filter_func.function(); +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c new file mode 100644 index ..feebdb7a6c9b8981d7be31dd1c741f9e36738515 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +typedef struct filter_list_entry { + const char *name; + int id; + void (*function)(); +} filter_list_entry; + +static const filter_list_entry filter_list[9] = {0}; + +void php_zval_filter(int filter, int id1) { + 
filter_list_entry filter_func; + + int size = 9; + for (int i = 0; i < size; ++i) { +if (filter_list[i].id == filter) { + filter_func = filter_list[i]; + goto done; +} + } + + for (int i = 0; i < size; ++i) { +if (filter_list[i].id == 0x0204) { + filter_func = filter_list[i]; + goto done; +} + } +done: + if (!filter_func.id) +filter_func.function(); +} diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc index 3f974d6d839e32516ae316f28ca25316e43d7d86..b5e158bc5cfb5107d5ff461e489d306f81e090d0 100644 --- a/gcc/tree-vect-loop-manip.cc +++ b/gcc/tree-vect-loop-manip.cc @@ -1917,7 +1917,6 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit, doms.safe_push (e->dest); } - iterate_fix_dominators (CDI_DOMINATORS, doms, false); if (updated_doms) updated_doms->safe_splice (doms); } @@ -1925,7 +1924,9 @@ slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, edge loop_exit, free (new_bbs); free (bbs); - checking_verify_dominators (CDI_DOMINATORS); + /* If we're peeling for vectorization then delay verifying dominators. */ + if (!flow_loops || !multiple_exits_p) +checking_verify_dominators (CDI_DOMINATORS); return
RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS
Hi Pan, > From: Pan Li > > Hi Richard & Tamar, > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion. By mapping > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def. > And then expand_US_PLUS in internal-fn.cc. Not very sure if my > understanding is correct for DEF_INTERNAL_INT_EXT_FN. > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already. > I think a couple of things are being confused here. So lets break it down: The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE we only want one internal function for both signed and unsigned SAT_ADD. with this definition we don't need SAT_UADD and SAT_SADD but instead we will only have SAT_ADD, which will expand to us_plus or ss_plus. Now the downside of this is that this is a direct internal optab. This means that for the representation to be used the target *must* have the optab implemented. This is a bit annoying because it doesn't allow us to generically assume that all targets use SAT_ADD for saturating add and thus only have to write optimization for this representation. This is why Richi said we may need to use a new tree_code because we can override tree code expansions. However the same can be done with the _EXT_FN internal functions. So what I meant was that we want to have a combination of the two. i.e. a DEF_INTERNAL_SIGNED_OPTAB_EXT_FN. If Richi agrees, the below is what I meant. It creates the infrastructure for this and for now only allows a default fallback for unsigned saturating add and makes it easier for us to add the rest later Also, unless I'm wrong (and Richi can correct me here), us_plus and ss_plus are the RTL expression, but the optab for saturation are ssadd and usadd. So you don't need to make new us_plus and ss_plus ones. 
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index a07f25f3aee..aaf9f8991b3 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -4103,6 +4103,17 @@ direct_internal_fn_supported_p (internal_fn fn, tree_pair types, return direct_##TYPE##_optab_supported_p (which_optab, types, \ opt_type);\ } +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \ +UNSIGNED_OPTAB, TYPE) \ +case IFN_##CODE: \ + { \ + optab which_optab = (TYPE_UNSIGNED (types.SELECTOR) \ +? UNSIGNED_OPTAB ## _optab \ +: SIGNED_OPTAB ## _optab); \ + return direct_##TYPE##_optab_supported_p (which_optab, types, \ + opt_type) \ + || internal_##CODE##_fn_supported_p (types.SELECTOR, opt_type); \ + } #include "internal-fn.def" case IFN_LAST: @@ -4303,6 +4314,8 @@ set_edom_supported_p (void) optab which_optab = direct_internal_fn_optab (fn, types); \ expand_##TYPE##_optab_fn (fn, stmt, which_optab); \ } +#define DEF_INTERNAL_SIGNED_OPTAB_EXT_FN(CODE, FLAGS, SELECTOR, SIGNED_OPTAB, \ +UNSIGNED_OPTAB, TYPE) #include "internal-fn.def" /* Routines to expand each internal function, indexed by function number. @@ -5177,3 +5190,45 @@ expand_POPCOUNT (internal_fn fn, gcall *stmt) emit_move_insn (plhs, cmp); } } + +void +expand_SAT_ADD (internal_fn fn, gcall *stmt) +{ + /* Check if the target supports the expansion through an IFN. */ + tree_pair types = direct_internal_fn_types (fn, stmt); + optab which_optab = direct_internal_fn_optab (fn, types); + if (direct_binary_optab_supported_p (which_optab, types, + insn_optimization_type ())) +{ + expand_binary_optab_fn (fn, stmt, which_optab); + return; +} + + /* Target does not support the optab, but we can de-compose it. */ + /* + ... decompose to a canonical representation ... + if (TYPE_UNSIGNED (types.SELECTOR)) +{ + ... + decompose back to (X + Y) | - ((X + Y) < X) +} + else +{ + ... 
+} + */ +} + +bool internal_SAT_ADD_fn_supported_p (tree type, optimization_type /* optype */) +{ + /* For now, don't support decomposing vector ops. */ + if (VECTOR_TYPE_P (type)) +return false; + + /* Signed saturating arithmetic is harder to do since we'll so for now + lets ignore. */ + if (!TYPE_UNSIGNED (type)) +return false; + + return TREE_CODE (type) == INTEGER_TYPE; +} \ No newline at end of file diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index c14d30365c1..5a2491228d5 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -92,6 +92,10 @@ along with GCC; see the file
[PATCH]middle-end: update vuses out of loop which use a vdef that's moved [PR114068]
Hi All, In certain cases we can have a situation where the merge block has a vUSE virtual PHI and the exits do not. In this case for instance the exits lead to an abort so they have no virtual PHIs. If we have a store before the first exit and we move it to a later block during vectorization we update the vUSE chain. However the merge block is not an exit and is not visited by the update code. This patch fixes it by checking during moving if there are any out of loop uses of the vDEF that is the last_seen_vuse. Normally there wouldn't be any and things are skipped, but if there are then we update them to the last vDEF in the exit block. Bootstrapped Regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/114068 * tree-vect-loop.cc (move_early_exit_stmts): Update vUSE chain in merge block. gcc/testsuite/ChangeLog: PR tree-optimization/114068 * gcc.dg/vect/vect-early-break_118-pr114068.c: New test. * gcc.dg/vect/vect-early-break_119-pr114068.c: New test.
--- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c new file mode 100644 index ..b462a464b6603e718c5a283513ea586fc13e37ce --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +struct h { + int b; + int f; +} k; + +void n(int m) { + struct h a = k; + for (int o = m; o; ++o) { +if (a.f) + __builtin_unreachable(); +if (o > 1) + __builtin_unreachable(); +*( + o) = 1; + } +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_119-pr114068.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_119-pr114068.c new file mode 100644 index ..a65ef7b8c4901b2ada585f38fda436dc07d1e1de --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_119-pr114068.c @@ -0,0 +1,25 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +struct h { + int b; + int c; + int f; +} k; + +void n(int m) { + struct h a = k; + for (int o = m; o; ++o) { +if (a.f) + __builtin_unreachable(); +if (o > 1) + __builtin_unreachable(); +*( + o) = 1; +*( + o*m) = 2; + } +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 35f1f8c7d4245135ace740ff9be548919587..44bd8032b55b1ef84fdf4fa9d6117304b7709d6f 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -11837,6 +11837,27 @@ move_early_exit_stmts (loop_vec_info loop_vinfo) update_stmt (p); } + /* last_seen_vuse should now be the PHI in the loop header. 
Check for + any out of loop uses and update them to the vUSE on the loop latch. */ + auto vuse_stmt = loop_vinfo->lookup_def (last_seen_vuse); + gphi *vuse_def; + if (vuse_stmt + && (vuse_def = dyn_cast (STMT_VINFO_STMT (vuse_stmt +{ + imm_use_iterator iter; + use_operand_p use_p; + gimple *use_stmt; + auto loop = LOOP_VINFO_LOOP (loop_vinfo); + tree vuse = PHI_ARG_DEF_FROM_EDGE (vuse_def, loop_latch_edge (loop)); + FOR_EACH_IMM_USE_STMT (use_stmt, iter, last_seen_vuse) + { + if (flow_bb_inside_loop_p (loop, use_stmt->bb)) + continue; + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + SET_USE (use_p, vuse); + } +} + /* And update the LC PHIs on exits. */ for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo))) if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb)) -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c new file mode 100644 index ..b462a464b6603e718c5a283513ea586fc13e37ce --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_118-pr114068.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +struct h { + int b; + int f; +} k; + +void n(int m) { + struct h a = k; + for (int o = m; o; ++o) { +if (a.f) + __builtin_unreachable(); +if (o > 1) + __builtin_unreachable(); +*( + o) = 1; + } +} diff --git
RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
> -Original Message- > From: Li, Pan2 > Sent: Monday, February 19, 2024 12:59 PM > To: Tamar Christina ; Richard Biener > > Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang > ; kito.ch...@gmail.com > Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU > > Thanks Tamar for comments and explanations. > > > I think we should actually do an indirect optab here, because the IFN can > > be used > > to replace the general representation of saturating arithmetic. > > > e.g. the __builtin_add_overflow case in > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600 > > is inefficient on all targets and so the IFN can always expand to something > > that's > more > > efficient like the branchless version add_sat2. > > > I think this is why you suggested a new tree code below, but we don't > > really need > > tree-codes for this. It can be done cleaner using the same way as > DEF_INTERNAL_INT_EXT_FN > > Yes, the backend could choose a branchless(of course we always hate branch for > performance) code-gen or even better there is one saturation insn. > Good to learn DEF_INTERNAL_INT_EXT_FN, and will have a try for it. > > > Richard means that there shouldn't be .SAT_ADDU and .SAT_ADDS and that the > sign > > should be determined by the types at expansion time. i.e. there should > > only be > > .SAT_ADD. > > Got it, my initial idea comes from that we may have two insns for saturation > add, > mostly these insns need to be signed or unsigned. > For example, slt/sltu in riscv scalar. But I am not very clear about a > scenario like this. > During define_expand in backend, we hit the standard name > sat_add_3 but can we tell it is signed or not here? AFAIK, we only have > QI, HI, > SI and DI. Yeah, the way DEF_INTERNAL_SIGNED_OPTAB_FN works is that you give it two optabs, one for when it's signed and one for when it's unsigned, and the right one is picked automatically during expansion. But in GIMPLE you'd only have one IFN. 
> Maybe I will have the answer after try DEF_INTERNAL_SIGNED_OPTAB_FN, will > keep you posted. Awesome, Thanks! Tamar > > Pan > > -Original Message- > From: Tamar Christina > Sent: Monday, February 19, 2024 4:55 PM > To: Li, Pan2 ; Richard Biener > Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang > ; kito.ch...@gmail.com > Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU > > Thanks for doing this! > > > -Original Message- > > From: Li, Pan2 > > Sent: Monday, February 19, 2024 8:42 AM > > To: Richard Biener > > Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang > > ; kito.ch...@gmail.com; Tamar Christina > > > > Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU > > > > Thanks Richard for comments. > > > > > I'll note that on RTL we already have SS_PLUS/US_PLUS and friends and > > > the corresponding ssadd/usadd optabs. There's not much documentation > > > unfortunately besides the use of gen_*_fixed_libfunc usage where the > comment > > > suggests this is used for fixed-point operations. It looks like arm uses > > > fractional/accumulator modes for this but for example bfin has ssaddsi3. > > > > I find the related description about plus family in GCC internals doc but > > it doesn't > > mention > > anything about mode m here. > > > > (plus:m x y) > > (ss_plus:m x y) > > (us_plus:m x y) > > These three expressions all represent the sum of the values represented by x > > and y carried out in machine mode m. They diff er in their behavior on > > overflow > > of integer modes. plus wraps round modulo the width of m; ss_plus saturates > > at the maximum signed value representable in m; us_plus saturates at the > > maximum unsigned value. > > > > > The natural thing is to use direct optab internal functions (that's what > > > you > > > basically did, but you added a new optab, IMO without good reason). 
> > I think we should actually do an indirect optab here, because the IFN can be > used > to replace the general representation of saturating arithmetic. > > e.g. the __builtin_add_overflow case in > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600 > is inefficient on all targets and so the IFN can always expand to something > that's > more > efficient like the branchless version add_sat2. > > I think this is why you suggested a new tree code below, but we don't really > need > tree-codes for this. It can be done cleaner using the sam
RE: [PATCH]AArch64: xfail modes_1.f90 [PR107071]
> -Original Message- > From: Tamar Christina > Sent: Thursday, February 15, 2024 11:05 AM > To: Richard Earnshaw (lists) ; gcc- > patc...@gcc.gnu.org > Cc: nd ; Marcus Shawcroft ; Kyrylo > Tkachov ; Richard Sandiford > > Subject: RE: [PATCH]AArch64: xfail modes_1.f90 [PR107071] > > > -Original Message- > > From: Richard Earnshaw (lists) > > Sent: Thursday, February 15, 2024 11:01 AM > > To: Tamar Christina ; gcc-patches@gcc.gnu.org > > Cc: nd ; Marcus Shawcroft ; > Kyrylo > > Tkachov ; Richard Sandiford > > > > Subject: Re: [PATCH]AArch64: xfail modes_1.f90 [PR107071] > > > > On 15/02/2024 10:57, Tamar Christina wrote: > > > Hi All, > > > > > > This test has never worked on AArch64 since the day it was committed. It > > > has > > > a number of issues that prevent it from working on AArch64: > > > > > > 1. IEEE does not require that FP operations raise a SIGFPE for FP > > > operations, > > > only that an exception is raised somehow. > > > > > > 2. Most Arm designed cores don't raise SIGFPE and instead set a status > > > register > > > and some partner cores raise a SIGILL instead. > > > > > > 3. The way it checks for feenableexcept doesn't really work for AArch64. > > > > > > As such this test doesn't seem to really provide much value on AArch64 so > > > we > > > should just xfail it. > > > > > > Regtested on aarch64-none-linux-gnu and no issues. > > > > > > Ok for master? > > > > Wouldn't it be better to just skip the test. XFAIL just adds clutter to > > verbose > output > > and suggests that someday the tools might be fixed for this case. > > > > Better still would be a new dg-requires fp_exceptions_raise_sigfpe as a > > guard for > > the test. > It looks like this is similar to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78314 so I'll just similarly skip it. 
--- inline copy of patch --- diff --git a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 index 205c47f38007d06116289c19d6b23cf3bf83bd48..e29d8c678e6e51c3f2e5dac53c7703bb18a99ac4 100644 --- a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 +++ b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 @@ -1,5 +1,5 @@ ! { dg-do run } -! +! { dg-skip-if "PR libfortran/78314" { aarch64*-*-gnu* arm*-*-gnueabi arm*-*-gnueabihf } } ! Test IEEE_MODES_TYPE, IEEE_GET_MODES and IEEE_SET_MODES Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: PR fortran/107071 * gfortran.dg/ieee/modes_1.f90: skip aarch64, arm. rb18274.patch Description: rb18274.patch
RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU
Thanks for doing this! > -Original Message- > From: Li, Pan2 > Sent: Monday, February 19, 2024 8:42 AM > To: Richard Biener > Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang > ; kito.ch...@gmail.com; Tamar Christina > > Subject: RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU > > Thanks Richard for comments. > > > I'll note that on RTL we already have SS_PLUS/US_PLUS and friends and > > the corresponding ssadd/usadd optabs. There's not much documentation > > unfortunately besides the use of gen_*_fixed_libfunc usage where the comment > > suggests this is used for fixed-point operations. It looks like arm uses > > fractional/accumulator modes for this but for example bfin has ssaddsi3. > > I find the related description about plus family in GCC internals doc but it > doesn't > mention > anything about mode m here. > > (plus:m x y) > (ss_plus:m x y) > (us_plus:m x y) > These three expressions all represent the sum of the values represented by x > and y carried out in machine mode m. They diff er in their behavior on > overflow > of integer modes. plus wraps round modulo the width of m; ss_plus saturates > at the maximum signed value representable in m; us_plus saturates at the > maximum unsigned value. > > > The natural thing is to use direct optab internal functions (that's what you > > basically did, but you added a new optab, IMO without good reason). I think we should actually do an indirect optab here, because the IFN can be used to replace the general representation of saturating arithmetic. e.g. the __builtin_add_overflow case in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600 is inefficient on all targets and so the IFN can always expand to something that's more efficient like the branchless version add_sat2. I think this is why you suggested a new tree code below, but we don't really need tree-codes for this. It can be done cleaner using the same way as DEF_INTERNAL_INT_EXT_FN. 
> > That makes sense to me, I will try to leverage US_PLUS instead here. > > > More GIMPLE-like would be to let the types involved decide whether > > it's signed or unsigned saturation. That's actually what I'd prefer here > > and if we don't map 1:1 to optabs then instead use tree codes like > > S_PLUS_EXPR (mimicing RTL here). > > Sorry I don't get the point here for GIMPLE-like way. For the .SAT_ADDU, I > add one > restriction > like unsigned_p (type) in match.pd. Looks we have a better way here. > Richard means that there shouldn't be .SAT_ADDU and .SAT_ADDS and that the sign should be determined by the types at expansion time. i.e. there should only be .SAT_ADD. i.e. instead of this +DEF_INTERNAL_OPTAB_FN (SAT_ADDU, ECF_CONST | ECF_NOTHROW, sat_addu, binary) You should use DEF_INTERNAL_SIGNED_OPTAB_FN. Regards, Tamar > > Any other opinions? Anyone knows more about fixed-point and RTL/modes? > > AFAIK, the scalar of the riscv backend doesn't have fixed-point but the > vector does > have. They > share the same mode as vector integer. For example, RVVM1SI in vector- > iterators.md. Kito > and Juzhe can help to correct me if any misunderstandings. > > Pan > > -Original Message- > From: Richard Biener > Sent: Monday, February 19, 2024 3:36 PM > To: Li, Pan2 > Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang > ; kito.ch...@gmail.com; tamar.christ...@arm.com > Subject: Re: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU > > On Sat, Feb 17, 2024 at 11:30 AM wrote: > > > > From: Pan Li > > > > This patch would like to add the middle-end presentation for the > > unsigned saturation add. Aka set the result of add to the max > > when overflow. It will take the pattern similar as below. > > > > SAT_ADDU (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x)) > > > > Take uint8_t as example, we will have: > > > > * SAT_ADDU (1, 254) => 255. > > * SAT_ADDU (1, 255) => 255. > > * SAT_ADDU (2, 255) => 255. > > * SAT_ADDU (255, 255) => 255. 
> > > > The patch also implement the SAT_ADDU in the riscv backend as > > the sample. Given below example: > > > > uint64_t sat_add_u64 (uint64_t x, uint64_t y) > > { > > return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x)); > > } > > > > Before this patch: > > > > uint64_t sat_add_uint64_t (uint64_t x, uint64_t y) > > { > > long unsigned int _1; > > _Bool _2; > > long unsigned int _3; > > long unsigned int _4; > > uint64_t _7; > > long unsigned int _10; > > __complex__ long unsigned int _11
RE: [PATCH] aarch64: Improve PERM<{0}, a, ...> (64bit) by adding whole vector shift right [PR113872]
> -Original Message- > From: Richard Sandiford > Sent: Thursday, February 15, 2024 2:56 PM > To: Andrew Pinski > Cc: gcc-patches@gcc.gnu.org; Tamar Christina > Subject: Re: [PATCH] aarch64: Improve PERM<{0}, a, ...> (64bit) by adding > whole > vector shift right [PR113872] > > Andrew Pinski writes: > > The backend currently defines a whole vector shift left for 64bit vectors, > > adding > the > > shift right can also improve code for some PERMs too. So this adds that > > pattern. > > Is this reversed? It looks like we have the shift right and the patch is > adding the shift left (at least in GCC internal and little-endian terms). > > But on many Arm cores, EXT has a higher throughput than SHL, so I don't think > we should do this unconditionally. Yeah, on most (if not all) Arm cores the EXT has higher throughput than SHL and on Cortex-A75 the EXT has both higher throughput and lower latency. I guess the expected gain here is that we wouldn't need to create the zero vector. However on modern Arm cores the zero vector creation is free using movi and EXT being three-operand also means we only need one copy if e.g. in a loop. Kind Regards, Tamar > > Thanks, > Richard > > > > > I added a testcase for the shift left also. I also fixed the instruction > > template > > there which was using a space instead of a tab after the instruction. > > > > Built and tested on aarch64-linux-gnu. > > > > PR target/113872 > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-simd.md (vec_shr_): > Use tab instead of space after > > the instruction in the template. > > (vec_shl_): New pattern > > * config/aarch64/iterators.md (unspec): Add UNSPEC_VEC_SHL > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/perm_zero-1.c: New test. > > * gcc.target/aarch64/perm_zero-2.c: New test.
> > > > Signed-off-by: Andrew Pinski > > --- > > gcc/config/aarch64/aarch64-simd.md | 18 -- > > gcc/config/aarch64/iterators.md| 1 + > > gcc/testsuite/gcc.target/aarch64/perm_zero-1.c | 15 +++ > > gcc/testsuite/gcc.target/aarch64/perm_zero-2.c | 15 +++ > > 4 files changed, 47 insertions(+), 2 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-1.c > > create mode 100644 gcc/testsuite/gcc.target/aarch64/perm_zero-2.c > > > > diff --git a/gcc/config/aarch64/aarch64-simd.md > b/gcc/config/aarch64/aarch64-simd.md > > index f8bb973a278..0d2f1ea3902 100644 > > --- a/gcc/config/aarch64/aarch64-simd.md > > +++ b/gcc/config/aarch64/aarch64-simd.md > > @@ -1592,9 +1592,23 @@ (define_insn "vec_shr_" > >"TARGET_SIMD" > >{ > > if (BYTES_BIG_ENDIAN) > > - return "shl %d0, %d1, %2"; > > + return "shl\t%d0, %d1, %2"; > > else > > - return "ushr %d0, %d1, %2"; > > + return "ushr\t%d0, %d1, %2"; > > + } > > + [(set_attr "type" "neon_shift_imm")] > > +) > > +(define_insn "vec_shl_" > > + [(set (match_operand:VD 0 "register_operand" "=w") > > +(unspec:VD [(match_operand:VD 1 "register_operand" "w") > > + (match_operand:SI 2 "immediate_operand" "i")] > > + UNSPEC_VEC_SHL))] > > + "TARGET_SIMD" > > + { > > +if (BYTES_BIG_ENDIAN) > > + return "ushr\t%d0, %d1, %2"; > > +else > > + return "shl\t%d0, %d1, %2"; > >} > >[(set_attr "type" "neon_shift_imm")] > > ) > > diff --git a/gcc/config/aarch64/iterators.md > > b/gcc/config/aarch64/iterators.md > > index 99cde46f1ba..3aebe9cf18a 100644 > > --- a/gcc/config/aarch64/iterators.md > > +++ b/gcc/config/aarch64/iterators.md > > @@ -758,6 +758,7 @@ (define_c_enum "unspec" > > UNSPEC_PMULL; Used in aarch64-simd.md. > > UNSPEC_PMULL2 ; Used in aarch64-simd.md. > > UNSPEC_REV_REGLIST ; Used in aarch64-simd.md. > > +UNSPEC_VEC_SHL ; Used in aarch64-simd.md. > > UNSPEC_VEC_SHR ; Used in aarch64-simd.md. > > UNSPEC_SQRDMLAH ; Used in aarch64-simd.md. > > UNSPEC_SQRDMLSH ; Used in aarch64-simd.md. 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c > b/gcc/testsuite/gcc.target/aarch64/perm_zero-1.c > > new file mode 100644 > >
RE: [PATCH]AArch64: xfail modes_1.f90 [PR107071]
> -Original Message- > From: Richard Earnshaw (lists) > Sent: Thursday, February 15, 2024 11:01 AM > To: Tamar Christina ; gcc-patches@gcc.gnu.org > Cc: nd ; Marcus Shawcroft ; Kyrylo > Tkachov ; Richard Sandiford > > Subject: Re: [PATCH]AArch64: xfail modes_1.f90 [PR107071] > > On 15/02/2024 10:57, Tamar Christina wrote: > > Hi All, > > > > This test has never worked on AArch64 since the day it was committed. It > > has > > a number of issues that prevent it from working on AArch64: > > > > 1. IEEE does not require that FP operations raise a SIGFPE for FP > > operations, > > only that an exception is raised somehow. > > > > 2. Most Arm designed cores don't raise SIGFPE and instead set a status > > register > > and some partner cores raise a SIGILL instead. > > > > 3. The way it checks for feenableexcept doesn't really work for AArch64. > > > > As such this test doesn't seem to really provide much value on AArch64 so we > > should just xfail it. > > > > Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > Wouldn't it be better to just skip the test. XFAIL just adds clutter to > verbose output > and suggests that someday the tools might be fixed for this case. > > Better still would be a new dg-requires fp_exceptions_raise_sigfpe as a guard > for > the test. There seems to be check_effective_target_fenv_exceptions which seems to test for if the target can raise FP exceptions. I'll see if that works. Thanks, Tamar > > R. > > > > > Thanks, > > Tamar > > > > gcc/testsuite/ChangeLog: > > > > PR fortran/107071 > > * gfortran.dg/ieee/modes_1.f90: xfail aarch64. > > > > --- inline copy of patch -- > > diff --git a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 > b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 > > index > 205c47f38007d06116289c19d6b23cf3bf83bd48..3667571969427ae7b2b9668 > 4ec1af8b3fdd4985f 100644 > > --- a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 > > +++ b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 > > @@ -1,4 +1,4 @@ > > -! 
{ dg-do run } > > +! { dg-do run { xfail { aarch64*-*-* } } } > > ! > > ! Test IEEE_MODES_TYPE, IEEE_GET_MODES and IEEE_SET_MODES > > > > > > > > > > > > --
[PATCH]AArch64: xfail modes_1.f90 [PR107071]
Hi All, This test has never worked on AArch64 since the day it was committed. It has a number of issues that prevent it from working on AArch64: 1. IEEE does not require that FP operations raise a SIGFPE for FP operations, only that an exception is raised somehow. 2. Most Arm designed cores don't raise SIGFPE and instead set a status register and some partner cores raise a SIGILL instead. 3. The way it checks for feenableexcept doesn't really work for AArch64. As such this test doesn't seem to really provide much value on AArch64 so we should just xfail it. Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: PR fortran/107071 * gfortran.dg/ieee/modes_1.f90: xfail aarch64. --- inline copy of patch -- diff --git a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 index 205c47f38007d06116289c19d6b23cf3bf83bd48..3667571969427ae7b2b96684ec1af8b3fdd4985f 100644 --- a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 +++ b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 @@ -1,4 +1,4 @@ -! { dg-do run } +! { dg-do run { xfail { aarch64*-*-* } } } ! ! Test IEEE_MODES_TYPE, IEEE_GET_MODES and IEEE_SET_MODES -- diff --git a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 index 205c47f38007d06116289c19d6b23cf3bf83bd48..3667571969427ae7b2b96684ec1af8b3fdd4985f 100644 --- a/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 +++ b/gcc/testsuite/gfortran.dg/ieee/modes_1.f90 @@ -1,4 +1,4 @@ -! { dg-do run } +! { dg-do run { xfail { aarch64*-*-* } } } ! ! Test IEEE_MODES_TYPE, IEEE_GET_MODES and IEEE_SET_MODES
RE: [PATCH]AArch64: remove ls64 from being mandatory on armv8.7-a..
Hi, this is a new version of the patch updating some additional tests because some of the LTO tests required a newer binutils than my distro had. --- The Arm Architectural Reference Manual (Version J.a, section A2.9 on FEAT_LS64) shows that ls64 is an optional extension and should not be enabled by default for Armv8.7-a. This drops it from the mandatory bits for the architecture and brings GCC in line with LLVM and the architecture. Note that we will not be changing binutils to preserve compatibility with older released compilers. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? and backport to GCC 13,12,11? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-arches.def (AARCH64_ARCH): Remove LS64 from Armv8.7-a. gcc/testsuite/ChangeLog: * g++.target/aarch64/acle/ls64.C: Add +ls64. * g++.target/aarch64/acle/ls64_lto.C: Likewise. * gcc.target/aarch64/acle/ls64_lto.c: Likewise. * gcc.target/aarch64/acle/pr110100.c: Likewise. * gcc.target/aarch64/acle/pr110132.c: Likewise. * gcc.target/aarch64/options_set_28.c: Drop check for nols64. * gcc.target/aarch64/pragma_cpp_predefs_2.c: Correct header checks. 
--- inline copy of patch --- diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def index b7115ff7c3d4a7ee7abbedcb091ef15a7efacc79..9bec30e9203bac01155281ef3474846c402bb29e 100644 --- a/gcc/config/aarch64/aarch64-arches.def +++ b/gcc/config/aarch64/aarch64-arches.def @@ -37,7 +37,7 @@ AARCH64_ARCH("armv8.3-a", generic_armv8_a, V8_3A, 8, (V8_2A, PAUTH, R AARCH64_ARCH("armv8.4-a", generic_armv8_a, V8_4A, 8, (V8_3A, F16FML, DOTPROD, FLAGM)) AARCH64_ARCH("armv8.5-a", generic_armv8_a, V8_5A, 8, (V8_4A, SB, SSBS, PREDRES)) AARCH64_ARCH("armv8.6-a", generic_armv8_a, V8_6A, 8, (V8_5A, I8MM, BF16)) -AARCH64_ARCH("armv8.7-a", generic_armv8_a, V8_7A, 8, (V8_6A, LS64)) +AARCH64_ARCH("armv8.7-a", generic_armv8_a, V8_7A, 8, (V8_6A)) AARCH64_ARCH("armv8.8-a", generic_armv8_a, V8_8A, 8, (V8_7A, MOPS)) AARCH64_ARCH("armv8.9-a", generic_armv8_a, V8_9A, 8, (V8_8A)) AARCH64_ARCH("armv8-r", generic_armv8_a, V8R , 8, (V8_4A)) diff --git a/gcc/testsuite/g++.target/aarch64/acle/ls64.C b/gcc/testsuite/g++.target/aarch64/acle/ls64.C index d9002785b578741bde1202761f0881dc3d47e608..dcfe6f1af6711a7f3ec2562f6aabf56baecf417d 100644 --- a/gcc/testsuite/g++.target/aarch64/acle/ls64.C +++ b/gcc/testsuite/g++.target/aarch64/acle/ls64.C @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-additional-options "-march=armv8.7-a" } */ +/* { dg-additional-options "-march=armv8.7-a+ls64" } */ #include int main() { diff --git a/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C b/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C index 274a4771e1c1d13bcb1a7bdc77c2e499726f024c..0198fe2a1b78627b873bf22e3d8416dbdcc77078 100644 --- a/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C +++ b/gcc/testsuite/g++.target/aarch64/acle/ls64_lto.C @@ -1,5 +1,5 @@ /* { dg-do link { target aarch64_asm_ls64_ok } } */ -/* { dg-additional-options "-march=armv8.7-a -flto" } */ +/* { dg-additional-options "-march=armv8.7-a+ls64 -flto" } */ #include int main() { diff --git 
a/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c b/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c index 8b4f24277717675badc39dd145d365f75f5ceb27..0e5ae0b052b50b08d35151f4bc113617c1569bd3 100644 --- a/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c +++ b/gcc/testsuite/gcc.target/aarch64/acle/ls64_lto.c @@ -1,5 +1,5 @@ /* { dg-do link { target aarch64_asm_ls64_ok } } */ -/* { dg-additional-options "-march=armv8.7-a -flto" } */ +/* { dg-additional-options "-march=armv8.7-a+ls64 -flto" } */ #include int main(void) { diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c index f56d5e619e8ac23cdf720574bd6ee08fbfd36423..62a82b97c56debad092cc8fd1ed48f0219109cd7 100644 --- a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c +++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8.7-a -O2" } */ +/* { dg-options "-march=armv8.7-a+ls64 -O2" } */ #include void do_st64b(data512_t data) { __arm_st64b((void*)0x1000, data); diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c index fb88d633dd20772fd96e976a400fe52ae0bc3647..423d91b9a99f269d01d07428414ade7cc518c711 100644 --- a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c +++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-additional-options "-march=armv8.7-a" } */ +/* { dg-additional-options "-march=armv8.7-a+ls64" } */ /* Check that ls64 builtins can be invoked using a preprocesed testcase without triggering bogus builtin warnings,
RE: [PATCH]AArch64: update vget_set_lane_1.c test output
> -Original Message- > From: Richard Sandiford > Sent: Thursday, February 1, 2024 4:42 PM > To: Tamar Christina > Cc: Andrew Pinski ; gcc-patches@gcc.gnu.org; nd > ; Richard Earnshaw ; Marcus > Shawcroft ; Kyrylo Tkachov > > Subject: Re: [PATCH]AArch64: update vget_set_lane_1.c test output > > Tamar Christina writes: > >> -Original Message- > >> From: Richard Sandiford > >> Sent: Thursday, February 1, 2024 2:24 PM > >> To: Andrew Pinski > >> Cc: Tamar Christina ; gcc-patches@gcc.gnu.org; nd > >> ; Richard Earnshaw ; Marcus > >> Shawcroft ; Kyrylo Tkachov > >> > >> Subject: Re: [PATCH]AArch64: update vget_set_lane_1.c test output > >> > >> Andrew Pinski writes: > >> > On Thu, Feb 1, 2024 at 1:26 AM Tamar Christina > >> wrote: > >> >> > >> >> Hi All, > >> >> > >> >> In the vget_set_lane_1.c test the following entries now generate a zip1 > instead > >> of an INS > >> >> > >> >> BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) > >> >> BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) > >> >> BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) > >> >> > >> >> This is because the non-Q variant for indices 0 and 1 are just > >> >> shuffling values. > >> >> There is no perf difference between INS SIMD to SIMD and ZIP, as such > >> >> just > >> update the > >> >> test file. > >> > Hmm, is this true on all cores? I suspect there is a core out there > >> > where INS is implemented with a much lower latency than ZIP. > >> > If we look at config/aarch64/thunderx.md, we can see INS is 2 cycles > >> > while ZIP is 6 cycles (3/7 for q versions). > >> > Now I don't have any invested interest in that core any more but I > >> > just wanted to point out that is not exactly true for all cores. > >> > >> Thanks for the pointer. In that case, perhaps we should prefer > >> aarch64_evpc_ins over aarch64_evpc_zip in > aarch64_expand_vec_perm_const_1? > >> That's enough to fix this failure, but it'll probably require other > >> tests to be adjusted... 
> > > > I think given that Thundex-X is a 10 year old micro-architecture that is > > several > cases where > > often used instructions have very high latencies that generic codegen > > should not > be blocked > > from progressing because of it. > > > > we use zips in many things and if thunderx codegen is really of that much > importance then I > > think the old codegen should be gated behind -mcpu=thunderx rather than > preventing generic > > changes. > > But you said there was no perf difference between INS and ZIP, so it > sounds like for all known cases, using INS rather than ZIP is either > neutral or better. > > There's also the possible secondary benefit that the INS patterns use > standard RTL operations whereas the ZIP patterns use unspecs. > > Keeping ZIP seems OK there's a specific reason to prefer it over INS for > more modern cores though. Ok, that's a fair point. Doing some due diligence, Neoverse-E1 and Cortex-A65 SWoGs seem to imply that there ZIPs have better throughput than INSs. However the entries are inconsistent and I can't measure the difference so I believe this to be a documentation bug. That said, switching the operands seems to show one issue in that preferring INS degenerates code in cases where we are inserting the top bits of the first parameter into the bottom of the second parameter and returning, Zip being a Three operand instruction allows us to put the result into the final destination register with one operation whereas INS requires an fmov: foo_uzp1_s32: ins v0.s[1], v1.s[0] fmovd0, d0 ret foo_uzp2_s32: ins v1.s[0], v0.s[1] fmovd0, d1 ret I've posted uzp but zip has the same issue. So I guess it's not better to flip the order but perhaps I should add a case to the zip/unzip RTL patterns for when op0 == op1? Thanks, Tamar > > Thanks, > Richard
[PATCH]AArch64: remove ls64 from being mandatory on armv8.7-a..
Hi All, The Arm Architectural Reference Manual (Version J.a, section A2.9 on FEAT_LS64) shows that ls64 is an optional extension and should not be enabled by default for Armv8.7-a. This drops it from the mandatory bits for the architecture and brings GCC in line with LLVM and the architecture. Note that we will not be changing binutils to preserve compatibility with older released compilers. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? and backport to GCC 13,12,11? Thanks, Tamar gcc/ChangeLog: * config/aarch64/aarch64-arches.def (AARCH64_ARCH): Remove LS64 from Armv8.7-a. gcc/testsuite/ChangeLog: * g++.target/aarch64/acle/ls64.C: Add +ls64. * gcc.target/aarch64/acle/pr110100.c: Likewise. * gcc.target/aarch64/acle/pr110132.c: Likewise. * gcc.target/aarch64/options_set_28.c: Drop check for nols64. * gcc.target/aarch64/pragma_cpp_predefs_2.c: Correct header checks. --- inline copy of patch -- diff --git a/gcc/config/aarch64/aarch64-arches.def b/gcc/config/aarch64/aarch64-arches.def index b7115ff7c3d4a7ee7abbedcb091ef15a7efacc79..9bec30e9203bac01155281ef3474846c402bb29e 100644 --- a/gcc/config/aarch64/aarch64-arches.def +++ b/gcc/config/aarch64/aarch64-arches.def @@ -37,7 +37,7 @@ AARCH64_ARCH("armv8.3-a", generic_armv8_a, V8_3A, 8, (V8_2A, PAUTH, R AARCH64_ARCH("armv8.4-a", generic_armv8_a, V8_4A, 8, (V8_3A, F16FML, DOTPROD, FLAGM)) AARCH64_ARCH("armv8.5-a", generic_armv8_a, V8_5A, 8, (V8_4A, SB, SSBS, PREDRES)) AARCH64_ARCH("armv8.6-a", generic_armv8_a, V8_6A, 8, (V8_5A, I8MM, BF16)) -AARCH64_ARCH("armv8.7-a", generic_armv8_a, V8_7A, 8, (V8_6A, LS64)) +AARCH64_ARCH("armv8.7-a", generic_armv8_a, V8_7A, 8, (V8_6A)) AARCH64_ARCH("armv8.8-a", generic_armv8_a, V8_8A, 8, (V8_7A, MOPS)) AARCH64_ARCH("armv8.9-a", generic_armv8_a, V8_9A, 8, (V8_8A)) AARCH64_ARCH("armv8-r", generic_armv8_a, V8R , 8, (V8_4A)) diff --git a/gcc/testsuite/g++.target/aarch64/acle/ls64.C b/gcc/testsuite/g++.target/aarch64/acle/ls64.C index 
d9002785b578741bde1202761f0881dc3d47e608..dcfe6f1af6711a7f3ec2562f6aabf56baecf417d 100644 --- a/gcc/testsuite/g++.target/aarch64/acle/ls64.C +++ b/gcc/testsuite/g++.target/aarch64/acle/ls64.C @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-additional-options "-march=armv8.7-a" } */ +/* { dg-additional-options "-march=armv8.7-a+ls64" } */ #include int main() { diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c index f56d5e619e8ac23cdf720574bd6ee08fbfd36423..62a82b97c56debad092cc8fd1ed48f0219109cd7 100644 --- a/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c +++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110100.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=armv8.7-a -O2" } */ +/* { dg-options "-march=armv8.7-a+ls64 -O2" } */ #include void do_st64b(data512_t data) { __arm_st64b((void*)0x1000, data); diff --git a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c index fb88d633dd20772fd96e976a400fe52ae0bc3647..423d91b9a99f269d01d07428414ade7cc518c711 100644 --- a/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c +++ b/gcc/testsuite/gcc.target/aarch64/acle/pr110132.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-additional-options "-march=armv8.7-a" } */ +/* { dg-additional-options "-march=armv8.7-a+ls64" } */ /* Check that ls64 builtins can be invoked using a preprocesed testcase without triggering bogus builtin warnings, see PR110132. 
diff --git a/gcc/testsuite/gcc.target/aarch64/options_set_28.c b/gcc/testsuite/gcc.target/aarch64/options_set_28.c index 9e63768581e9d429e9408863942051b1b04761ac..d5b15f8bc5831de56fe667179d83d9c853529aaf 100644 --- a/gcc/testsuite/gcc.target/aarch64/options_set_28.c +++ b/gcc/testsuite/gcc.target/aarch64/options_set_28.c @@ -1,9 +1,9 @@ /* { dg-do compile } */ -/* { dg-additional-options "-march=armv9.3-a+nopredres+nols64+nomops" } */ +/* { dg-additional-options "-march=armv9.3-a+nopredres+nomops" } */ int main () { return 0; } -/* { dg-final { scan-assembler-times {\.arch armv9\.3\-a\+crc\+nopredres\+nols64\+nomops\n} 1 } } */ +/* { dg-final { scan-assembler-times {\.arch armv9\.3\-a\+crc\+nopredres\+nomops\n} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c index 2d76bfc23dfdcd78a74ec0e4845a3bd8d110b010..d8fc86d1557895f91ffe8be2f65d6581abe51568 100644 --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_2.c @@ -242,8 +242,8 @@ #pragma GCC push_options #pragma GCC target ("arch=armv8.7-a") -#ifndef __ARM_FEATURE_LS64 -#error "__ARM_FEATURE_LS64 is not defined but should be!" +#ifdef __ARM_FEATURE_LS64 +#error
RE: [PATCH]middle-end: inspect all exits for additional annotations for loop.
> > I think this isn't entirely good. For simple cases for > do {} while the condition ends up in the latch while for while () {} > loops it ends up in the header. In your case the latch isn't empty > so it doesn't end up with the conditional. > > I think your patch is OK to the point of looking at all loop exit > sources but you should elide the special-casing of header and > latch since it's really only exit conditionals that matter. > That makes sense, since in both cases the edges are in the respective blocks. Should have thought about it more. So how about this one. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * tree-cfg.cc (replace_loop_annotate): Inspect loop edges for annotations. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-novect_gcond.c: New test. --- inline copy of patch --- diff --git a/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c new file mode 100644 index ..01e69cbef9d51b234c08a400c78dc078d53252f1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c @@ -0,0 +1,39 @@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break_hw } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" + +#define N 306 +#define NEEDLE 136 + +int table[N]; + +__attribute__ ((noipa)) +int foo (int i, unsigned short parse_tables_n) +{ + parse_tables_n >>= 9; + parse_tables_n += 11; +#pragma GCC novector + while (i < N && parse_tables_n--) +table[i++] = 0; + + return table[NEEDLE]; +} + +int main () +{ + check_vect (); + +#pragma GCC novector + for (int j = 0; j < N; j++) +table[j] = -1; + + if (foo (0, 0x) != 0) +__builtin_abort (); + + return 0; +} diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index cdd439fe7506e7bc33654ffa027b493f23d278ac..bdffc3b4ed277724e81b7dd67fe7966e8ece0c13 
100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -320,12 +320,9 @@ replace_loop_annotate (void) for (auto loop : loops_list (cfun, 0)) { - /* First look into the header. */ - replace_loop_annotate_in_block (loop->header, loop); - - /* Then look into the latch, if any. */ - if (loop->latch) - replace_loop_annotate_in_block (loop->latch, loop); + /* Check all exit source blocks for annotations. */ + for (auto e : get_loop_exit_edges (loop)) + replace_loop_annotate_in_block (e->src, loop); /* Push the global flag_finite_loops state down to individual loops. */ loop->finite_p = flag_finite_loops; rb18267.patch Description: rb18267.patch
[PATCH]middle-end: inspect all exits for additional annotations for loop.
Hi All, Attaching a pragma to a loop which has a complex condition often gets the pragma dropped. e.g. #pragma GCC novector while (i < N && parse_tables_n--) before lowering this is represented as: if (ANNOTATE_EXPR ) ... But after lowering the condition is broken apart and attached to the final component of the expression: if (parse_tables_n.2_2 != 0) goto ; else goto ; : iftmp.1D.4452 = 1; goto ; : iftmp.1D.4452 = 0; : D.4451 = .ANNOTATE (iftmp.1D.4452, 2, 0); if (D.4451 != 0) goto ; else goto ; : and it's never heard from again because during replace_loop_annotate we only inspect the loop header and latch for annotations. Since annotations were supposed to apply to the loop as a whole this fixes it by also checking the loop exit src blocks for annotations. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * tree-cfg.cc (replace_loop_annotate): Inspect loop edges for annotations. gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-novect_gcond.c: New test. 
--- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c new file mode 100644 index ..01e69cbef9d51b234c08a400c78dc078d53252f1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c @@ -0,0 +1,39 @@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break_hw } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" + +#define N 306 +#define NEEDLE 136 + +int table[N]; + +__attribute__ ((noipa)) +int foo (int i, unsigned short parse_tables_n) +{ + parse_tables_n >>= 9; + parse_tables_n += 11; +#pragma GCC novector + while (i < N && parse_tables_n--) +table[i++] = 0; + + return table[NEEDLE]; +} + +int main () +{ + check_vect (); + +#pragma GCC novector + for (int j = 0; j < N; j++) +table[j] = -1; + + if (foo (0, 0x) != 0) +__builtin_abort (); + + return 0; +} diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index cdd439fe7506e7bc33654ffa027b493f23d278ac..a29681bffb902d2d05e3f18764ab519aacb3c5bc 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -327,6 +327,10 @@ replace_loop_annotate (void) if (loop->latch) replace_loop_annotate_in_block (loop->latch, loop); + /* Then also check all other exits. */ + for (auto e : get_loop_exit_edges (loop)) + replace_loop_annotate_in_block (e->src, loop); + /* Push the global flag_finite_loops state down to individual loops. 
*/ loop->finite_p = flag_finite_loops; } -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c new file mode 100644 index ..01e69cbef9d51b234c08a400c78dc078d53252f1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-novect_gcond.c @@ -0,0 +1,39 @@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break_hw } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" + +#define N 306 +#define NEEDLE 136 + +int table[N]; + +__attribute__ ((noipa)) +int foo (int i, unsigned short parse_tables_n) +{ + parse_tables_n >>= 9; + parse_tables_n += 11; +#pragma GCC novector + while (i < N && parse_tables_n--) +table[i++] = 0; + + return table[NEEDLE]; +} + +int main () +{ + check_vect (); + +#pragma GCC novector + for (int j = 0; j < N; j++) +table[j] = -1; + + if (foo (0, 0x) != 0) +__builtin_abort (); + + return 0; +} diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc index cdd439fe7506e7bc33654ffa027b493f23d278ac..a29681bffb902d2d05e3f18764ab519aacb3c5bc 100644 --- a/gcc/tree-cfg.cc +++ b/gcc/tree-cfg.cc @@ -327,6 +327,10 @@ replace_loop_annotate (void) if (loop->latch) replace_loop_annotate_in_block (loop->latch, loop); + /* Then also check all other exits. */ + for (auto e : get_loop_exit_edges (loop)) + replace_loop_annotate_in_block (e->src, loop); + /* Push the global flag_finite_loops state down to individual loops. */ loop->finite_p = flag_finite_loops; }
[PATCH]middle-end: update vector loop upper bounds when early break vect [PR113734]
Hi All, When doing early break vectorization we should treat the final iteration as possibly being partial. This so that when we calculate the vector loop upper bounds we take into account that final iteration could have done some work. The attached testcase shows that if we don't then cunroll may unroll the loop an if the upper bound is wrong we lose a vector iteration. This is similar to how we adjust the scalar loop bounds for the PEELED case. Bootstrapped Regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113734 * tree-vect-loop.cc (vect_transform_loop): Treat the final iteration of an early break loop as partial. gcc/testsuite/ChangeLog: PR tree-optimization/113734 * gcc.dg/vect/vect-early-break_117-pr113734.c: New test. --- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c new file mode 100644 index ..36ae09483dfd426f977a3d92cf24a78d76de6961 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c @@ -0,0 +1,37 @@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break_hw } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" + +#define N 306 +#define NEEDLE 136 + +int table[N]; + +__attribute__ ((noipa)) +int foo (int i, unsigned short parse_tables_n) +{ + parse_tables_n >>= 9; + parse_tables_n += 11; + while (i < N && parse_tables_n--) +table[i++] = 0; + + return table[NEEDLE]; +} + +int main () +{ + check_vect (); + + for (int j = 0; j < N; j++) +table[j] = -1; + + if (foo (0, 0x) != 0) +__builtin_abort (); + + return 0; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 854e9d78bc71721e6559a6bc5dff78c813603a78..0b1656fef2fed83f30295846c382ad9fb318454a 100644 --- 
a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -12171,7 +12171,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* True if the final iteration might not handle a full vector's worth of scalar iterations. */ bool final_iter_may_be_partial -= LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo); += LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo); /* The minimum number of iterations performed by the epilogue. This is 1 when peeling for gaps because we always need a final scalar iteration. */ -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c new file mode 100644 index ..36ae09483dfd426f977a3d92cf24a78d76de6961 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_117-pr113734.c @@ -0,0 +1,37 @@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break_hw } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-O3" } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" + +#define N 306 +#define NEEDLE 136 + +int table[N]; + +__attribute__ ((noipa)) +int foo (int i, unsigned short parse_tables_n) +{ + parse_tables_n >>= 9; + parse_tables_n += 11; + while (i < N && parse_tables_n--) +table[i++] = 0; + + return table[NEEDLE]; +} + +int main () +{ + check_vect (); + + for (int j = 0; j < N; j++) +table[j] = -1; + + if (foo (0, 0x) != 0) +__builtin_abort (); + + return 0; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 854e9d78bc71721e6559a6bc5dff78c813603a78..0b1656fef2fed83f30295846c382ad9fb318454a 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -12171,7 +12171,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) /* True if the final iteration might not handle a full vector's worth of scalar iterations. 
*/ bool final_iter_may_be_partial -= LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo); += LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) + || LOOP_VINFO_EARLY_BREAKS (loop_vinfo); /* The minimum number of iterations performed by the epilogue. This is 1 when peeling for gaps because we always need a final scalar iteration. */
RE: [PATCH]middle-end: add two debug counters for early-break vectorization debugging
> -Original Message- > From: Richard Biener > Sent: Thursday, February 8, 2024 2:16 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com > Subject: Re: [PATCH]middle-end: add two debug counters for early-break > vectorization debugging > > On Thu, 8 Feb 2024, Tamar Christina wrote: > > > Hi All, > > > > This adds two new debug counter to aid in debugging early break code. > > > > - vect_force_last_exit: when reached will always force the final loop exit. > > - vect_skip_exit: when reached will skip selecting the current candidate > > exit > > as the loop exit. > > > > The first counter essentially allows you to turn off the PEELED case and the > > second counter to pick a different exit, which may mean you pick no exit at > > all. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > * dbgcnt.def (vect_force_last_exit, vect_skip_exit): New. > > * tree-vect-loop.cc (vec_init_loop_exit_info): Use them. > > > > --- inline copy of patch -- > > diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def > > index > ed9f062eac2c28c52df76b39d4312dd9fde1c800..8f7bebf93fceabdf6ae86c2df5 > 91eae4848b8a5c 100644 > > --- a/gcc/dbgcnt.def > > +++ b/gcc/dbgcnt.def > > @@ -213,5 +213,7 @@ DEBUG_COUNTER (stv_conversion) > > DEBUG_COUNTER (tail_call) > > DEBUG_COUNTER (tree_sra) > > DEBUG_COUNTER (treepre_insert) > > +DEBUG_COUNTER (vect_force_last_exit) > > DEBUG_COUNTER (vect_loop) > > +DEBUG_COUNTER (vect_skip_exit) > > DEBUG_COUNTER (vect_slp) > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index > eed2268e9bae7e7ad36d13da03e0b54eab26ef6f..854e9d78bc71721e6559a6bc > 5dff78c813603a78 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -58,6 +58,7 @@ along with GCC; see the file COPYING3. 
If not see > > #include "tree-eh.h" > > #include "case-cfn-macros.h" > > #include "langhooks.h" > > +#include "dbgcnt.h" > > > > /* Loop Vectorization Pass. > > > > @@ -977,6 +978,20 @@ vec_init_loop_exit_info (class loop *loop) > >if (exits.length () == 1) > > return exits[0]; > > > > + /* Check to see if we've been asked to force the last exit. */ > > + if (!dbg_cnt (vect_force_last_exit)) > > +{ > > + basic_block bb = ip_normal_pos (loop); > > + if (!bb) > > + return NULL; > > + > > + edge exit = EDGE_SUCC (bb, 0); > > + if (exit->dest == loop->latch) > > + return EDGE_SUCC (bb, 1); > > + > > + return exit; > > Err, that's quite odd. Why not just below do > > > +} > > + > >/* If we have multiple exits we only support counting IV at the moment. > > Analyze all exits and return the last one we can analyze. */ > >class tree_niter_desc niter_desc; > > @@ -998,6 +1013,7 @@ vec_init_loop_exit_info (class loop *loop) > >&& exit->src == single_pred (loop->latch) > >&& (integer_nonzerop (may_be_zero) > >|| COMPARISON_CLASS_P (may_be_zero > > + && dbg_cnt (vect_skip_exit) > > && (dbg_cnt (vect_force_last_exit) > || exit->src == single_pred (loop->latch)) > > (also computed above already)? It's also oddly named, it's more like > vect_allow_peeled_exit or so. Because this isn't deterministic. If a loop has n exits the above always forces you to pick the final one regardless of n, rather than just skip consideration of an exit. And in that case is there a point in analyzing all the exits just to throw away the information? Doing in inside the consideration check would only skip one exit unless I'm misunderstanding. > > It's also seemingly redundant with vect_skip_exit, no? > > Note the counter gets incremented even if we'd not consider the exit > because we have a later candidate already. > > I fear it's going to be quite random even with the debug counter. It is, I think the first counter is more useful. 
But in general the reason I kept the second counter which kinda does what was suggested in the RFC I sent before was that it should in theory at least allow us to test forcing of a PEELED case. Since we generally prefer the non-PEELED case if possible. At least that was the intention. Thanks, Tamar > > Can you see whether it really helps you? > > > && (!candidate > > || dominated_by_p (CDI_DOMINATORS, exit->src, > > candidate->src))) > > > > > > > > > > > > -- > Richard Biener > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
[PATCH]middle-end: add two debug counters for early-break vectorization debugging
Hi All, This adds two new debug counter to aid in debugging early break code. - vect_force_last_exit: when reached will always force the final loop exit. - vect_skip_exit: when reached will skip selecting the current candidate exit as the loop exit. The first counter essentially allows you to turn off the PEELED case and the second counter to pick a different exit, which may mean you pick no exit at all. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: * dbgcnt.def (vect_force_last_exit, vect_skip_exit): New. * tree-vect-loop.cc (vec_init_loop_exit_info): Use them. --- inline copy of patch -- diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def index ed9f062eac2c28c52df76b39d4312dd9fde1c800..8f7bebf93fceabdf6ae86c2df591eae4848b8a5c 100644 --- a/gcc/dbgcnt.def +++ b/gcc/dbgcnt.def @@ -213,5 +213,7 @@ DEBUG_COUNTER (stv_conversion) DEBUG_COUNTER (tail_call) DEBUG_COUNTER (tree_sra) DEBUG_COUNTER (treepre_insert) +DEBUG_COUNTER (vect_force_last_exit) DEBUG_COUNTER (vect_loop) +DEBUG_COUNTER (vect_skip_exit) DEBUG_COUNTER (vect_slp) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index eed2268e9bae7e7ad36d13da03e0b54eab26ef6f..854e9d78bc71721e6559a6bc5dff78c813603a78 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -58,6 +58,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-eh.h" #include "case-cfn-macros.h" #include "langhooks.h" +#include "dbgcnt.h" /* Loop Vectorization Pass. @@ -977,6 +978,20 @@ vec_init_loop_exit_info (class loop *loop) if (exits.length () == 1) return exits[0]; + /* Check to see if we've been asked to force the last exit. */ + if (!dbg_cnt (vect_force_last_exit)) +{ + basic_block bb = ip_normal_pos (loop); + if (!bb) + return NULL; + + edge exit = EDGE_SUCC (bb, 0); + if (exit->dest == loop->latch) + return EDGE_SUCC (bb, 1); + + return exit; +} + /* If we have multiple exits we only support counting IV at the moment. 
Analyze all exits and return the last one we can analyze. */ class tree_niter_desc niter_desc; @@ -998,6 +1013,7 @@ vec_init_loop_exit_info (class loop *loop) && exit->src == single_pred (loop->latch) && (integer_nonzerop (may_be_zero) || COMPARISON_CLASS_P (may_be_zero + && dbg_cnt (vect_skip_exit) && (!candidate || dominated_by_p (CDI_DOMINATORS, exit->src, candidate->src))) -- diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def index ed9f062eac2c28c52df76b39d4312dd9fde1c800..8f7bebf93fceabdf6ae86c2df591eae4848b8a5c 100644 --- a/gcc/dbgcnt.def +++ b/gcc/dbgcnt.def @@ -213,5 +213,7 @@ DEBUG_COUNTER (stv_conversion) DEBUG_COUNTER (tail_call) DEBUG_COUNTER (tree_sra) DEBUG_COUNTER (treepre_insert) +DEBUG_COUNTER (vect_force_last_exit) DEBUG_COUNTER (vect_loop) +DEBUG_COUNTER (vect_skip_exit) DEBUG_COUNTER (vect_slp) diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index eed2268e9bae7e7ad36d13da03e0b54eab26ef6f..854e9d78bc71721e6559a6bc5dff78c813603a78 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -58,6 +58,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-eh.h" #include "case-cfn-macros.h" #include "langhooks.h" +#include "dbgcnt.h" /* Loop Vectorization Pass. @@ -977,6 +978,20 @@ vec_init_loop_exit_info (class loop *loop) if (exits.length () == 1) return exits[0]; + /* Check to see if we've been asked to force the last exit. */ + if (!dbg_cnt (vect_force_last_exit)) +{ + basic_block bb = ip_normal_pos (loop); + if (!bb) + return NULL; + + edge exit = EDGE_SUCC (bb, 0); + if (exit->dest == loop->latch) + return EDGE_SUCC (bb, 1); + + return exit; +} + /* If we have multiple exits we only support counting IV at the moment. Analyze all exits and return the last one we can analyze. 
*/ class tree_niter_desc niter_desc; @@ -998,6 +1013,7 @@ vec_init_loop_exit_info (class loop *loop) && exit->src == single_pred (loop->latch) && (integer_nonzerop (may_be_zero) || COMPARISON_CLASS_P (may_be_zero)) + && dbg_cnt (vect_skip_exit) && (!candidate || dominated_by_p (CDI_DOMINATORS, exit->src, candidate->src)))
RE: [PATCH]middle-end: don't cache restart_loop in vectorizable_live_operations [PR113808]
> Please either drop lastprivate(k) clause or use linear(k:1) > The iteration var of simd loop without collapse or with > collapse(1) is implicitly linear with the step, and even linear > means the value from the last iteration can be used after the > simd construct. Overriding the data sharing to something different > has been only added recently to OpenMP and isn't really needed here. > Sorry I know very little about fortran, is this ok? Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: PR tree-optimization/113808 * gfortran.dg/vect/vect-early-break_1-PR113808.f90: Moved to... * gfortran.dg/vect/vect-early-break_1-pr113808.f90: ...here. --- inline copy of patch --- diff --git a/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-pr113808.f90 similarity index 93% rename from gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 rename to gcc/testsuite/gfortran.dg/vect/vect-early-break_1-pr113808.f90 index 5c339fa7a348fac5527bbbf456a535da96b5c1ed..6f92e9095bdee08a5a9db2816f57da6c14d91b11 100644 --- a/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 +++ b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-pr113808.f90 @@ -9,7 +9,7 @@ program main integer :: n, i,k n = 11 do i = 1, n,2 -!$omp simd lastprivate(k) +!$omp simd do k = 1, i + 41 if (k > 11 + 41 .or. k < 1) error stop end do rb18253.patch Description: rb18253.patch
[PATCH]middle-end: don't cache restart_loop in vectorizable_live_operations [PR113808]
Hi All, There's a bug in vectorizable_live_operation that restart_loop is defined outside the loop. This variable is supposed to indicate whether we are doing a first or last index reduction. The problem is that by defining it outside the loop it becomes dependent on the order we visit the USE/DEFs. In the given example, the loop isn't PEELED, but we visit the early exit uses first. This then sets the boolean to true and it can't get to false again. So when we visit the main exit we still treat it as an early exit for that SSA name. This cleans it up and renames the variables to something that's hopefully clearer to their intention. Bootstrapped Regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113808 * tree-vect-loop.cc (vectorizable_live_operation): Don't cache the value cross iterations. gcc/testsuite/ChangeLog: PR tree-optimization/113808 * gfortran.dg/vect/vect-early-break_1-PR113808.f90: New test. --- inline copy of patch -- diff --git a/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 new file mode 100644 index ..5c339fa7a348fac5527bbbf456a535da96b5c1ed --- /dev/null +++ b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 @@ -0,0 +1,21 @@ +! { dg-add-options vect_early_break } +! { dg-require-effective-target vect_early_break } +! { dg-require-effective-target vect_long_long } +! { dg-additional-options "-fopenmp-simd" } + +! { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } + +program main + integer :: n, i,k + n = 11 + do i = 1, n,2 +!$omp simd lastprivate(k) +do k = 1, i + 41 + if (k > 11 + 41 .or. 
k < 1) error stop +end do + end do + if (k /= 53) then +print *, k, 53 +error stop + endif +end diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 190df9ec7741fd05aa0b9abe150baf06b2ca9a57..eed2268e9bae7e7ad36d13da03e0b54eab26ef6f 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10950,7 +10950,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, did. For the live values we want the value at the start of the iteration rather than at the end. */ edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo); - bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo); + bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo); FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) if (!is_gimple_debug (use_stmt) && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) @@ -10966,8 +10966,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, /* For early exit where the exit is not in the BB that leads to the latch then we're restarting the iteration in the scalar loop. So get the first live value. */ - restart_loop = restart_loop || !main_exit_edge; - if (restart_loop + if ((all_exits_as_early_p || !main_exit_edge) && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) { tmp_vec_lhs = vec_lhs0; -- diff --git a/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 new file mode 100644 index ..5c339fa7a348fac5527bbbf456a535da96b5c1ed --- /dev/null +++ b/gcc/testsuite/gfortran.dg/vect/vect-early-break_1-PR113808.f90 @@ -0,0 +1,21 @@ +! { dg-add-options vect_early_break } +! { dg-require-effective-target vect_early_break } +! { dg-require-effective-target vect_long_long } +! { dg-additional-options "-fopenmp-simd" } + +! { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } + +program main + integer :: n, i,k + n = 11 + do i = 1, n,2 +!$omp simd lastprivate(k) +do k = 1, i + 41 + if (k > 11 + 41 .or. 
k < 1) error stop +end do + end do + if (k /= 53) then +print *, k, 53 +error stop + endif +end diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 190df9ec7741fd05aa0b9abe150baf06b2ca9a57..eed2268e9bae7e7ad36d13da03e0b54eab26ef6f 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -10950,7 +10950,7 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info, did. For the live values we want the value at the start of the iteration rather than at the end. */ edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo); - bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo); + bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo); FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) if (!is_gimple_debug (use_stmt) && !flow_bb_inside_loop_p
[PATCH][committed]middle-end: fix pointer conversion error in testcase vect-early-break_110-pr113467.c
Hi All, I had missed a conversion from unsigned long to uint64_t. This fixes the failing test on -m32. Regtested on x86_64-pc-linux-gnu with -m32 and no issues. Committed as obvious. Thanks, Tamar gcc/testsuite/ChangeLog: * gcc.dg/vect/vect-early-break_110-pr113467.c: Change unsigned long * to uint64_t *. --- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c index 1e2c47be5fdf1e1fed88e4b5f45d7eda6c3b85d1..12d0ea1e871b51742c040c909ea5741bc820206e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c @@ -10,7 +10,7 @@ typedef struct gcry_mpi *gcry_mpi_t; struct gcry_mpi { int nlimbs; - unsigned long *d; + uint64_t *d; }; long gcry_mpi_add_ui_up; -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c index 1e2c47be5fdf1e1fed88e4b5f45d7eda6c3b85d1..12d0ea1e871b51742c040c909ea5741bc820206e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c @@ -10,7 +10,7 @@ typedef struct gcry_mpi *gcry_mpi_t; struct gcry_mpi { int nlimbs; - unsigned long *d; + uint64_t *d; }; long gcry_mpi_add_ui_up;
RE: [PATCH]middle-end: fix ICE when moving statements to empty BB [PR113731]
> It looks like LOOP_VINFO_EARLY_BRK_STORES is "reverse"? Is that > why you are doing gsi_move_before + gsi_prev? Why do gsi_prev > at all? > As discussed on IRC, then how about this one. Incremental building passed all tests and bootstrap is running. Ok for master if bootstrap and regtesting clean? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113731 * gimple-iterator.cc (gsi_move_before): Take new parameter for update method. * gimple-iterator.h (gsi_move_before): Default new param to GSI_SAME_STMT. * tree-vect-loop.cc (move_early_exit_stmts): Call gsi_move_before with GSI_NEW_STMT. gcc/testsuite/ChangeLog: PR tree-optimization/113731 * gcc.dg/vect/vect-early-break_111-pr113731.c: New test. --- inline copy of patch --- diff --git a/gcc/gimple-iterator.cc b/gcc/gimple-iterator.cc index 517c53376f0511af59e124f52ec7be566a6c4789..f67bcfbfdfdd7c6cb0ad0130972f5b1dc4429bcf 100644 --- a/gcc/gimple-iterator.cc +++ b/gcc/gimple-iterator.cc @@ -666,10 +666,11 @@ gsi_move_after (gimple_stmt_iterator *from, gimple_stmt_iterator *to) /* Move the statement at FROM so it comes right before the statement - at TO. */ + at TO using method M. */ void -gsi_move_before (gimple_stmt_iterator *from, gimple_stmt_iterator *to) +gsi_move_before (gimple_stmt_iterator *from, gimple_stmt_iterator *to, +gsi_iterator_update m = GSI_SAME_STMT) { gimple *stmt = gsi_stmt (*from); gsi_remove (from, false); @@ -677,7 +678,7 @@ gsi_move_before (gimple_stmt_iterator *from, gimple_stmt_iterator *to) /* For consistency with gsi_move_after, it might be better to have GSI_NEW_STMT here; however, that breaks several places that expect that TO does not change. 
*/ - gsi_insert_before (to, stmt, GSI_SAME_STMT); + gsi_insert_before (to, stmt, m); } diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c new file mode 100644 index ..2d6db91df97625a7f11609d034e89af0461129b2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +char* inet_net_pton_ipv4_bits; +char inet_net_pton_ipv4_odst; +void __errno_location(); +void inet_net_pton_ipv4(); +void inet_net_pton() { inet_net_pton_ipv4(); } +void inet_net_pton_ipv4(char *dst, int size) { + while ((inet_net_pton_ipv4_bits > dst) & inet_net_pton_ipv4_odst) { +if (size-- <= 0) + goto emsgsize; +*dst++ = '\0'; + } +emsgsize: + __errno_location(); +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 30b90d99925bea74caf14833d8ab1695607d0fe9..9aba94bd6ca2061a19487ac4a2735a16d03bcbee 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -11800,8 +11800,7 @@ move_early_exit_stmts (loop_vec_info loop_vinfo) dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt); gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt); - gsi_move_before (_gsi, _gsi); - gsi_prev (_gsi); + gsi_move_before (_gsi, _gsi, GSI_NEW_STMT); } /* Update all the stmts with their new reaching VUSES. */ rb18247.patch Description: rb18247.patch
RE: [PATCH]middle-end: add additional runtime test for [PR113467]
> > Ok for master? > > I think you need a lp64 target check for the large constants or > alternatively use uint64_t? > Ok, how about this one. Regtested on x86_64-pc-linux-gnu with -m32,-m64 and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: PR tree-optimization/113467 * gcc.dg/vect/vect-early-break_110-pr113467.c: New test. --- inline copy of patch --- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c new file mode 100644 index ..1e2c47be5fdf1e1fed88e4b5f45d7eda6c3b85d1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c @@ -0,0 +1,52 @@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_long_long } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" +#include + +typedef struct gcry_mpi *gcry_mpi_t; +struct gcry_mpi { + int nlimbs; + unsigned long *d; +}; + +long gcry_mpi_add_ui_up; +void gcry_mpi_add_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned v) { + gcry_mpi_add_ui_up = *w->d; + if (u) { +uint64_t *res_ptr = w->d, *s1_ptr = w->d; +int s1_size = u->nlimbs; +unsigned s2_limb = v, x = *s1_ptr++; +s2_limb += x; +*res_ptr++ = s2_limb; +if (x) + while (--s1_size) { +x = *s1_ptr++ + 1; +*res_ptr++ = x; +if (x) { + break; +} + } + } +} + +int main() +{ + check_vect (); + + static struct gcry_mpi sv; + static uint64_t vals[] = {4294967288ULL, 191ULL,4160749568ULL, 4294963263ULL, +127ULL,4294950912ULL, 255ULL, 4294901760ULL, +534781951ULL, 33546240ULL, 4294967292ULL, 4294960127ULL, +4292872191ULL, 4294967295ULL, 4294443007ULL, 3ULL}; + gcry_mpi_t v = + v->nlimbs = 16; + v->d = vals; + + gcry_mpi_add_ui(v, v, 8); + if (v->d[1] != 192) +__builtin_abort(); +} rb18246.patch Description: rb18246.patch
RE: [PATCH]middle-end: fix ICE when moving statements to empty BB [PR113731]
> -Original Message- > From: Richard Biener > Sent: Monday, February 5, 2024 1:22 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com > Subject: Re: [PATCH]middle-end: fix ICE when moving statements to empty BB > [PR113731] > > On Mon, 5 Feb 2024, Tamar Christina wrote: > > > Hi All, > > > > We use gsi_move_before (_gsi, _gsi); to request that the new > statement > > be placed before any other statement. Typically this then moves the current > > pointer to be after the statement we just inserted. > > > > However it looks like when the BB is empty, this does not happen and the CUR > > pointer stays NULL. There's a comment in the source of gsi_insert_before > > that > > explains: > > > > /* If CUR is NULL, we link at the end of the sequence (this case happens > > > > so it adds it to the end instead of start like you asked. This means that > > in > > this case there's nothing to move and so we shouldn't move the pointer if > > we're > > already at the HEAD. > > The issue is that a gsi_end_p () is ambiguous, it could be the start > or the end. gsi_insert_before treats it as "end" while gsi_insert_after > treats it as "start" since you can't really insert "after" the "end". > > gsi_move_before doesn't update the insertion pointer (using > GSI_SAME_STMT), so with a gsi_end_p () you get what you ask for. > > Btw, > > /* Move all stmts that need moving. */ > basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo); > gimple_stmt_iterator dest_gsi = gsi_start_bb (dest_bb); > > should probably use gsi_after_labels (dest_bb) just in case. See next patch. > > It looks like LOOP_VINFO_EARLY_BRK_STORES is "reverse"? Is that > why you are doing gsi_move_before + gsi_prev? Why do gsi_prev > at all? > Yes, it stores them reverse because we record them from the latch on up. So we either have to iterate backwards, insert them to the front or move gsi. I guess I could remove it by removing the for-each loop and iterating in reverse. 
Is that preferred? Tamar. > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > > > Thanks, > > Tamar > > > > gcc/ChangeLog: > > > > PR tree-optimization/113731 > > * tree-vect-loop.cc (move_early_exit_stmts): Conditionally move pointer. > > > > gcc/testsuite/ChangeLog: > > > > PR tree-optimization/113731 > > * gcc.dg/vect/vect-early-break_111-pr113731.c: New test. > > > > --- inline copy of patch -- > > diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c > b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c > > new file mode 100644 > > index > ..2d6db91df97625a7f1160 > 9d034e89af0461129b2 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c > > @@ -0,0 +1,21 @@ > > +/* { dg-do compile } */ > > +/* { dg-add-options vect_early_break } */ > > +/* { dg-require-effective-target vect_early_break } */ > > +/* { dg-require-effective-target vect_int } */ > > + > > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > > + > > +char* inet_net_pton_ipv4_bits; > > +char inet_net_pton_ipv4_odst; > > +void __errno_location(); > > +void inet_net_pton_ipv4(); > > +void inet_net_pton() { inet_net_pton_ipv4(); } > > +void inet_net_pton_ipv4(char *dst, int size) { > > + while ((inet_net_pton_ipv4_bits > dst) & inet_net_pton_ipv4_odst) { > > +if (size-- <= 0) > > + goto emsgsize; > > +*dst++ = '\0'; > > + } > > +emsgsize: > > + __errno_location(); > > +} > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index > 30b90d99925bea74caf14833d8ab1695607d0fe9..e2587315020a35a7d4ebd3e > 7a9842caa36bb5d3c 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -11801,7 +11801,8 @@ move_early_exit_stmts (loop_vec_info loop_vinfo) > > > >gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt); > >gsi_move_before (_gsi, _gsi); > > - gsi_prev (_gsi); > > + if (!gsi_end_p (dest_gsi)) > > + gsi_prev (_gsi); > > } > > > >/* Update all the stmts with their 
new reaching VUSES. */ > > > > > > > > > > > > -- > Richard Biener > SUSE Software Solutions Germany GmbH, > Frankenstrasse 146, 90461 Nuernberg, Germany; > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
[PATCH]middle-end: fix ICE when destination BB for stores starts with a label [PR113750]
Hi All, The report shows that if the FE leaves a label as the first thing in the dest BB then we ICE because we move the stores before the label. This is easy to fix if we know that there's still only one way into the BB. We would have already rejected the loop if there was multiple paths into the BB however I added an additional check just for early break in case the other constraints are relaxed later with an explanation. After that we fix the issue just by getting the GSI after the labels and I add a bunch of testcases for different positions the label can be added. Only the vect-early-break_112-pr113750.c one results in the label being kept. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113750 * tree-vect-data-refs.cc (vect_analyze_early_break_dependences): Check for single predecessor when doing early break vect. * tree-vect-loop.cc (move_early_exit_stmts): Get gsi at the start but after labels. gcc/testsuite/ChangeLog: PR tree-optimization/113750 * gcc.dg/vect/vect-early-break_112-pr113750.c: New test. * gcc.dg/vect/vect-early-break_113-pr113750.c: New test. * gcc.dg/vect/vect-early-break_114-pr113750.c: New test. * gcc.dg/vect/vect-early-break_115-pr113750.c: New test. * gcc.dg/vect/vect-early-break_116-pr113750.c: New test. 
--- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_112-pr113750.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_112-pr113750.c new file mode 100644 index ..559ebd84d5c39881e694e7c8c31be29d846866ed --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_112-pr113750.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 800 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] != x) + break; +foo: + vect_a[i] = x; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_113-pr113750.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_113-pr113750.c new file mode 100644 index ..ba85780a46b1378aaec238ff9eb5f906be9a44dd --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_113-pr113750.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 800 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; + if (vect_a[i] != x) + break; + vect_a[i] = x; +foo: + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_114-pr113750.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_114-pr113750.c new file mode 100644 index ..37af2998688f5d60e2cdb372ab43afcaa52a3146 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_114-pr113750.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { 
dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 800 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { + vect_b[i] = x + i; +foo: + if (vect_a[i] != x) + break; + vect_a[i] = x; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_115-pr113750.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_115-pr113750.c new file mode 100644 index ..502686d308e298cd84e9e3b74d7b4ad1979602a9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_115-pr113750.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +#ifndef N +#define N 800 +#endif +unsigned vect_a[N]; +unsigned vect_b[N]; + +unsigned test4(unsigned x) +{ + unsigned ret = 0; + for (int i = 0; i < N; i++) + { +foo: + vect_b[i] = x + i; + if (vect_a[i] != x) + break; + vect_a[i] = x; + } + return ret; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_116-pr113750.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_116-pr113750.c new file mode
[PATCH]middle-end: fix ICE when moving statements to empty BB [PR113731]
Hi All, We use gsi_move_before (_gsi, _gsi); to request that the new statement be placed before any other statement. Typically this then moves the current pointer to be after the statement we just inserted. However it looks like when the BB is empty, this does not happen and the CUR pointer stays NULL. There's a comment in the source of gsi_insert_before that explains: /* If CUR is NULL, we link at the end of the sequence (this case happens so it adds it to the end instead of start like you asked. This means that in this case there's nothing to move and so we shouldn't move the pointer if we're already at the HEAD. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113731 * tree-vect-loop.cc (move_early_exit_stmts): Conditionally move pointer. gcc/testsuite/ChangeLog: PR tree-optimization/113731 * gcc.dg/vect/vect-early-break_111-pr113731.c: New test. --- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c new file mode 100644 index ..2d6db91df97625a7f11609d034e89af0461129b2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +char* inet_net_pton_ipv4_bits; +char inet_net_pton_ipv4_odst; +void __errno_location(); +void inet_net_pton_ipv4(); +void inet_net_pton() { inet_net_pton_ipv4(); } +void inet_net_pton_ipv4(char *dst, int size) { + while ((inet_net_pton_ipv4_bits > dst) & inet_net_pton_ipv4_odst) { +if (size-- <= 0) + goto emsgsize; +*dst++ = '\0'; + } +emsgsize: + __errno_location(); +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 
30b90d99925bea74caf14833d8ab1695607d0fe9..e2587315020a35a7d4ebd3e7a9842caa36bb5d3c 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -11801,7 +11801,8 @@ move_early_exit_stmts (loop_vec_info loop_vinfo) gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt); gsi_move_before (_gsi, _gsi); - gsi_prev (_gsi); + if (!gsi_end_p (dest_gsi)) + gsi_prev (_gsi); } /* Update all the stmts with their new reaching VUSES. */ -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c new file mode 100644 index ..2d6db91df97625a7f11609d034e89af0461129b2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_111-pr113731.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ + +char* inet_net_pton_ipv4_bits; +char inet_net_pton_ipv4_odst; +void __errno_location(); +void inet_net_pton_ipv4(); +void inet_net_pton() { inet_net_pton_ipv4(); } +void inet_net_pton_ipv4(char *dst, int size) { + while ((inet_net_pton_ipv4_bits > dst) & inet_net_pton_ipv4_odst) { +if (size-- <= 0) + goto emsgsize; +*dst++ = '\0'; + } +emsgsize: + __errno_location(); +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 30b90d99925bea74caf14833d8ab1695607d0fe9..e2587315020a35a7d4ebd3e7a9842caa36bb5d3c 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -11801,7 +11801,8 @@ move_early_exit_stmts (loop_vec_info loop_vinfo) gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt); gsi_move_before (_gsi, _gsi); - gsi_prev (_gsi); + if (!gsi_end_p (dest_gsi)) + gsi_prev (_gsi); } /* Update all the stmts with their new reaching VUSES. */
[PATCH]middle-end: add additional runtime test for [PR113467]
Hi All, This just adds an additional runtime testcase for the fixed issue. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: PR tree-optimization/113467 * gcc.dg/vect/vect-early-break_110-pr113467.c: New test. --- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c new file mode 100644 index ..2d8a071c0e922ccfd5fa8c7b2704852dbd95 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c @@ -0,0 +1,51 @@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" + +typedef struct gcry_mpi *gcry_mpi_t; +struct gcry_mpi { + int nlimbs; + unsigned long *d; +}; + +long gcry_mpi_add_ui_up; +void gcry_mpi_add_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned v) { + gcry_mpi_add_ui_up = *w->d; + if (u) { +unsigned long *res_ptr = w->d, *s1_ptr = w->d; +int s1_size = u->nlimbs; +unsigned s2_limb = v, x = *s1_ptr++; +s2_limb += x; +*res_ptr++ = s2_limb; +if (x) + while (--s1_size) { +x = *s1_ptr++ + 1; +*res_ptr++ = x; +if (x) { + break; +} + } + } +} + +int main() +{ + check_vect (); + + static struct gcry_mpi sv; + static unsigned long vals[] = {4294967288, 191,4160749568, 4294963263, + 127,4294950912, 255, 4294901760, + 534781951, 33546240, 4294967292, 4294960127, + 4292872191, 4294967295, 4294443007, 3}; + gcry_mpi_t v = + v->nlimbs = 16; + v->d = vals; + + gcry_mpi_add_ui(v, v, 8); + if (v->d[1] != 192) +__builtin_abort(); +} -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c new file mode 100644 index ..2d8a071c0e922ccfd5fa8c7b2704852dbd95 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_110-pr113467.c @@ -0,0 +1,51 
@@ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include "tree-vect.h" + +typedef struct gcry_mpi *gcry_mpi_t; +struct gcry_mpi { + int nlimbs; + unsigned long *d; +}; + +long gcry_mpi_add_ui_up; +void gcry_mpi_add_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned v) { + gcry_mpi_add_ui_up = *w->d; + if (u) { +unsigned long *res_ptr = w->d, *s1_ptr = w->d; +int s1_size = u->nlimbs; +unsigned s2_limb = v, x = *s1_ptr++; +s2_limb += x; +*res_ptr++ = s2_limb; +if (x) + while (--s1_size) { +x = *s1_ptr++ + 1; +*res_ptr++ = x; +if (x) { + break; +} + } + } +} + +int main() +{ + check_vect (); + + static struct gcry_mpi sv; + static unsigned long vals[] = {4294967288, 191,4160749568, 4294963263, + 127,4294950912, 255, 4294901760, + 534781951, 33546240, 4294967292, 4294960127, + 4292872191, 4294967295, 4294443007, 3}; + gcry_mpi_t v = &sv; + v->nlimbs = 16; + v->d = vals; + + gcry_mpi_add_ui(v, v, 8); + if (v->d[1] != 192) +__builtin_abort(); +}
RE: [PATCH]middle-end: check memory accesses in the destination block [PR113588].
> > > > If the above is correct then I think I understand what you're saying and > > will update the patch and do some Checks. > > Yes, I think that's what I wanted to say. > As discussed: Bootstrapped Regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu no issues. Also checked both with --enable-lto --with-build-config='bootstrap-O3 bootstrap-lto' --enable-multilib and --enable-lto --with-build-config=bootstrap-O3 --enable-checking=release,yes,rtl,extra; and checked the libcrypt testsuite as reported on PR113467. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113588 PR tree-optimization/113467 (vect_analyze_data_ref_dependence): Choose correct dest and fix checks. (vect_analyze_early_break_dependences): Update comments. gcc/testsuite/ChangeLog: PR tree-optimization/113588 PR tree-optimization/113467 * gcc.dg/vect/vect-early-break_108-pr113588.c: New test. * gcc.dg/vect/vect-early-break_109-pr113588.c: New test. --- inline copy of patch --- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c new file mode 100644 index ..e488619c9aac41fafbcf479818392a6bb7c6924f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +int foo (const char *s, unsigned long n) +{ + unsigned long len = 0; + while (*s++ && n--) + ++len; + return len; +} + diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c new file mode 100644 index ..488c19d3ede809631d1a7ede0e7f7bcdc7a1ae43 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c @@ -0,0 +1,44 @@ +/* { dg-add-options vect_early_break } */ +/* { 
dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target mmap } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include +#include + +#include "tree-vect.h" + +__attribute__((noipa)) +int foo (const char *s, unsigned long n) +{ + unsigned long len = 0; + while (*s++ && n--) + ++len; + return len; +} + +int main() +{ + + check_vect (); + + long pgsz = sysconf (_SC_PAGESIZE); + void *p = mmap (NULL, pgsz * 3, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); + if (p == MAP_FAILED) +return 0; + mprotect (p, pgsz, PROT_NONE); + mprotect (p+2*pgsz, pgsz, PROT_NONE); + char *p1 = p + pgsz; + p1[0] = 1; + p1[1] = 0; + foo (p1, 1000); + p1 = p + 2*pgsz - 2; + p1[0] = 1; + p1[1] = 0; + foo (p1, 1000); + return 0; +} + diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index f592aeb8028afd4fd70e2175104efab2a2c0d82e..53fdfc25d7dc2deb7788176252697d2e45fc 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -619,10 +619,10 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, return opt_result::success (); } -/* Funcion vect_analyze_early_break_dependences. +/* Function vect_analyze_early_break_dependences. - Examime all the data references in the loop and make sure that if we have - mulitple exits that we are able to safely move stores such that they become + Examine all the data references in the loop and make sure that if we have + multiple exits that we are able to safely move stores such that they become safe for vectorization. The function also calculates the place where to move the instructions to and computes what the new vUSE chain should be. @@ -639,7 +639,7 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, - Multiple loads are allowed as long as they don't alias. NOTE: - This implemementation is very conservative. 
Any overlappig loads/stores + This implementation is very conservative. Any overlapping loads/stores that take place before the early break statement gets rejected aside from WAR dependencies. @@ -668,7 +668,6 @@ vect_analyze_early_break_dependences (loop_vec_info loop_vinfo) auto_vec bases; basic_block dest_bb = NULL; - hash_set visited; class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); class loop *loop_nest = loop_outer (loop); @@ -677,19 +676,33 @@ vect_analyze_early_break_dependences (loop_vec_info loop_vinfo) "loop contains multiple exits, analyzing" " statement dependencies.\n"); + if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) +if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, +
RE: [PATCH]AArch64: update vget_set_lane_1.c test output
> -Original Message- > From: Richard Sandiford > Sent: Thursday, February 1, 2024 2:24 PM > To: Andrew Pinski > Cc: Tamar Christina ; gcc-patches@gcc.gnu.org; nd > ; Richard Earnshaw ; Marcus > Shawcroft ; Kyrylo Tkachov > > Subject: Re: [PATCH]AArch64: update vget_set_lane_1.c test output > > Andrew Pinski writes: > > On Thu, Feb 1, 2024 at 1:26 AM Tamar Christina > wrote: > >> > >> Hi All, > >> > >> In the vget_set_lane_1.c test the following entries now generate a zip1 > >> instead > of an INS > >> > >> BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) > >> BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) > >> BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) > >> > >> This is because the non-Q variant for indices 0 and 1 are just shuffling > >> values. > >> There is no perf difference between INS SIMD to SIMD and ZIP, as such just > update the > >> test file. > > Hmm, is this true on all cores? I suspect there is a core out there > > where INS is implemented with a much lower latency than ZIP. > > If we look at config/aarch64/thunderx.md, we can see INS is 2 cycles > > while ZIP is 6 cycles (3/7 for q versions). > > Now I don't have any invested interest in that core any more but I > > just wanted to point out that is not exactly true for all cores. > > Thanks for the pointer. In that case, perhaps we should prefer > aarch64_evpc_ins over aarch64_evpc_zip in aarch64_expand_vec_perm_const_1? > That's enough to fix this failure, but it'll probably require other > tests to be adjusted... I think given that ThunderX is a 10 year old micro-architecture that has several cases where often used instructions have very high latencies that generic codegen should not be blocked from progressing because of it. We use zips in many things and if thunderx codegen is really of that much importance then I think the old codegen should be gated behind -mcpu=thunderx rather than preventing generic changes. Regards, Tamar. > > Richard
[PATCH]AArch64: update vget_set_lane_1.c test output
Hi All, In the vget_set_lane_1.c test the following entries now generate a zip1 instead of an INS BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) This is because the non-Q variant for indices 0 and 1 are just shuffling values. There is no perf difference between INS SIMD to SIMD and ZIP, as such just update the test file. Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: * gcc.target/aarch64/vget_set_lane_1.c: Update test output. --- inline copy of patch -- diff --git a/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c index 07a77de319206c5c6dad1c0d2d9bcc998583f9c1..a3978f68e4ff5899f395a98615a5e86c3b1389cb 100644 --- a/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c +++ b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c @@ -22,7 +22,7 @@ BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2) BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) -/* { dg-final { scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "zip1\\tv0.2s, v0.2s, v1.2s" 3 } } */ BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15) BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15) -- diff --git a/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c index 07a77de319206c5c6dad1c0d2d9bcc998583f9c1..a3978f68e4ff5899f395a98615a5e86c3b1389cb 100644 --- a/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c +++ b/gcc/testsuite/gcc.target/aarch64/vget_set_lane_1.c @@ -22,7 +22,7 @@ BUILD_TEST (uint16x4_t, uint16x4_t, , , u16, 3, 2) BUILD_TEST (float32x2_t, float32x2_t, , , f32, 1, 0) BUILD_TEST (int32x2_t, int32x2_t, , , s32, 1, 0) BUILD_TEST (uint32x2_t, uint32x2_t, , , u32, 1, 0) -/* { dg-final { 
scan-assembler-times "ins\\tv0.s\\\[1\\\], v1.s\\\[0\\\]" 3 } } */ +/* { dg-final { scan-assembler-times "zip1\\tv0.2s, v0.2s, v1.2s" 3 } } */ BUILD_TEST (poly8x8_t, poly8x16_t, , q, p8, 7, 15) BUILD_TEST (int8x8_t, int8x16_t, , q, s8, 7, 15)
[PATCH 2/2][libsanitizer] hwasan: Remove testsuite check for a complaint message [PR112644]
Hi All, With recent updates to hwasan runtime libraries, the error reporting for this particular check has been reworked. I would question why it has lost this message. To me it looks strange that num_descriptions_printed is incremented whenever we call PrintHeapOrGlobalCandidate whether that function prints anything or not. (See PrintAddressDescription in libsanitizer/hwasan/hwasan_report.cpp). The message is no longer printed because we increment this num_descriptions_printed variable indicating that we have found some description. I would like to question this upstream, but it doesn't look that much of a problem and if pressed for time we should just change our testsuite. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/testsuite/ChangeLog: PR sanitizer/112644 * c-c++-common/hwasan/hwasan-thread-clears-stack.c: Update testcase. --- inline copy of patch -- diff --git a/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c b/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c index 09c72a56f0f50a8c301d89217aa8c7df70087e6c..6c70684d72a887c49b02ecb17ca097da81a9168f 100644 --- a/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c +++ b/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c @@ -52,5 +52,4 @@ main (int argc, char **argv) /* { dg-output "HWAddressSanitizer: tag-mismatch on address 0x\[0-9a-f\]*.*" } */ /* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: \[\[:xdigit:\]\]\[\[:xdigit:\]\]/00 \\(ptr/mem\\) in thread T0.*" } */ -/* { dg-output "HWAddressSanitizer can not describe address in more detail\..*" } */ /* { dg-output "SUMMARY: HWAddressSanitizer: tag-mismatch \[^\n\]*.*" } */ -- diff --git a/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c b/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c index 09c72a56f0f50a8c301d89217aa8c7df70087e6c..6c70684d72a887c49b02ecb17ca097da81a9168f 100644 ---
a/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c +++ b/gcc/testsuite/c-c++-common/hwasan/hwasan-thread-clears-stack.c @@ -52,5 +52,4 @@ main (int argc, char **argv) /* { dg-output "HWAddressSanitizer: tag-mismatch on address 0x\[0-9a-f\]*.*" } */ /* { dg-output "READ of size 4 at 0x\[0-9a-f\]* tags: \[\[:xdigit:\]\]\[\[:xdigit:\]\]/00 \\(ptr/mem\\) in thread T0.*" } */ -/* { dg-output "HWAddressSanitizer can not describe address in more detail\..*" } */ /* { dg-output "SUMMARY: HWAddressSanitizer: tag-mismatch \[^\n\]*.*" } */
[PATCH 1/2][libsanitizer] hwasan: Remove testsuite check for a complaint message [PR112644]
Hi All, Recent libhwasan updates[1] intercept various string and memory functions. These functions have checking in them, which means there's no need to inline the checking. This patch marks said functions as intercepted, and adjusts a testcase to handle the difference. It also looks for HWASAN in a check in expand_builtin. This check originally is there to avoid using expand to inline the behaviour of builtins like memset which are intercepted by ASAN and hence which we rely on the function call staying as a function call. With the new reliance on function calls in HWASAN we need to do the same thing for HWASAN too. HWASAN and ASAN don't seem to however instrument the same functions. Looking into libsanitizer/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc it looks like the common ones are memset, memmove and memcpy. The rest of the routines for asan seem to be defined in compiler-rt/lib/asan/asan_interceptors.h however compiler-rt/lib/hwasan/ does not have such a file but it does have compiler-rt/lib/hwasan/hwasan_platform_interceptors.h which it looks like is forcing off everything but memset, memmove, memcpy, memcmp and bcmp. As such I've taken those as the final list that hwasan currently supports. This also means that on future updates this list should be cross checked. [1] https://discourse.llvm.org/t/hwasan-question-about-the-recent-interceptors-being-added/75351 Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR sanitizer/112644 * asan.h (asan_intercepted_p): Incercept memset, memmove, memcpy and memcmp. * builtins.cc (expand_builtin): Include HWASAN when checking for builtin inlining. gcc/testsuite/ChangeLog: PR sanitizer/112644 * c-c++-common/hwasan/builtin-special-handling.c: Update testcase. 
Co-Authored-By: Matthew Malcomson --- inline copy of patch -- diff --git a/gcc/asan.h b/gcc/asan.h index 82811bdbe697665652aba89f2ee1c3ac07970df9..d1bf8b1e701b15525c6a900d324f2aebfb778cba 100644 --- a/gcc/asan.h +++ b/gcc/asan.h @@ -185,8 +185,13 @@ extern hash_set *asan_handled_variables; inline bool asan_intercepted_p (enum built_in_function fcode) { + /* This list should be kept up-to-date with upstream's version at + compiler-rt/lib/hwasan/hwasan_platform_interceptors.h. */ if (hwasan_sanitize_p ()) -return false; +return fcode == BUILT_IN_MEMCMP +|| fcode == BUILT_IN_MEMCPY +|| fcode == BUILT_IN_MEMMOVE +|| fcode == BUILT_IN_MEMSET; return fcode == BUILT_IN_INDEX || fcode == BUILT_IN_MEMCHR diff --git a/gcc/builtins.cc b/gcc/builtins.cc index a0bd82c7981c05caf2764de70c62fe83bef9ad29..12cc7a54e99555d0f4b21fa2cc32ffa7bb548f18 100644 --- a/gcc/builtins.cc +++ b/gcc/builtins.cc @@ -7792,7 +7792,8 @@ expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, default: break; } - if (sanitize_flags_p (SANITIZE_ADDRESS) && asan_intercepted_p (fcode)) + if (sanitize_flags_p (SANITIZE_ADDRESS | SANITIZE_HWADDRESS) + && asan_intercepted_p (fcode)) return expand_call (exp, target, ignore); /* When not optimizing, generate calls to library functions for a certain diff --git a/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c b/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c index a7a6d91693ae48c20f33ab28f28d27b01af4722c..f975b1cc397bc0d6fd475dbfed5ccc8ac386 100644 --- a/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c +++ b/gcc/testsuite/c-c++-common/hwasan/builtin-special-handling.c @@ -8,24 +8,24 @@ /* { dg-skip-if "" { *-*-* } { "-flto" } { "-flto-partition=none" } } */ typedef __SIZE_TYPE__ size_t; -/* Functions to observe that HWASAN instruments memory builtins in the expected - manner. */ +/* HWASAN used to instrument calls to memset, memcpy, and memmove. It no + longer does this. 
Many other string and memory builtins are intercepted by + the runtime (and hence the codegen need not do anything). */ void * __attribute__((noinline)) memset_builtin (void *dest, int value, size_t len) { return __builtin_memset (dest, value, len); } -/* HWASAN avoids strlen because it doesn't know the size of the memory access - until *after* the function call. */ size_t __attribute__ ((noinline)) strlen_builtin (char *element) { return __builtin_strlen (element); } -/* First test ensures that the HWASAN_CHECK was emitted before the - memset. Second test ensures there was only HWASAN_CHECK (which demonstrates - that strlen was not instrumented). */ -/* { dg-final { scan-tree-dump-times "HWASAN_CHECK.*memset" 1 "asan1" } } */ -/* { dg-final { scan-tree-dump-times "HWASAN_CHECK" 1 "asan1" } } */ +/* First check here ensures there is no inline instrumentation generated for + these builtins. Second checks that we end up calling memset (i.e. that it's + not optimised
RE: [PATCH][libsanitizer]: Sync fixes for asan interceptors from upstream [PR112644]
> -Original Message- > From: Andrew Pinski > Sent: Monday, January 29, 2024 9:55 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; ja...@redhat.com; > do...@redhat.com; k...@google.com; dvyu...@google.com > Subject: Re: [PATCH][libsanitizer]: Sync fixes for asan interceptors from > upstream > [PR112644] > > On Mon, Jan 29, 2024 at 7:04 AM Tamar Christina > wrote: > > > > Hi All, > > > > This cherry-picks and squashes the differences between commits > > > > > d3e5c20ab846303874a2a25e5877c72271fc798b..76e1e45922e6709392fb82aa > c44bebe3dbc2ea63 > > from LLVM upstream from compiler-rt/lib/hwasan/ to GCC on the changes > relevant > > for GCC. > > > > This is required to fix the linked PR. > > > > As mentioned in the PR the last sync brought in a bug from upstream[1] where > > operations became non-recoverable and as such the tests in AArch64 started > > failing. This cherry picks the fix and there are minor updates needed to > > GCC > > after this to fix the cases. > > > > [1] https://github.com/llvm/llvm-project/pull/74000 > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > Thanks for handling this; though I wonder how this slipped through > testing upstream in LLVM. I see they added some new testcases for > this. I Know GCC's testsuite for sanitizer is slightly different from > LLVM's. Is it the case, GCC has more tests in this area? Is someone > adding the testcases that GCC has in this area upstream to LLVM; > basically so merging won't bring in regressions like this in the > future? There were two parts here. The first one is that their testsuite didn't have any test for the recovery case. Which they've now added. But the second parts (which I'm not posting patches for) is that the change In hwasan means that the runtime can now instrument some additional library methods which it couldn't before. And GCC now needs to not inline these anymore. 
This does mean that on future updates one needs to take a look at the Instrumentation list and make sure to keep it in sync with GCC's otherwise we'll lose instrumentation. Regards, Tamar > > Thanks, > Andrew > > > > > Thanks, > > Tamar > > > > libsanitizer/ChangeLog: > > > > PR sanitizer/112644 > > * hwasan/hwasan_interceptors.cpp (ACCESS_MEMORY_RANGE, > > HWASAN_READ_RANGE, HWASAN_WRITE_RANGE, > COMMON_SYSCALL_PRE_READ_RANGE, > > COMMON_SYSCALL_PRE_WRITE_RANGE, > COMMON_INTERCEPTOR_WRITE_RANGE, > > COMMON_INTERCEPTOR_READ_RANGE): Make recoverable. > > > > --- inline copy of patch -- > > diff --git a/libsanitizer/hwasan/hwasan_interceptors.cpp > b/libsanitizer/hwasan/hwasan_interceptors.cpp > > index > d9237cf9b8e3bf982cf213123ef22e73ec027c9e..96df4dd0c24d7d3db28fa2557 > cf63da0f295e33f 100644 > > --- a/libsanitizer/hwasan/hwasan_interceptors.cpp > > +++ b/libsanitizer/hwasan/hwasan_interceptors.cpp > > @@ -36,16 +36,16 @@ struct HWAsanInterceptorContext { > >const char *interceptor_name; > > }; > > > > -# define ACCESS_MEMORY_RANGE(ctx, offset, size, access) > > \ > > -do { > > \ > > - __hwasan::CheckAddressSized > access>((uptr)offset, \ > > - size); > > \ > > +# define ACCESS_MEMORY_RANGE(offset, size, access) > >\ > > +do { > >\ > > + __hwasan::CheckAddressSized > access>((uptr)offset, \ > > +size); > >\ > > } while (0) > > > > -# define HWASAN_READ_RANGE(ctx, offset, size) \ > > -ACCESS_MEMORY_RANGE(ctx, offset, size, AccessType::Load) > > -# define HWASAN_WRITE_RANGE(ctx, offset, size) \ > > -ACCESS_MEMORY_RANGE(ctx, offset, size, AccessType::Store) > > +# define HWASAN_READ_RANGE(offset, size) \ > > +ACCESS_MEMORY_RANGE(offset, size, AccessType::Load) > > +# define HWASAN_WRITE_RANGE(offset, size) \ > > +ACCESS_MEMORY_RANGE(offset, size, AccessType::Store) > > > > # if !SANITIZER_APPLE > > #define HWASAN_INTERCEPT_FUNC(name) > > \ > > @@ -74,9 +74,8 @@ struct HWAsanInterceptorContext { > > > > # if HWASAN_WITH_INTERCEPTORS > > > > -#define COMMON_SYSC
RE: [PATCH]middle-end: check memory accesses in the destination block [PR113588].
> -Original Message- > From: Richard Biener > Sent: Tuesday, January 30, 2024 9:51 AM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; j...@ventanamicro.com > Subject: Re: [PATCH]middle-end: check memory accesses in the destination block > [PR113588]. > > On Mon, 29 Jan 2024, Tamar Christina wrote: > > > Hi All, > > > > When analyzing loads for early break it was always the intention that > > for the exit where things get moved to we only check the loads that can > > be reached from the condition. > > Looking at the code I'm a bit confused that we always move to > single_pred (loop->latch) - IIRC that was different at some point? > > Shouldn't we move stores after the last early exit condition instead? Yes it was changed during another PR fix. The rationale at that time didn't take into account the peeled case. It used to be that we would "search" for the the exit to place it in. At that time the rational was, well it doesn't make sense. It has to go in the block that is the last to be executed. With the non-peeled case it's always the one before the latch. Or put differently, I think the destination should be the main IV block. I am not quite sure I'm following why you want to put the peeled cases inside the latch block. Ah, is it because the latch block is always going to only be executed when you make a full iteration? That makes sense, but then I think we should also analyze the stores in all blocks (which your change maybe already does, let me check) since we'll also lifting past the final block we need to update the vuses there too. If the above is correct then I think I understand what you're saying and will update the patch and do some Checks. Thanks, Tamar > > In particular for the peeled case single_pred (loop->latch) is the > block with the actual early exit condition? So for that case we'd > need to move to the latch itself instead? For non-peeled we move > to the block with the IV condition which looks OK. 
> > > However the main loop checks all loads and we skip the destination BB. > > As such we never actually check the loads reachable from the COND in the > > last BB unless this BB was also the exit chosen by the vectorizer. > > > > This leads us to incorrectly vectorize the loop in the PR and in doing so > > access > > out of bounds. > > > > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > > > Ok for master? > > The patch ends up with a worklist and another confusing comment > > + /* For the destination BB we need to only analyze loads reachable from > the early > + break statement itself. */ > > But I think it's a downstream issue from the issue above. That said, > even for the non-peeled case we need to check ref_within_array_bound, > no? > > So what about re-doing that initial loop like the following instead > (and also fix dest_bb, but I'd like clarification here). Basically > walk all blocks, do the ref_within_array_bound first and only > after we've seen 'dest_bb' do the checks required for moving > stores for all upstream BBs. > > And dest_bb should be > > /* Move side-effects to the in-loop destination of the last early > exit. */ > if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) > dest_bb = loop->latch; > else > dest_bb = single_pred (loop->latch); > > > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc > index f592aeb8028..d6c8910dd6c 100644 > --- a/gcc/tree-vect-data-refs.cc > +++ b/gcc/tree-vect-data-refs.cc > @@ -668,7 +668,6 @@ vect_analyze_early_break_dependences (loop_vec_info > loop_vinfo) >auto_vec bases; >basic_block dest_bb = NULL; > > - hash_set visited; >class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); >class loop *loop_nest = loop_outer (loop); > > @@ -681,15 +680,11 @@ vect_analyze_early_break_dependences > (loop_vec_info loop_vinfo) > side-effects to is always the latch connected exit. When we support > general control flow we can do better but for now this is fine. 
*/ >dest_bb = single_pred (loop->latch); > - basic_block bb = dest_bb; > + basic_block bb = loop->latch; > + bool check_deps = false; > >do > { > - /* If the destination block is also the header then we have nothing to > do. */ > - if (!single_pred_p (bb)) > - continue; > - > - bb = single_pred (bb); >gimple_stmt_iterator gsi = gsi_last_bb (bb); > >/* Now analyze all the remaining statements and try to determine which > @@ -707,6 +702,25 @@ vect_analyze_early_break_dependences (loop_vec_info > loop_vi
[PATCH]middle-end: check memory accesses in the destination block [PR113588].
Hi All, When analyzing loads for early break it was always the intention that for the exit where things get moved to we only check the loads that can be reached from the condition. However the main loop checks all loads and we skip the destination BB. As such we never actually check the loads reachable from the COND in the last BB unless this BB was also the exit chosen by the vectorizer. This leads us to incorrectly vectorize the loop in the PR and in doing so access out of bounds. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/113588 * tree-vect-data-refs.cc (vect_analyze_early_break_dependences_1): New. (vect_analyze_data_ref_dependence): Use it. (vect_analyze_early_break_dependences): Update comments. gcc/testsuite/ChangeLog: PR tree-optimization/113588 * gcc.dg/vect/vect-early-break_108-pr113588.c: New test. * gcc.dg/vect/vect-early-break_109-pr113588.c: New test. --- inline copy of patch -- diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c new file mode 100644 index ..e488619c9aac41fafbcf479818392a6bb7c6924f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_108-pr113588.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-add-options vect_early_break } */ +/* { dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +int foo (const char *s, unsigned long n) +{ + unsigned long len = 0; + while (*s++ && n--) + ++len; + return len; +} + diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c new file mode 100644 index ..488c19d3ede809631d1a7ede0e7f7bcdc7a1ae43 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_109-pr113588.c @@ -0,0 +1,44 @@ +/* { dg-add-options vect_early_break } */ +/* { 
dg-require-effective-target vect_early_break } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target mmap } */ + +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ + +#include +#include + +#include "tree-vect.h" + +__attribute__((noipa)) +int foo (const char *s, unsigned long n) +{ + unsigned long len = 0; + while (*s++ && n--) + ++len; + return len; +} + +int main() +{ + + check_vect (); + + long pgsz = sysconf (_SC_PAGESIZE); + void *p = mmap (NULL, pgsz * 3, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, 0, 0); + if (p == MAP_FAILED) +return 0; + mprotect (p, pgsz, PROT_NONE); + mprotect (p+2*pgsz, pgsz, PROT_NONE); + char *p1 = p + pgsz; + p1[0] = 1; + p1[1] = 0; + foo (p1, 1000); + p1 = p + 2*pgsz - 2; + p1[0] = 1; + p1[1] = 0; + foo (p1, 1000); + return 0; +} + diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc index f592aeb8028afd4fd70e2175104efab2a2c0d82e..52cef242a7ce5d0e525bff639fa1dc2f0a6f30b9 100644 --- a/gcc/tree-vect-data-refs.cc +++ b/gcc/tree-vect-data-refs.cc @@ -619,10 +619,69 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, return opt_result::success (); } -/* Funcion vect_analyze_early_break_dependences. +/* Function vect_analyze_early_break_dependences_1 - Examime all the data references in the loop and make sure that if we have - mulitple exits that we are able to safely move stores such that they become + Helper function of vect_analyze_early_break_dependences which performs safety + analysis for load operations in an early break. */ + +static opt_result +vect_analyze_early_break_dependences_1 (data_reference *dr_ref, gimple *stmt) +{ + /* We currently only support statically allocated objects due to + not having first-faulting loads support or peeling for + alignment support. Compute the size of the referenced object + (it could be dynamically allocated). 
*/ + tree obj = DR_BASE_ADDRESS (dr_ref); + if (!obj || TREE_CODE (obj) != ADDR_EXPR) +{ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +"early breaks only supported on statically" +" allocated objects.\n"); + return opt_result::failure_at (stmt, +"can't safely apply code motion to " +"dependencies of %G to vectorize " +"the early exit.\n", stmt); +} + + tree refop = TREE_OPERAND (obj, 0); + tree refbase = get_base_address (refop); + if (!refbase || !DECL_P (refbase) || !DECL_SIZE (refbase) + || TREE_CODE (DECL_SIZE (refbase)) != INTEGER_CST) +{ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, +"early