Re: [GCC 13 PATCH] aarch64: Remove architecture dependencies from intrinsics
On Thu, Jul 20, 2023 at 09:37:14AM +0200, Richard Biener wrote: > On Thu, Jul 20, 2023 at 8:49 AM Richard Sandiford via Gcc-patches > wrote: > > > > Andrew Carlotti writes: > > > Updated patch to fix the fp16 intrinsic pragmas, and pushed to master. > > > OK to backport to GCC 13? > > > > OK, thanks. > > In case you want it in 13.2 please push it really soon, we want to do 13.2 RC1 > today. > > Richard. Pushed, thanks. > > > Richard > > > > > Many intrinsics currently depend on both an architecture version and a > > > feature, despite the corresponding instructions being available within > > > GCC at lower architecture versions. > > > > > > LLVM has already removed these explicit architecture version > > > dependences; this patch does the same for GCC. Note that +fp16 does not > > > imply +simd, so we need to add an explicit +simd for the Neon fp16 > > > intrinsics. > > > > > > Binutils did not previously support all of these architecture+feature > > > combinations, but this problem is already reachable from GCC. For > > > example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c > > > with -O3 -march=armv8-a+dotprod has resulted in an assembler error since > > > GCC 10. This is fixed in Binutils 2.41. > > > > > > This patch retains explicit architecture version dependencies for > > > features that do not currently have a separate feature flag. > > > > > > gcc/ChangeLog: > > > > > > * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5 > > > dependency. > > > * config/aarch64/arm_acle.h: Remove unnecessary armv8.x > > > dependencies from target pragmas. > > > * config/aarch64/arm_fp16.h (target): Likewise. > > > * config/aarch64/arm_neon.h (target): Likewise. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/aarch64/feature-bf16-backport.c: New test. > > > * gcc.target/aarch64/feature-dotprod-backport.c: New test. > > > * gcc.target/aarch64/feature-fp16-backport.c: New test. 
> > > * gcc.target/aarch64/feature-fp16-scalar-backport.c: New test. > > > * gcc.target/aarch64/feature-fp16fml-backport.c: New test. > > > * gcc.target/aarch64/feature-i8mm-backport.c: New test. > > > * gcc.target/aarch64/feature-memtag-backport.c: New test. > > > * gcc.target/aarch64/feature-sha3-backport.c: New test. > > > * gcc.target/aarch64/feature-sm4-backport.c: New test. > > > > > > --- > > > > > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h > > > index > > > a01f1ee99d85917941ffba55bc3b4dcac87b41f6..2b0fc97bb71e9d560ae26035c7d7142682e46c38 > > > 100644 > > > --- a/gcc/config/aarch64/aarch64.h > > > +++ b/gcc/config/aarch64/aarch64.h > > > @@ -292,7 +292,7 @@ enum class aarch64_feature : unsigned char { > > > #define TARGET_RNG (AARCH64_ISA_RNG) > > > > > > /* Memory Tagging instructions optional to Armv8.5 enabled through > > > +memtag. */ > > > -#define TARGET_MEMTAG (AARCH64_ISA_V8_5A && AARCH64_ISA_MEMTAG) > > > +#define TARGET_MEMTAG (AARCH64_ISA_MEMTAG) > > > > > > /* I8MM instructions are enabled through +i8mm. 
*/ > > > #define TARGET_I8MM (AARCH64_ISA_I8MM) > > > diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h > > > index > > > 3b6b63e6805432b5f1686745f987c52d2967c7c1..7599a32301dadf80760d3cb40a8685d2e6a476fb > > > 100644 > > > --- a/gcc/config/aarch64/arm_acle.h > > > +++ b/gcc/config/aarch64/arm_acle.h > > > @@ -292,7 +292,7 @@ __rndrrs (uint64_t *__res) > > > #pragma GCC pop_options > > > > > > #pragma GCC push_options > > > -#pragma GCC target ("arch=armv8.5-a+memtag") > > > +#pragma GCC target ("+nothing+memtag") > > > > > > #define __arm_mte_create_random_tag(__ptr, __u64_mask) \ > > >__builtin_aarch64_memtag_irg(__ptr, __u64_mask) > > > diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h > > > index > > > 350f8cc33d99e16137e9d70fa7958b10924dc67f..c10f9dcf7e097ded1740955addcd73348649dc56 > > > 100644 > > > --- a/gcc/config/aarch64/arm_fp16.h > > > +++ b/gcc/config/aarch64/arm_fp16.h > > > @@ -30,7 +30,7 @@ > > > #include > > > > > > #pragma GCC push_options > > > -#pragma GCC target ("arch=armv8.2-a+fp16") > > > +#pragma GCC target ("+nothing+fp16") > > > > > > typedef __fp16 float16_t; > > > > > > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > > > index > > > 0ace1eeddb97443433c091d2363403fcf2907654..349f3167699447eb397af482eaeadf8a07617025 > > > 100644 > > > --- a/gcc/config/aarch64/arm_neon.h > > > +++ b/gcc/config/aarch64/arm_neon.h > > > @@ -25590,7 +25590,7 @@ __INTERLEAVE_LIST (zip) > > > #include "arm_fp16.h" > > > > > > #pragma GCC push_options > > > -#pragma GCC target ("arch=armv8.2-a+fp16") > > > +#pragma GCC target ("+nothing+simd+fp16") > > > > > > /* ARMv8.2-A FP16 one operand vector intrinsics. */ > > > > > > @@ -26753,7 +26753,7 @@ vminnmvq_f16 (float16x8_t __a) > > > /* AdvSIMD Dot Product intrinsics. */ > > > > > > #pragma GCC push_options > > > -#pragma GCC target ("arch=armv8.2-a+dotprod") > > >
Re: [GCC 13 PATCH] aarch64: Remove architecture dependencies from intrinsics
On Wed, Jul 19, 2023 at 07:35:26PM +0100, Ramana Radhakrishnan wrote: > On Wed, Jul 19, 2023 at 5:44 PM Andrew Carlotti via Gcc-patches > wrote: > > > > Updated patch to fix the fp16 intrinsic pragmas, and pushed to master. > > OK to backport to GCC 13? > > > > > > Many intrinsics currently depend on both an architecture version and a > > feature, despite the corresponding instructions being available within > > GCC at lower architecture versions. > > > > LLVM has already removed these explicit architecture version > > dependences; this patch does the same for GCC. Note that +fp16 does not > > imply +simd, so we need to add an explicit +simd for the Neon fp16 > > intrinsics. > > > > Binutils did not previously support all of these architecture+feature > > combinations, but this problem is already reachable from GCC. For > > example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c > > with -O3 -march=armv8-a+dotprod has resulted in an assembler error since > > GCC 10. This is fixed in Binutils 2.41. > > Are there any implementations that actually implement v8-a + dotprod > ?. As far as I'm aware this was v8.2-A as the base architecture where > this was allowed. Has this changed recently? > > > regards > Ramana I don't recall whether there are any physical implementations of DotProd without Armv8.2, but similar situations have already occurred with other features. There are also situations where developers wish to enable only a subset of available features. For example, the existing restrictions in GCC have forced Chromium to disable their memtag support when building with GCC [1]; with this patch, they will be able to reenable memtag support from GCC 14 (and GCC 13.x when this is backported). I don't see any advantages to trying to enforce minimum architecture versions for features in GCC, except perhaps maintaining the status quo. 
But the status quo is already rather inconsistent, and these changes only make GCC more permissive (and only for options that currently don't work). [1] https://chromium-review.googlesource.com/c/chromium/src/+/3238466
[GCC 13 PATCH] aarch64: Remove architecture dependencies from intrinsics
Updated patch to fix the fp16 intrinsic pragmas, and pushed to master. OK to backport to GCC 13? Many intrinsics currently depend on both an architecture version and a feature, despite the corresponding instructions being available within GCC at lower architecture versions. LLVM has already removed these explicit architecture version dependences; this patch does the same for GCC. Note that +fp16 does not imply +simd, so we need to add an explicit +simd for the Neon fp16 intrinsics. Binutils did not previously support all of these architecture+feature combinations, but this problem is already reachable from GCC. For example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c with -O3 -march=armv8-a+dotprod has resulted in an assembler error since GCC 10. This is fixed in Binutils 2.41. This patch retains explicit architecture version dependencies for features that do not currently have a separate feature flag. gcc/ChangeLog: * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5 dependency. * config/aarch64/arm_acle.h: Remove unnecessary armv8.x dependencies from target pragmas. * config/aarch64/arm_fp16.h (target): Likewise. * config/aarch64/arm_neon.h (target): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/feature-bf16-backport.c: New test. * gcc.target/aarch64/feature-dotprod-backport.c: New test. * gcc.target/aarch64/feature-fp16-backport.c: New test. * gcc.target/aarch64/feature-fp16-scalar-backport.c: New test. * gcc.target/aarch64/feature-fp16fml-backport.c: New test. * gcc.target/aarch64/feature-i8mm-backport.c: New test. * gcc.target/aarch64/feature-memtag-backport.c: New test. * gcc.target/aarch64/feature-sha3-backport.c: New test. * gcc.target/aarch64/feature-sm4-backport.c: New test. 
--- diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index a01f1ee99d85917941ffba55bc3b4dcac87b41f6..2b0fc97bb71e9d560ae26035c7d7142682e46c38 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -292,7 +292,7 @@ enum class aarch64_feature : unsigned char { #define TARGET_RNG (AARCH64_ISA_RNG) /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag. */ -#define TARGET_MEMTAG (AARCH64_ISA_V8_5A && AARCH64_ISA_MEMTAG) +#define TARGET_MEMTAG (AARCH64_ISA_MEMTAG) /* I8MM instructions are enabled through +i8mm. */ #define TARGET_I8MM (AARCH64_ISA_I8MM) diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index 3b6b63e6805432b5f1686745f987c52d2967c7c1..7599a32301dadf80760d3cb40a8685d2e6a476fb 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -292,7 +292,7 @@ __rndrrs (uint64_t *__res) #pragma GCC pop_options #pragma GCC push_options -#pragma GCC target ("arch=armv8.5-a+memtag") +#pragma GCC target ("+nothing+memtag") #define __arm_mte_create_random_tag(__ptr, __u64_mask) \ __builtin_aarch64_memtag_irg(__ptr, __u64_mask) diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h index 350f8cc33d99e16137e9d70fa7958b10924dc67f..c10f9dcf7e097ded1740955addcd73348649dc56 100644 --- a/gcc/config/aarch64/arm_fp16.h +++ b/gcc/config/aarch64/arm_fp16.h @@ -30,7 +30,7 @@ #include #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+fp16") +#pragma GCC target ("+nothing+fp16") typedef __fp16 float16_t; diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 0ace1eeddb97443433c091d2363403fcf2907654..349f3167699447eb397af482eaeadf8a07617025 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -25590,7 +25590,7 @@ __INTERLEAVE_LIST (zip) #include "arm_fp16.h" #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+fp16") +#pragma GCC target ("+nothing+simd+fp16") /* ARMv8.2-A FP16 
one operand vector intrinsics. */ @@ -26753,7 +26753,7 @@ vminnmvq_f16 (float16x8_t __a) /* AdvSIMD Dot Product intrinsics. */ #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+dotprod") +#pragma GCC target ("+nothing+dotprod") __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -26844,7 +26844,7 @@ vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) #pragma GCC pop_options #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+sm4") +#pragma GCC target ("+nothing+sm4") __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -26911,7 +26911,7 @@ vsm4ekeyq_u32 (uint32x4_t __a, uint32x4_t __b) #pragma GCC pop_options #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+sha3") +#pragma GCC target ("+nothing+sha3") __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -27547,7 +27547,7 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, #pragma GCC pop_options #pragma GCC
Re: [PATCH] aarch64: Remove architecture dependencies from intrinsics
On Tue, Jun 27, 2023 at 07:23:32AM +0100, Richard Sandiford wrote: > Andrew Carlotti via Gcc-patches writes: > > Many intrinsics currently depend on both an architecture version and a > > feature, despite the corresponding instructions being available within > > GCC at lower architecture versions. > > > > LLVM has already removed these explicit architecture version > > dependences; this patch does the same for GCC, as well as removing an > > unecessary simd dependency for the scalar fp16 intrinsics. > > > > Binutils does not support all of these architecture+feature combinations > > yet, but this is an existing problem that is already reachable from GCC. > > For example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c > > with -O3 -march=armv8-a+dotprod has resulted in an assembler error since > > GCC 10. I intend to patch this in binutils. > > > > This patch retains explicit architecture version dependencies for > > features that do not currently have a separate feature flag. > > > > Ok for master, and backport to GCC 13? > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5 > > dependency. > > * config/aarch64/arm_acle.h: Remove unnecessary armv8.x > > dependencies from target pragmas. > > * config/aarch64/arm_fp16.h (target): Likewise. > > The change to this file is a bit different from the others, > since it's removing an implicit dependency on +simd, rather > than a dependency on an architecture level. I think it'd be > worth mentioning that explicitly in the changelog. > > OK with that change, thanks. > > (Arguably we should add +nosimd to many of the other pragmas in > arm_acle.h, but that's logically a separate patch.) > > Richard Actually, I think I should just remove the +nosimd from the patch, because +fp16 doesn't enable simd (unlike +bf16, which has simd as an 'explicit on' implication). Aside from +bf16, the only other feature with simd as an 'explicit on' is +rdma. 
However, there appear to be no non-simd rdma instructions, so +nothing+rdma+nosimd is effectively the same as +nothing. > > ... > > > > diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h > > index > > a8fa4dbbdfe1bab4aa604bb311ef66d4e1de18ac..84b2ed66f9ba19fba6ccd8be33940d7239bfa22e > > 100644 > > --- a/gcc/config/aarch64/arm_fp16.h > > +++ b/gcc/config/aarch64/arm_fp16.h > > @@ -30,7 +30,7 @@ > > #include > > > > #pragma GCC push_options > > -#pragma GCC target ("arch=armv8.2-a+fp16") > > +#pragma GCC target ("+nothing+fp16+nosimd") > > > > typedef __fp16 float16_t; > >
[committed] docs: Fix typo
gcc/ChangeLog: * doc/optinfo.texi: Fix "steam" -> "stream". diff --git a/gcc/doc/optinfo.texi b/gcc/doc/optinfo.texi index b91bba7bd10470b17ca5190688beee06ad3b87ab..5e8c97ef118786e68b7e46f3c802154cb9b57b83 100644 --- a/gcc/doc/optinfo.texi +++ b/gcc/doc/optinfo.texi @@ -100,7 +100,7 @@ that one could also use special file names @code{stdout} and respectively. @item @code{alt_stream} -This steam is used for printing optimization specific output in +This stream is used for printing optimization specific output in response to the @option{-fopt-info}. Again a file name can be given. If the file name is not given, it defaults to @code{stderr}. @end table
[PATCH] aarch64: Remove architecture dependencies from intrinsics
Many intrinsics currently depend on both an architecture version and a feature, despite the corresponding instructions being available within GCC at lower architecture versions. LLVM has already removed these explicit architecture version dependences; this patch does the same for GCC, as well as removing an unnecessary simd dependency for the scalar fp16 intrinsics. Binutils does not support all of these architecture+feature combinations yet, but this is an existing problem that is already reachable from GCC. For example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c with -O3 -march=armv8-a+dotprod has resulted in an assembler error since GCC 10. I intend to patch this in binutils. This patch retains explicit architecture version dependencies for features that do not currently have a separate feature flag. Ok for master, and backport to GCC 13? gcc/ChangeLog: * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5 dependency. * config/aarch64/arm_acle.h: Remove unnecessary armv8.x dependencies from target pragmas. * config/aarch64/arm_fp16.h (target): Likewise. * config/aarch64/arm_neon.h (target): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/feature-bf16-backport.c: New test. * gcc.target/aarch64/feature-dotprod-backport.c: New test. * gcc.target/aarch64/feature-fp16-backport.c: New test. * gcc.target/aarch64/feature-fp16-scalar-backport.c: New test. * gcc.target/aarch64/feature-fp16fml-backport.c: New test. * gcc.target/aarch64/feature-i8mm-backport.c: New test. * gcc.target/aarch64/feature-memtag-backport.c: New test. * gcc.target/aarch64/feature-sha3-backport.c: New test. * gcc.target/aarch64/feature-sm4-backport.c: New test. 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h index 7129ed1ff370d597895b3f46b56b1250da7fa190..cdb664eb8f7db820b6b06b2667bfad6dc14cb7a2 100644 --- a/gcc/config/aarch64/aarch64.h +++ b/gcc/config/aarch64/aarch64.h @@ -292,7 +292,7 @@ enum class aarch64_feature : unsigned char { #define TARGET_RNG (AARCH64_ISA_RNG) /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag. */ -#define TARGET_MEMTAG (AARCH64_ISA_V8_5A && AARCH64_ISA_MEMTAG) +#define TARGET_MEMTAG (AARCH64_ISA_MEMTAG) /* I8MM instructions are enabled through +i8mm. */ #define TARGET_I8MM (AARCH64_ISA_I8MM) diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h index e0ac591d2c8d6c4c4c8a074b2d9881c47b1db1ab..87fb42f47c5821adecbb0ea441e0a38c63972e77 100644 --- a/gcc/config/aarch64/arm_acle.h +++ b/gcc/config/aarch64/arm_acle.h @@ -325,7 +325,7 @@ __rndrrs (uint64_t *__res) #pragma GCC pop_options #pragma GCC push_options -#pragma GCC target ("arch=armv8.5-a+memtag") +#pragma GCC target ("+nothing+memtag") #define __arm_mte_create_random_tag(__ptr, __u64_mask) \ __builtin_aarch64_memtag_irg(__ptr, __u64_mask) diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h index a8fa4dbbdfe1bab4aa604bb311ef66d4e1de18ac..84b2ed66f9ba19fba6ccd8be33940d7239bfa22e 100644 --- a/gcc/config/aarch64/arm_fp16.h +++ b/gcc/config/aarch64/arm_fp16.h @@ -30,7 +30,7 @@ #include #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+fp16") +#pragma GCC target ("+nothing+fp16+nosimd") typedef __fp16 float16_t; diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index eeec9f162e223df8cf7803b3227aef22e94227ac..a078674376af121c36bbebef76631c25a6815b1b 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -25590,7 +25590,7 @@ __INTERLEAVE_LIST (zip) #include "arm_fp16.h" #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+fp16") +#pragma GCC target ("+nothing+fp16") /* ARMv8.2-A FP16 
one operand vector intrinsics. */ @@ -26753,7 +26753,7 @@ vminnmvq_f16 (float16x8_t __a) /* AdvSIMD Dot Product intrinsics. */ #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+dotprod") +#pragma GCC target ("+nothing+dotprod") __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -26844,7 +26844,7 @@ vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b, const int __index) #pragma GCC pop_options #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+sm4") +#pragma GCC target ("+nothing+sm4") __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -26911,7 +26911,7 @@ vsm4ekeyq_u32 (uint32x4_t __a, uint32x4_t __b) #pragma GCC pop_options #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+sha3") +#pragma GCC target ("+nothing+sha3") __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -27547,7 +27547,7 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t __a, float32x4_t __b, #pragma GCC pop_options #pragma GCC push_options -#pragma GCC target ("arch=armv8.2-a+fp16fml") +#pragma GCC
Re: [PATCH 1/2]middle-end: Fix wrong overmatching of div-bitmask by using new optabs [PR108583]
On Thu, Feb 23, 2023 at 11:39:51AM -0500, Andrew MacLeod via Gcc-patches wrote: > > > Inheriting from operator_mult is also going to be hazardous because it also > has an op1_range and op2_range... you should at least define those and > return VARYING to avoid other issues. Same thing applies to widen_plus I > think, and it has relation processing and other things as well. Your widen > operands are not what those classes expect, so I think you probably just > want a fresh range operator. > > It also looks like the mult operation is sign/zero extending both upper > bounds, and neither lower bound.. I think that should be the LH upper > and > lower bound? > > I've attached a second patch (newversion.patch) which incorporates my fix, > the fix to the sign of only op1's bounds, as well as a simplification of > the classes to not inherit from operator_mult/plus.. I think this still > does what you want? and it wont get you into unexpected trouble later :-) > > let me know if this is still doing what you are expecting... > > Andrew > Hi, This patch still uses the wrong signedness for some of the extensions in WIDEN_MULT_EXPR. It currently bases its promotion decisions on whether there is any signed argument, and whether the result is signed - i.e.: Patch extends as: UUU UU UUS -> USU USU SU USS SU wrong SUU US wrong SUS -> SSU SSU SS wrong SSS SS The documentation in tree.def is unclear about whether the output signedness is linked to the input signedness, but at least the SSU case seems valid, and is mishandled here. I think it would be clearer and simpler to have four (or three) different versions for each combination of signedness of the input operands. This could be implemented without extra code duplication by creating four different instances of an operator_widen_mult class (perhaps extending a range_operator_mixed_sign class), with the signedness indicated by two additional class members. 
The documentation for WIDEN_PLUS_EXPR (and several other expressions added in the same commit) is completely missing. If the signs are required to be matching, then this should be clarified; otherwise it would need the same special handling as WIDEN_MULT_EXPR. Andrew > diff --git a/gcc/gimple-range-op.cc b/gcc/gimple-range-op.cc > index d9dfdc56939..824e0338f34 100644 > --- a/gcc/gimple-range-op.cc > +++ b/gcc/gimple-range-op.cc > @@ -179,6 +179,8 @@ gimple_range_op_handler::gimple_range_op_handler (gimple > *s) >// statements. >if (is_a (m_stmt)) > maybe_builtin_call (); > + else > +maybe_non_standard (); > } > > // Calculate what we can determine of the range of this unary > @@ -764,6 +766,36 @@ public: >} > } op_cfn_parity; > > +// Set up a gimple_range_op_handler for any nonstandard function which can be > +// supported via range-ops. > + > +void > +gimple_range_op_handler::maybe_non_standard () > +{ > + if (gimple_code (m_stmt) == GIMPLE_ASSIGN) > +switch (gimple_assign_rhs_code (m_stmt)) > + { > + case WIDEN_MULT_EXPR: > + { > + m_valid = true; > + m_op1 = gimple_assign_rhs1 (m_stmt); > + m_op2 = gimple_assign_rhs2 (m_stmt); > + bool signed1 = TYPE_SIGN (TREE_TYPE (m_op1)) == SIGNED; > + bool signed2 = TYPE_SIGN (TREE_TYPE (m_op2)) == SIGNED; > + if (signed2 && !signed1) > + std::swap (m_op1, m_op2); > + > + if (signed1 || signed2) > + m_int = ptr_op_widen_mult_signed; > + else > + m_int = ptr_op_widen_mult_unsigned; > + break; > + } > + default: > + break; > + } > +} > + > // Set up a gimple_range_op_handler for any built in function which can be > // supported via range-ops. 
> > diff --git a/gcc/gimple-range-op.h b/gcc/gimple-range-op.h > index 743b858126e..1bf63c5ce6f 100644 > --- a/gcc/gimple-range-op.h > +++ b/gcc/gimple-range-op.h > @@ -41,6 +41,7 @@ public: >relation_trio = TRIO_VARYING); > private: >void maybe_builtin_call (); > + void maybe_non_standard (); >gimple *m_stmt; >tree m_op1, m_op2; > }; > diff --git a/gcc/range-op.cc b/gcc/range-op.cc > index 5c67bce6d3a..7cd19a92d00 100644 > --- a/gcc/range-op.cc > +++ b/gcc/range-op.cc > @@ -1556,6 +1556,34 @@ operator_plus::op2_range (irange , tree type, >return op1_range (r, type, lhs, op1, rel.swap_op1_op2 ()); > } > > +class operator_widen_plus : public range_operator > +{ > +public: > + virtual void wi_fold (irange , tree type, > + const wide_int _lb, > + const wide_int _ub, > + const wide_int _lb, > + const wide_int _ub) const; > +} op_widen_plus; > + > +void > +operator_widen_plus::wi_fold (irange , tree type, > + const wide_int _lb, const wide_int _ub, > + const wide_int _lb, const wide_int _ub) const
Re: [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN
Erm, ignore this - I just rediscovered the approval in a different mail folder. I forgot that Outlook's automatic email deduplication meant that messages CC'd to me end up in one of two different folders at random when I want them in both. On Mon, Jan 16, 2023 at 02:03:29PM +, Andrew Carlotti via Gcc-patches wrote: > Hi Richard > > I accidentally pushed this patch earlier in the mistaken belief that > you'd already approved it. It looks uncontroversial to me - it just adds > IFN support to build_popcount_expr, analogous to the changes you > suggested and approved for build_cltz_expr (and adjusts testcases > accordingly). I might have incorporated it into an earlier patch in this > series, if I hadn't already pushed that earlier patch. > > Is this OK to leave in master now? > > Thanks, > Andrew > > On Thu, Dec 22, 2022 at 05:43:21PM +, Andrew Carlotti via Gcc-patches > wrote: > > Bootstrapped and regression tested on aarch64-unknown-linux-gnu and > > x86_64-pc-linux-gnu - ok to merge? > > > > gcc/ChangeLog: > > > > * tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support. > > > > gcc/testsuite/ChangeLog: > > > > * g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex. > > * gcc.dg/tree-ssa/popcount.c: Likewise. > > * gcc.dg/tree-ssa/popcount2.c: Likewise. > > * gcc.dg/tree-ssa/popcount3.c: Likewise. > > * gcc.target/aarch64/popcount4.c: Likewise. > > * gcc.target/i386/pr95771.c: Likewise, and... > > * gcc.target/i386/pr95771-2.c: ...split int128 test from above, > > since this would emit just a single IFN if a TI optab is added. 
> > > > --- > > > > diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > > b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > > index > > ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c > > 100644 > > --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > > +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > > @@ -12,5 +12,5 @@ int PopCount (long b) { > > return c; > > } > > > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } > > } */ > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 > > "optimized" } } */ > > /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */ > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > > b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > > index > > b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6 > > 100644 > > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > > @@ -39,4 +39,4 @@ void PopCount3 (long b1) { > >} > > } > > > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } > > } */ > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 > > "optimized" } } */ > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > > b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > > index > > ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be > > 100644 > > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > > @@ -26,4 +26,4 @@ int main() > >return 0; > > } > > > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } > > } */ > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 > > "optimized" } } */ > > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > > b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > > index > > ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c > > 100644 > > 
--- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > > @@ -12,5 +12,5 @@ int PopCount (long b) { > > return c; > > } > > > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } > > } */ > > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 > > "optimized" } } */ > > /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c > > b/gcc/testsuite/gcc.target/aarch64/popcount4.c > > index > &g
Re: [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN
Hi Richard I accidentally pushed this patch earlier in the mistaken belief that you'd already approved it. It looks uncontroversial to me - it just adds IFN support to build_popcount_expr, analogous to the changes you suggested and approved for build_cltz_expr (and adjusts testcases accordingly). I might have incorporated it into an earlier patch in this series, if I hadn't already pushed that earlier patch. Is this OK to leave in master now? Thanks, Andrew On Thu, Dec 22, 2022 at 05:43:21PM +, Andrew Carlotti via Gcc-patches wrote: > Bootstrapped and regression tested on aarch64-unknown-linux-gnu and > x86_64-pc-linux-gnu - ok to merge? > > gcc/ChangeLog: > > * tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support. > > gcc/testsuite/ChangeLog: > > * g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex. > * gcc.dg/tree-ssa/popcount.c: Likewise. > * gcc.dg/tree-ssa/popcount2.c: Likewise. > * gcc.dg/tree-ssa/popcount3.c: Likewise. > * gcc.target/aarch64/popcount4.c: Likewise. > * gcc.target/i386/pr95771.c: Likewise, and... > * gcc.target/i386/pr95771-2.c: ...split int128 test from above, > since this would emit just a single IFN if a TI optab is added. 
> > --- > > diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > index > ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c > 100644 > --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C > @@ -12,5 +12,5 @@ int PopCount (long b) { > return c; > } > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } > */ > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 > "optimized" } } */ > /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > index > b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6 > 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c > @@ -39,4 +39,4 @@ void PopCount3 (long b1) { >} > } > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } > */ > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 > "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > index > ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be > 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c > @@ -26,4 +26,4 @@ int main() >return 0; > } > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } > */ > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 > "optimized" } } */ > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > index > ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c > 100644 > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c > @@ -12,5 
+12,5 @@ int PopCount (long b) { > return c; > } > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } > */ > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 > "optimized" } } */ > /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c > b/gcc/testsuite/gcc.target/aarch64/popcount4.c > index > ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0 > 100644 > --- a/gcc/testsuite/gcc.target/aarch64/popcount4.c > +++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c > @@ -11,4 +11,4 @@ int PopCount (long b) { > return c; > } > > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } > */ > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 > "optimized" } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c > b/gcc/testsuite/gcc.target/i386/pr95771-2.c > new file mode 100644 > index > ..1db9dc94d0b66477667624012221d6844c141a26 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i
[PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN
Bootstrapped and regression tested on aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu - ok to merge? gcc/ChangeLog: * tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support. gcc/testsuite/ChangeLog: * g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex. * gcc.dg/tree-ssa/popcount.c: Likewise. * gcc.dg/tree-ssa/popcount2.c: Likewise. * gcc.dg/tree-ssa/popcount3.c: Likewise. * gcc.target/aarch64/popcount4.c: Likewise. * gcc.target/i386/pr95771.c: Likewise, and... * gcc.target/i386/pr95771-2.c: ...split int128 test from above, since this would emit just a single IFN if a TI optab is added. --- diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644 --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C @@ -12,5 +12,5 @@ int PopCount (long b) { return c; } -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */ /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c index b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c @@ -39,4 +39,4 @@ void PopCount3 (long b1) { } } -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c index ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c @@ -26,4 +26,4 
@@ int main() return 0; } -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c index ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c @@ -12,5 +12,5 @@ int PopCount (long b) { return c; } -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 "optimized" } } */ /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c b/gcc/testsuite/gcc.target/aarch64/popcount4.c index ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0 100644 --- a/gcc/testsuite/gcc.target/aarch64/popcount4.c +++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c @@ -11,4 +11,4 @@ int PopCount (long b) { return c; } -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c b/gcc/testsuite/gcc.target/i386/pr95771-2.c new file mode 100644 index ..1db9dc94d0b66477667624012221d6844c141a26 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95771-2.c @@ -0,0 +1,17 @@ +/* PR tree-optimization/95771 */ +/* { dg-do compile } */ +/* { dg-require-effective-target int128 } */ +/* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump " = __builtin_popcount| = \\.POPCOUNT" "optimized" } } */ + +int +corge (unsigned __int128 x) +{ + int i = 0; + while (x) +{ + x &= x - 1; + ++i; +} + return i; +} diff --git a/gcc/testsuite/gcc.target/i386/pr95771.c 
b/gcc/testsuite/gcc.target/i386/pr95771.c index d7b67017800b705b9854f561916c20901ea76803..d41be445f4a68613a082b8956fea3ceaf33d7e0f 100644 --- a/gcc/testsuite/gcc.target/i386/pr95771.c +++ b/gcc/testsuite/gcc.target/i386/pr95771.c @@ -1,8 +1,7 @@ /* PR tree-optimization/95771 */ /* { dg-do compile } */ /* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */ -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 6 "optimized" { target int128 } } } */ -/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 4 "optimized" { target { ! int128 } } } } */ +/* { dg-final { scan-tree-dump-times " = __builtin_popcount| = \\.POPCOUNT" 4 "optimized" } } */ int foo (unsigned char x) @@ -51,17 +50,3 @@ qux (unsigned long long x) } return i; } - -#ifdef __SIZEOF_INT128__ -int -corge (unsigned
[PATCH 6/8 v2] docs: Add popcount, clz and ctz target attributes
Updated to reflect Sphinx revert; I'll commit this once the cltz_complement patch is merged. gcc/ChangeLog: * doc/sourcebuild.texi: Add missing target attributes. --- diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi index ffe69d6fcb9c46cf97ba570e85b56e586a0c9b99..1036b185ee289bbf7883bd14956a41da9a6d677b 100644 --- a/gcc/doc/sourcebuild.texi +++ b/gcc/doc/sourcebuild.texi @@ -2511,6 +2511,24 @@ Target supports the execution of @code{amx-fp16} instructions. @item cell_hw Test system can execute AltiVec and Cell PPU instructions. +@item clz +Target supports a clz optab on int. + +@item clzl +Target supports a clz optab on long. + +@item clzll +Target supports a clz optab on long long. + +@item ctz +Target supports a ctz optab on int. + +@item ctzl +Target supports a ctz optab on long. + +@item ctzll +Target supports a ctz optab on long long. + @item cmpccxadd Target supports the execution of @code{cmpccxadd} instructions. @@ -2532,6 +2550,15 @@ Target does not require strict alignment. @item pie_copyreloc The x86-64 target linker supports PIE with copy reloc. +@item popcount +Target supports a popcount optab on int. + +@item popcountl +Target supports a popcount optab on long. + +@item popcountll +Target supports a popcount optab on long long. + @item prefetchi Target supports the execution of @code{prefetchi} instructions.
[PATCH 5/8 v2] middle-end: Add cltz_complement idiom recognition
On Thu, Nov 24, 2022 at 11:41:31AM +0100, Richard Biener wrote: > Note we do have CTZ and CLZ > optabs and internal functions - in case there's a HImode CLZ this > could be elided. More general you can avoid using the __builtin_ > functions with their fixed types in favor of using IFN_C[TL]Z which > are type agnostic (but require optab support - you should be able > to check this via direct_internal_fn_supported_p). IFN support added. I've also renamed the defined_at_zero parameter to define_at_zero, since this is a request for the expression to define it, rather than a guarantee that it is already defined. New patch below, bootstrapped and regression tested on aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu - ok to merge? --- This recognises patterns of the form: while (n) { n >>= 1 } This patch results in improved (but still suboptimal) codegen: foo (unsigned int b) { int c = 0; while (b) { b >>= 1; c++; } return c; } foo: .LFB11: .cfi_startproc cbz w0, .L3 clz w1, w0 tst x0, 1 mov w0, 32 sub w0, w0, w1 cselw0, w0, wzr, ne ret The conditional is unnecessary. phiopt could recognise a redundant csel (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a clz call, but it cannot recognise the redundancy when the input is (e.g.) (32 - clz). I could perhaps extend this function to recognise this pattern in a later patch, if this is a good place to recognise more patterns. gcc/ChangeLog: PR tree-optimization/94793 * tree-scalar-evolution.cc (expression_expensive_p): Add checks for c[lt]z optabs. * tree-ssa-loop-niter.cc (build_cltz_expr): New. (number_of_iterations_cltz_complement): New. (number_of_iterations_bitcount): Add call to the above. gcc/testsuite/ChangeLog: * lib/target-supports.exp (check_effective_target_clz) (check_effective_target_clzl, check_effective_target_clzll) (check_effective_target_ctz, check_effective_target_clzl) (check_effective_target_ctzll): New. * gcc.dg/tree-ssa/cltz-complement-max.c: New test. 
* gcc.dg/tree-ssa/clz-complement-char.c: New test. * gcc.dg/tree-ssa/clz-complement-int.c: New test. * gcc.dg/tree-ssa/clz-complement-long-long.c: New test. * gcc.dg/tree-ssa/clz-complement-long.c: New test. * gcc.dg/tree-ssa/ctz-complement-char.c: New test. * gcc.dg/tree-ssa/ctz-complement-int.c: New test. * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test. * gcc.dg/tree-ssa/ctz-complement-long.c: New test. --- diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c new file mode 100644 index ..1a29ca52e42e50822e4e3213b2cb008b766d0318 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c @@ -0,0 +1,60 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */ + +#define PREC (__CHAR_BIT__) + +int clz_complement_count1 (unsigned char b) { +int c = 0; + +while (b) { + b >>= 1; + c++; +} +if (c <= PREC) + return 0; +else + return 34567; +} + +int clz_complement_count2 (unsigned char b) { +int c = 0; + +while (b) { + b >>= 1; + c++; +} +if (c <= PREC - 1) + return 0; +else + return 76543; +} + +int ctz_complement_count1 (unsigned char b) { +int c = 0; + +while (b) { + b <<= 1; + c++; +} +if (c <= PREC) + return 0; +else + return 23456; +} + +int ctz_complement_count2 (unsigned char b) { +int c = 0; + +while (b) { + b <<= 1; + c++; +} +if (c <= PREC - 1) + return 0; +else + return 65432; +} +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c new file mode 100644 index ..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c @@ -0,0 +1,31 @@ +/* { dg-do run } 
*/ +/* { dg-require-effective-target clz } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#define PREC (__CHAR_BIT__) + +int +__attribute__ ((noinline, noclone)) +foo (unsigned char b) { +int c = 0; + +while (b) { + b >>= 1; + c++; +} + +return c; +} + +int main() +{ + if (foo(0) != 0) +__builtin_abort (); + if (foo(5) != 3) +__builtin_abort (); + if (foo(255) != 8) +__builtin_abort (); + return 0; +} + +/* { dg-final {
Re: [committed] docs: Fix peephole paragraph ordering
Patches attached to the wrong email - this patch was actually: On Thu, Dec 22, 2022 at 05:06:13PM +, Andrew Carlotti via Gcc-patches wrote: > The documentation for the DONE and FAIL macros was incorrectly inserted > between example code, and a remark attached to that example. > > Committed as obvious. > > gcc/ChangeLog: > > * doc/md.texi: Move example code remark next to it's code block. > > --- diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index cc28f868fc85b5148450548a54d69a39ecc4f03a..c1d3ae2060d800bbaa9751fcf841d7417af1e37d 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -9321,6 +9321,11 @@ so here's a silly made-up example: "") @end smallexample +@noindent +If we had not added the @code{(match_dup 4)} in the middle of the input +sequence, it might have been the case that the register we chose at the +beginning of the sequence is killed by the first or second @code{set}. + There are two special macros defined for use in the preparation statements: @code{DONE} and @code{FAIL}. Use them with a following semicolon, as a statement. @@ -9348,11 +9353,6 @@ If the preparation falls through (invokes neither @code{DONE} nor @code{FAIL}), then the @code{define_peephole2} uses the replacement template. -@noindent -If we had not added the @code{(match_dup 4)} in the middle of the input -sequence, it might have been the case that the register we chose at the -beginning of the sequence is killed by the first or second @code{set}. - @end ifset @ifset INTERNALS @node Insn Attributes
Re: [committed] docs: Link to correct section for constraint modifiers
Patches attached to the wrong emails - this patch was actually: On Thu, Dec 22, 2022 at 05:05:51PM +, Andrew Carlotti via Gcc-patches wrote: > Committed as obvious. > > gcc/ChangeLog: > > * doc/md.texi: Fix incorrect pxref. > > --- diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 482e86f15d8b312c67d4962510ce879fb5cbc541..78dc6d720700ca409677e44a34a60d4b7fceb046 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -1511,7 +1511,7 @@ operand 1 (meaning it must match operand 0), and @samp{dKs} for operand 2. The second alternative has @samp{d} (data register) for operand 0, @samp{0} for operand 1, and @samp{dmKs} for operand 2. The @samp{=} and @samp{%} in the constraints apply to all the alternatives; their -meaning is explained in the next section (@pxref{Class Preferences}). +meaning is explained in a later section (@pxref{Modifiers}). If all the operands fit any one alternative, the instruction is valid. Otherwise, for each alternative, the compiler counts how many instructions
[committed] docs: Fix peephole paragraph ordering
The documentation for the DONE and FAIL macros was incorrectly inserted between example code, and a remark attached to that example. Committed as obvious. gcc/ChangeLog: * doc/md.texi: Move example code remark next to its code block. --- diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 482e86f15d8b312c67d4962510ce879fb5cbc541..78dc6d720700ca409677e44a34a60d4b7fceb046 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -1511,7 +1511,7 @@ operand 1 (meaning it must match operand 0), and @samp{dKs} for operand 2. The second alternative has @samp{d} (data register) for operand 0, @samp{0} for operand 1, and @samp{dmKs} for operand 2. The @samp{=} and @samp{%} in the constraints apply to all the alternatives; their -meaning is explained in the next section (@pxref{Class Preferences}). +meaning is explained in a later section (@pxref{Modifiers}). If all the operands fit any one alternative, the instruction is valid. Otherwise, for each alternative, the compiler counts how many instructions
[committed] docs: Fix inconsistent example predicate name
It is unclear why the example C function was renamed to `commutative_integer_operator` as part of ec8e098d in 2004, while the text and the example md were both left as `commutative_operator`. The latter name appears to be more accurate, so revert the 2004 change. Committed as obvious. gcc/ChangeLog: * doc/md.texi: Fix inconsistent example name. --- diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 78dc6d720700ca409677e44a34a60d4b7fceb046..cc28f868fc85b5148450548a54d69a39ecc4f03a 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -377,7 +377,7 @@ commutative arithmetic operators of RTL and whose mode is @var{mode}: @smallexample int -commutative_integer_operator (x, mode) +commutative_operator (x, mode) rtx x; machine_mode mode; @{
[committed] docs: Link to correct section for constraint modifiers
Committed as obvious. gcc/ChangeLog: * doc/md.texi: Fix incorrect pxref. --- diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index cc28f868fc85b5148450548a54d69a39ecc4f03a..c1d3ae2060d800bbaa9751fcf841d7417af1e37d 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -9321,6 +9321,11 @@ so here's a silly made-up example: "") @end smallexample +@noindent +If we had not added the @code{(match_dup 4)} in the middle of the input +sequence, it might have been the case that the register we chose at the +beginning of the sequence is killed by the first or second @code{set}. + There are two special macros defined for use in the preparation statements: @code{DONE} and @code{FAIL}. Use them with a following semicolon, as a statement. @@ -9348,11 +9353,6 @@ If the preparation falls through (invokes neither @code{DONE} nor @code{FAIL}), then the @code{define_peephole2} uses the replacement template. -@noindent -If we had not added the @code{(match_dup 4)} in the middle of the input -sequence, it might have been the case that the register we chose at the -beginning of the sequence is killed by the first or second @code{set}. - @end ifset @ifset INTERNALS @node Insn Attributes
Re: [PATCH 5/8] middle-end: Add cltz_complement idiom recognition
On Mon, Nov 14, 2022 at 04:10:22PM +0100, Richard Biener wrote: > On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches > wrote: > > > > This recognises patterns of the form: > > while (n) { n >>= 1 } > > > > This patch results in improved (but still suboptimal) codegen: > > > > foo (unsigned int b) { > > int c = 0; > > > > while (b) { > > b >>= 1; > > c++; > > } > > > > return c; > > } > > > > foo: > > .LFB11: > > .cfi_startproc > > cbz w0, .L3 > > clz w1, w0 > > tst x0, 1 > > mov w0, 32 > > sub w0, w0, w1 > > cselw0, w0, wzr, ne > > ret > > > > The conditional is unnecessary. phiopt could recognise a redundant csel > > (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a > > clz call, but it cannot recognise the redunancy when the input is (e.g.) > > (32 - clz). > > > > I could perhaps extend this function to recognise this pattern in a later > > patch, if this is a good place to recognise more patterns. > > > > gcc/ChangeLog: > > + PR tree-optimization/94793 > > * tree-scalar-evolution.cc (expression_expensive_p): Add checks > > for c[lt]z optabs. > > * tree-ssa-loop-niter.cc (build_cltz_expr): New. > > (number_of_iterations_cltz_complement): New. > > (number_of_iterations_bitcount): Add call to the above. > > > > gcc/testsuite/ChangeLog: > > > > * lib/target-supports.exp (check_effective_target_clz) > > (check_effective_target_clzl, check_effective_target_clzll) > > (check_effective_target_ctz, check_effective_target_clzl) > > (check_effective_target_ctzll): New. > > * gcc.dg/tree-ssa/cltz-complement-max.c: New test. > > * gcc.dg/tree-ssa/clz-complement-char.c: New test. > > * gcc.dg/tree-ssa/clz-complement-int.c: New test. > > * gcc.dg/tree-ssa/clz-complement-long-long.c: New test. > > * gcc.dg/tree-ssa/clz-complement-long.c: New test. > > * gcc.dg/tree-ssa/ctz-complement-char.c: New test. > > * gcc.dg/tree-ssa/ctz-complement-int.c: New test. > > * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test. 
> > * gcc.dg/tree-ssa/ctz-complement-long.c: New test. > > > > > > -- > > > > [snip test diffs] > > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc > > index > > 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9 > > 100644 > > --- a/gcc/tree-scalar-evolution.cc > > +++ b/gcc/tree-scalar-evolution.cc > > @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map > uint64_t> , > > library call for popcount when backend does not have an instruction > > to do so. We consider this to be expensive and generate > > __builtin_popcount only when backend defines it. */ > > + optab optab; > >combined_fn cfn = get_call_combined_fn (expr); > >switch (cfn) > > { > > CASE_CFN_POPCOUNT: > > + optab = popcount_optab; > > + goto bitcount_call; > > + CASE_CFN_CLZ: > > + optab = clz_optab; > > + goto bitcount_call; > > + CASE_CFN_CTZ: > > + optab = ctz_optab; > > +bitcount_call: > > /* Check if opcode for popcount is available in the mode > > required. */ > > - if (optab_handler (popcount_optab, > > + if (optab_handler (optab, > > TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, > > 0 > > == CODE_FOR_nothing) > > { > > @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map > uint64_t> , > > instructions. */ > > if (is_a (mode, _mode) > > && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD > > - && (optab_handler (popcount_optab, word_mode) > > + && (optab_handler (optab, word_mode) > > != CODE_FOR_nothing)) > > break; > > return true; > > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc > > index > > fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547 > > 100644 > > -
[PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max
This requirement is enforced by a gcc_checking_assert in record_estimate. gcc/ChangeLog: * tree-ssa-loop.h (tree_niter_desc): Update comment. -- diff --git a/gcc/tree-ssa-loop.h b/gcc/tree-ssa-loop.h index 6c70f795d171f22b3ed75873fec4920fea75255b..c24215be8822c31a05eaedcf4d3a26db0feab6cf 100644 --- a/gcc/tree-ssa-loop.h +++ b/gcc/tree-ssa-loop.h @@ -52,7 +52,8 @@ public: may_be_zero == false), more precisely the number of executions of the latch of the loop. */ widest_int max; /* The upper bound on the number of iterations of - the loop. */ + the loop. If niter is constant, then these values + must agree. */ /* The simplified shape of the exit condition. This information is used by loop unrolling. If CMP is ERROR_MARK, then the loop cannot be unrolled.
[PATCH 7/8] middle-end: Add c[lt]z idiom recognition
This recognises the patterns of the form: while (n & 1) { n >>= 1 } Unfortunately there are currently two issues relating to this patch. Firstly, simplify_using_initial_conditions does not recognise that (n != 0) and ((n & 1) == 0) implies that ((n >> 1) != 0). These preconditions arise following the loop copy-header pass, and the assumptions returned by number_of_iterations_exit_assumptions then prevent final value replacement from using the niter result. I'm not sure what is the best way to fix this - one approach could be to modify simplify_using_initial_conditions to handle this sort of case, but it seems that it basically wants the information that ranger could give anyway, so would something like that be a better option? The second issue arises in the vectoriser, which is able to determine that the niter->assumptions are always true. When building with -march=armv8.4-a+sve -S -O3, we get this codegen: foo (unsigned int b) { int c = 0; if (b == 0) return PREC; while (!(b & (1 << (PREC - 1)))) { b <<= 1; c++; } return c; } foo: .LFB0: .cfi_startproc cmp w0, 0 cbz w0, .L6 blt .L7 lsl w1, w0, 1 clz w2, w1 cmp w2, 14 bls .L8 mov x0, 0 cntwx3 add w1, w2, 1 index z1.s, #0, #1 whilelo p0.s, wzr, w1 .L4: add x0, x0, x3 mov p1.b, p0.b mov z0.d, z1.d whilelo p0.s, w0, w1 incwz1.s b.any .L4 add z0.s, z0.s, #1 lastb w0, p1, z0.s ret .p2align 2,,3 .L8: mov w0, 0 b .L3 .p2align 2,,3 .L13: lsl w1, w1, 1 .L3: add w0, w0, 1 tbz w1, #31, .L13 ret .p2align 2,,3 .L6: mov w0, 32 ret .p2align 2,,3 .L7: mov w0, 0 ret .cfi_endproc In essence, the vectoriser uses the niter information to determine exactly how many iterations of the loop it needs to run. It then uses SVE whilelo instructions to run this number of iterations. The original loop counter is also vectorised, despite only being used in the final iteration, and then the final value of this counter is used as the return value (which is the same as the number of iterations it computed in the first place). 
This vectorisation is obviously bad, and I think it exposes a latent bug in the vectoriser, rather than being an issue caused by this specific patch. gcc/ChangeLog: * tree-ssa-loop-niter.cc (number_of_iterations_cltz): New. (number_of_iterations_bitcount): Add call to the above. (number_of_iterations_exit_assumptions): Add EQ_EXPR case for c[lt]z idiom recognition. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/cltz-max.c: New test. * gcc.dg/tree-ssa/clz-char.c: New test. * gcc.dg/tree-ssa/clz-int.c: New test. * gcc.dg/tree-ssa/clz-long-long.c: New test. * gcc.dg/tree-ssa/clz-long.c: New test. * gcc.dg/tree-ssa/ctz-char.c: New test. * gcc.dg/tree-ssa/ctz-int.c: New test. * gcc.dg/tree-ssa/ctz-long-long.c: New test. * gcc.dg/tree-ssa/ctz-long.c: New test. -- diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c new file mode 100644 index ..a6bea3d338940efee2e7e1c95a5941525945af9e --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c @@ -0,0 +1,72 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */ + +#define PREC (__CHAR_BIT__) + +int clz_count1 (unsigned char b) { +int c = 0; + +if (b == 0) + return 0; + +while (!(b & (1 << (PREC - 1 { + b <<= 1; + c++; +} +if (c <= PREC - 1) + return 0; +else + return 34567; +} + +int clz_count2 (unsigned char b) { +int c = 0; + +if (b == 0) + return 0; + +while (!(b & (1 << PREC - 1))) { + b <<= 1; + c++; +} +if (c <= PREC - 2) + return 0; +else + return 76543; +} + +int ctz_count1 (unsigned char b) { +int c = 0; + +if (b == 0) + return 0; + +while (!(b & 1)) { + b >>= 1; + c++; +} +if (c <= PREC - 1) + return 0; +else + return 23456; +} + +int ctz_count2 (unsigned char b) { +int c = 0; + +if (b == 0) + return 0; + +while (!(b & 1)) { + b >>= 1; + c++; +} +if (c <= PREC - 2) + return 0; +else + return 65432; +} +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" 
} } */ +/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c
[PATCH 6/8] docs: Add popcount, clz and ctz target attributes
gcc/ChangeLog: * doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst: Add missing target attributes. -- diff --git a/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst b/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst index 709e4ea2b903cfad4faed40899020b29bc9b5811..8410c40d38fceb83ea8c6ba3bbf0fba5db7929e5 100644 --- a/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst +++ b/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst @@ -1075,6 +1075,24 @@ Other hardware attributes ``cell_hw`` Test system can execute AltiVec and Cell PPU instructions. +``clz`` + Target supports a clz optab on int. + +``clzl`` + Target supports a clz optab on long. + +``clzll`` + Target supports a clz optab on long long. + +``ctz`` + Target supports a ctz optab on int. + +``ctzl`` + Target supports a ctz optab on long. + +``ctzll`` + Target supports a ctz optab on long long. + ``cmpccxadd`` Target supports the execution of ``cmpccxadd`` instructions. @@ -1096,6 +1114,15 @@ Other hardware attributes ``pie_copyreloc`` The x86-64 target linker supports PIE with copy reloc. +``popcount`` + Target supports a popcount optab on int. + +``popcountl`` + Target supports a popcount optab on long. + +``popcountll`` + Target supports a popcount optab on long long. + ``prefetchi`` Target supports the execution of ``prefetchi`` instructions.
[PATCH 5/8] middle-end: Add cltz_complement idiom recognition
This recognises patterns of the form: while (n) { n >>= 1 } This patch results in improved (but still suboptimal) codegen: foo (unsigned int b) { int c = 0; while (b) { b >>= 1; c++; } return c; } foo: .LFB11: .cfi_startproc cbz w0, .L3 clz w1, w0 tst x0, 1 mov w0, 32 sub w0, w0, w1 cselw0, w0, wzr, ne ret The conditional is unnecessary. phiopt could recognise a redundant csel (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a clz call, but it cannot recognise the redundancy when the input is (e.g.) (32 - clz). I could perhaps extend this function to recognise this pattern in a later patch, if this is a good place to recognise more patterns. gcc/ChangeLog: * tree-scalar-evolution.cc (expression_expensive_p): Add checks for c[lt]z optabs. * tree-ssa-loop-niter.cc (build_cltz_expr): New. (number_of_iterations_cltz_complement): New. (number_of_iterations_bitcount): Add call to the above. gcc/testsuite/ChangeLog: * lib/target-supports.exp (check_effective_target_clz) (check_effective_target_clzl, check_effective_target_clzll) (check_effective_target_ctz, check_effective_target_clzl) (check_effective_target_ctzll): New. * gcc.dg/tree-ssa/cltz-complement-max.c: New test. * gcc.dg/tree-ssa/clz-complement-char.c: New test. * gcc.dg/tree-ssa/clz-complement-int.c: New test. * gcc.dg/tree-ssa/clz-complement-long-long.c: New test. * gcc.dg/tree-ssa/clz-complement-long.c: New test. * gcc.dg/tree-ssa/ctz-complement-char.c: New test. * gcc.dg/tree-ssa/ctz-complement-int.c: New test. * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test. * gcc.dg/tree-ssa/ctz-complement-long.c: New test. 
-- diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c new file mode 100644 index ..1a29ca52e42e50822e4e3213b2cb008b766d0318 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c @@ -0,0 +1,60 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */ + +#define PREC (__CHAR_BIT__) + +int clz_complement_count1 (unsigned char b) { +int c = 0; + +while (b) { + b >>= 1; + c++; +} +if (c <= PREC) + return 0; +else + return 34567; +} + +int clz_complement_count2 (unsigned char b) { +int c = 0; + +while (b) { + b >>= 1; + c++; +} +if (c <= PREC - 1) + return 0; +else + return 76543; +} + +int ctz_complement_count1 (unsigned char b) { +int c = 0; + +while (b) { + b <<= 1; + c++; +} +if (c <= PREC) + return 0; +else + return 23456; +} + +int ctz_complement_count2 (unsigned char b) { +int c = 0; + +while (b) { + b <<= 1; + c++; +} +if (c <= PREC - 1) + return 0; +else + return 65432; +} +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c new file mode 100644 index ..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c @@ -0,0 +1,31 @@ +/* { dg-do run } */ +/* { dg-require-effective-target clz } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#define PREC (__CHAR_BIT__) + +int +__attribute__ ((noinline, noclone)) +foo (unsigned char b) { +int c = 0; + +while (b) { + b >>= 1; + c++; +} + +return c; +} + +int main() +{ + if (foo(0) != 0) +__builtin_abort (); + if (foo(5) != 3) +__builtin_abort (); + if (foo(255) != 8) +__builtin_abort (); + return 0; +} + 
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c new file mode 100644 index ..f2c5c23f6a7d84ecb637c6961698b0fc30d7426b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c @@ -0,0 +1,31 @@ +/* { dg-do run } */ +/* { dg-require-effective-target clz } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +#define PREC (__CHAR_BIT__ * __SIZEOF_INT__) + +int +__attribute__ ((noinline, noclone)) +foo (unsigned int b) { +int c = 0; + +while (b) { + b >>= 1; + c++; +} + +return c; +} + +int main() +{ + if (foo(0) != 0) +__builtin_abort (); + if (foo(5) != 3)
[PATCH 4/8] Modify test, to prevent the next patch breaking it
The upcoming c[lt]z idiom recognition patch eliminates the need for a brute force computation of the iteration count of these loops. The test is intended to verify that ivcanon can determine the loop count when the condition is given by a chain of constant computations. We replace the constant operations with a more complicated chain that should resist future idiom recognition. gcc/testsuite/ChangeLog: * gcc.dg/pr77975.c: Make tests more robust. -- diff --git a/gcc/testsuite/gcc.dg/pr77975.c b/gcc/testsuite/gcc.dg/pr77975.c index 148cebdded964da7fce148abdf2a430c55650513..a187ce2b50c2821841e71b5b6cb243a37a66fb57 100644 --- a/gcc/testsuite/gcc.dg/pr77975.c +++ b/gcc/testsuite/gcc.dg/pr77975.c @@ -7,10 +7,11 @@ unsigned int foo (unsigned int *b) { - unsigned int a = 3; + unsigned int a = 8; while (a) { - a >>= 1; + a += 5; + a &= 44; *b += a; } return a; @@ -21,10 +22,11 @@ foo (unsigned int *b) unsigned int bar (unsigned int *b) { - unsigned int a = 7; + unsigned int a = 3; while (a) { - a >>= 1; + a += 5; + a &= 44; *b += a; } return a;
[PATCH 3/8] middle-end: Refactor number_of_iterations_popcount
This includes various changes to improve clarity, and to enable the code to be more similar to the clz and ctz idiom recognition added in subsequent patches. We create a new number_of_iterations_bitcount function, which will be used to call the other bit-counting recognition functions added in subsequent patches, as well as a generic comment describing the loop structures that are common to each idiom. Some of the variables in number_of_iterations_popcount are given more descriptive names, and the popcount expression builder is extracted into a separate function. As part of the refactoring, we also fix a bug where the max loop count for modes shorter than an integer would be incorrectly computed as if the input mode were actually an integer. We also ensure that niter->max takes into account the final value for niter->niter (after any folding and simplifying), since if the latter is a constant, then record_estimate mandates that the two values are equivalent. gcc/ChangeLog: * tree-ssa-loop-niter.cc (number_of_iterations_exit_assumptions): Modify to call... (number_of_iterations_bitcount): ...this new function. (number_of_iterations_popcount): Now called by the above. Refactor, and extract popcount expression builder to... (build_popcount_expr): this new function. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/popcount-max.c: New test. 
-- diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c new file mode 100644 index ..ca7204cbc3cea636183408e24d7dd36d702ffdb2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */ + +#define PREC (__CHAR_BIT__) + +int count1 (unsigned char b) { +int c = 0; + +while (b) { + b &= b - 1; + c++; +} +if (c <= PREC) + return 0; +else + return 34567; +} + +int count2 (unsigned char b) { +int c = 0; + +while (b) { + b &= b - 1; + c++; +} +if (c <= PREC - 1) + return 0; +else + return 76543; +} + +/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */ diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc index 0af34e46580bb9a6f9b40e09c9f29b8454a4aaf6..fece876099c1687569d6351e7d2416ea6acae5b5 100644 --- a/gcc/tree-ssa-loop-niter.cc +++ b/gcc/tree-ssa-loop-niter.cc @@ -2026,6 +2026,48 @@ number_of_iterations_cond (class loop *loop, return ret; } +/* Return an expression that computes the popcount of src. 
*/ + +static tree +build_popcount_expr (tree src) +{ + tree fn; + int prec = TYPE_PRECISION (TREE_TYPE (src)); + int i_prec = TYPE_PRECISION (integer_type_node); + int li_prec = TYPE_PRECISION (long_integer_type_node); + int lli_prec = TYPE_PRECISION (long_long_integer_type_node); + if (prec <= i_prec) +fn = builtin_decl_implicit (BUILT_IN_POPCOUNT); + else if (prec == li_prec) +fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL); + else if (prec == lli_prec || prec == 2 * lli_prec) +fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL); + else +return NULL_TREE; + + tree utype = unsigned_type_for (TREE_TYPE (src)); + src = fold_convert (utype, src); + if (prec < i_prec) +src = fold_convert (unsigned_type_node, src); + tree call; + if (prec == 2 * lli_prec) +{ + tree src1 = fold_convert (long_long_unsigned_type_node, + fold_build2 (RSHIFT_EXPR, TREE_TYPE (src), +unshare_expr (src), +build_int_cst (integer_type_node, + lli_prec))); + tree src2 = fold_convert (long_long_unsigned_type_node, src); + tree call1 = build_call_expr (fn, 1, src1); + tree call2 = build_call_expr (fn, 1, src2); + call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2); +} + else +call = build_call_expr (fn, 1, src); + + return call; +} + /* Utility function to check if OP is defined by a stmt that is a val - 1. */ @@ -2041,45 +2083,18 @@ ssa_defined_by_minus_one_stmt_p (tree op, tree val) && integer_minus_onep (gimple_assign_rhs2 (stmt))); } -/* See if LOOP is a popcout implementation, determine NITER for the loop +/* See comment below for number_of_iterations_bitcount. + For popcount, we have: - We match: - - goto + modify: + _1 = iv_1 + -1 + iv_2 = iv_1 & _1 - - _1 = b_11 + -1 - b_6 = _1 & b_11 - - - b_11 = PHI + test: + if (iv != 0) - exit block - if (b_11 != 0) - goto - else - goto - - OR we match copy-header version: - if (b_5 != 0) - goto - else - goto - - - b_11 = PHI - _1 = b_11 + -1 - b_6 = _1 &
[PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount
gcc/ChangeLog: * tree-ssa-loop-niter.c (ssa_defined_by_minus_one_stmt_p): Move (number_of_iterations_popcount): Move, and remove separate prototype. -- diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc index cdbb924216243ebcabe6c695698a4aee71882c49..c23643fd9dd8b27ff11549e1f28f585534e84cd3 100644 --- a/gcc/tree-ssa-loop-niter.cc +++ b/gcc/tree-ssa-loop-niter.cc @@ -63,11 +63,6 @@ struct bounds mpz_t below, up; }; -static bool number_of_iterations_popcount (loop_p loop, edge exit, - enum tree_code code, - class tree_niter_desc *niter); - - /* Splits expression EXPR to a variable part VAR and constant OFFSET. */ static void @@ -2031,6 +2026,200 @@ number_of_iterations_cond (class loop *loop, return ret; } +/* Utility function to check if OP is defined by a stmt + that is a val - 1. */ + +static bool +ssa_defined_by_minus_one_stmt_p (tree op, tree val) +{ + gimple *stmt; + return (TREE_CODE (op) == SSA_NAME + && (stmt = SSA_NAME_DEF_STMT (op)) + && is_gimple_assign (stmt) + && (gimple_assign_rhs_code (stmt) == PLUS_EXPR) + && val == gimple_assign_rhs1 (stmt) + && integer_minus_onep (gimple_assign_rhs2 (stmt))); +} + +/* See if LOOP is a popcout implementation, determine NITER for the loop + + We match: + + goto + + + _1 = b_11 + -1 + b_6 = _1 & b_11 + + + b_11 = PHI + + exit block + if (b_11 != 0) + goto + else + goto + + OR we match copy-header version: + if (b_5 != 0) + goto + else + goto + + + b_11 = PHI + _1 = b_11 + -1 + b_6 = _1 & b_11 + + exit block + if (b_6 != 0) + goto + else + goto + + If popcount pattern, update NITER accordingly. + i.e., set NITER to __builtin_popcount (b) + return true if we did, false otherwise. + + */ + +static bool +number_of_iterations_popcount (loop_p loop, edge exit, + enum tree_code code, + class tree_niter_desc *niter) +{ + bool adjust = true; + tree iter; + HOST_WIDE_INT max; + adjust = true; + tree fn = NULL_TREE; + + /* Check loop terminating branch is like + if (b != 0). 
*/ + gimple *stmt = last_stmt (exit->src); + if (!stmt + || gimple_code (stmt) != GIMPLE_COND + || code != NE_EXPR + || !integer_zerop (gimple_cond_rhs (stmt)) + || TREE_CODE (gimple_cond_lhs (stmt)) != SSA_NAME) +return false; + + gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (stmt)); + + /* Depending on copy-header is performed, feeding PHI stmts might be in + the loop header or loop latch, handle this. */ + if (gimple_code (and_stmt) == GIMPLE_PHI + && gimple_bb (and_stmt) == loop->header + && gimple_phi_num_args (and_stmt) == 2 + && (TREE_CODE (gimple_phi_arg_def (and_stmt, +loop_latch_edge (loop)->dest_idx)) + == SSA_NAME)) +{ + /* SSA used in exit condition is defined by PHI stmt + b_11 = PHI + from the PHI stmt, get the and_stmt + b_6 = _1 & b_11. */ + tree t = gimple_phi_arg_def (and_stmt, loop_latch_edge (loop)->dest_idx); + and_stmt = SSA_NAME_DEF_STMT (t); + adjust = false; +} + + /* Make sure it is indeed an and stmt (b_6 = _1 & b_11). */ + if (!is_gimple_assign (and_stmt) + || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR) +return false; + + tree b_11 = gimple_assign_rhs1 (and_stmt); + tree _1 = gimple_assign_rhs2 (and_stmt); + + /* Check that _1 is defined by _b11 + -1 (_1 = b_11 + -1). + Also make sure that b_11 is the same in and_stmt and _1 defining stmt. + Also canonicalize if _1 and _b11 are revrsed. */ + if (ssa_defined_by_minus_one_stmt_p (b_11, _1)) +std::swap (b_11, _1); + else if (ssa_defined_by_minus_one_stmt_p (_1, b_11)) +; + else +return false; + /* Check the recurrence: + ... = PHI . */ + gimple *phi = SSA_NAME_DEF_STMT (b_11); + if (gimple_code (phi) != GIMPLE_PHI + || (gimple_bb (phi) != loop_latch_edge (loop)->dest) + || (gimple_assign_lhs (and_stmt) + != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx))) +return false; + + /* We found a match. Get the corresponding popcount builtin. 
*/ + tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx); + if (TYPE_PRECISION (TREE_TYPE (src)) <= TYPE_PRECISION (integer_type_node)) +fn = builtin_decl_implicit (BUILT_IN_POPCOUNT); + else if (TYPE_PRECISION (TREE_TYPE (src)) + == TYPE_PRECISION (long_integer_type_node)) +fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL); + else if (TYPE_PRECISION (TREE_TYPE (src)) + == TYPE_PRECISION (long_long_integer_type_node) + || (TYPE_PRECISION (TREE_TYPE (src)) + == 2 * TYPE_PRECISION (long_long_integer_type_node))) +fn =
[PATCH 0/8] middle-end: Ensure at_stmt is defined before an early exit
This prevents a null dereference error when outputting debug information following an early exit from number_of_iterations_exit_assumptions. gcc/ChangeLog: * tree-ssa-loop-niter.cc (number_of_iterations_exit_assumptions): Move at_stmt assignment. -- diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc index 4ffcef4f4ff2fe182fbe711553c8e4575560ab07..cdbb924216243ebcabe6c695698a4aee71882c49 100644 --- a/gcc/tree-ssa-loop-niter.cc +++ b/gcc/tree-ssa-loop-niter.cc @@ -2537,6 +2537,9 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit, if (!stmt) return false; + if (at_stmt) +*at_stmt = stmt; + /* We want the condition for staying inside loop. */ code = gimple_cond_code (stmt); if (exit->flags & EDGE_TRUE_VALUE) @@ -2642,9 +2645,6 @@ number_of_iterations_exit_assumptions (class loop *loop, edge exit, if (TREE_CODE (niter->niter) == INTEGER_CST) niter->max = wi::to_widest (niter->niter); - if (at_stmt) -*at_stmt = stmt; - return (!integer_zerop (niter->assumptions)); }
[PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements
This is a series of patches to improve recognition of popcount and clz/ctz idioms, along with some related fixes. - Patches 1 and 8 are independent fixes or improvements. - Patch 4 is a dependency of patch 5, as it improves the robustness of a test that would otherwise begin failing. - Patches 2, 3, 5 and 7 form the main dependent sequence. - Patch 6 is a documentation update, covering attributes in patch 5 and existing code. - Patch 7 may require other work before it can be merged, as it seems to expose a latent issue in the vectoriser. Each patch has been bootstrapped and regression tested on aarch64-none-linux-gnu.
[committed] Improve comment for tree_niter_desc.{control,bound,cmp}
Fix typos and explain ERROR_MARK usage. gcc/ChangeLog: * tree-ssa-loop.h: Improve comment --- diff --git a/gcc/tree-ssa-loop.h b/gcc/tree-ssa-loop.h index 415f461c37e4cd7df0b49f6104f796c49cc830fa..6c70f795d171f22b3ed75873fec4920fea75255b 100644 --- a/gcc/tree-ssa-loop.h +++ b/gcc/tree-ssa-loop.h @@ -54,11 +54,11 @@ public: widest_int max; /* The upper bound on the number of iterations of the loop. */ - /* The simplified shape of the exit condition. The loop exits if - CONTROL CMP BOUND is false, where CMP is one of NE_EXPR, - LT_EXPR, or GT_EXPR, and step of CONTROL is positive if CMP is - LE_EXPR and negative if CMP is GE_EXPR. This information is used - by loop unrolling. */ + /* The simplified shape of the exit condition. This information is used by + loop unrolling. If CMP is ERROR_MARK, then the loop cannot be unrolled. + Otherwise, the loop exits if CONTROL CMP BOUND is false, where CMP is one + of NE_EXPR, LT_EXPR, or GT_EXPR, and CONTROL.STEP is positive if CMP is + LT_EXPR and negative if CMP is GT_EXPR. */ affine_iv control; tree bound; enum tree_code cmp;
[committed] docs: Fix outdated reference to LOOPS_HAVE_MARKED_SINGLE_EXITS
This reference has been wrong since 2007; committed as an obvious fix. gcc/ChangeLog: * doc/loop.texi: Refer to LOOPS_HAVE_RECORDED_EXITS instead. diff --git a/gcc/doc/loop.texi b/gcc/doc/loop.texi index d7b71a24dbfed284b13da702bd5037691a515535..6e8657a074d2447db7ae9b75cbfbb71282b84287 100644 --- a/gcc/doc/loop.texi +++ b/gcc/doc/loop.texi @@ -210,7 +210,7 @@ loop in depth-first search order in reversed CFG, ordered by dominance relation, and breath-first search order, respectively. @item @code{single_exit}: Returns the single exit edge of the loop, or @code{NULL} if the loop has more than one exit. You can only use this -function if LOOPS_HAVE_MARKED_SINGLE_EXITS property is used. +function if @code{LOOPS_HAVE_RECORDED_EXITS} is used. @item @code{get_loop_exit_edges}: Enumerates the exit edges of a loop. @item @code{just_once_each_iteration_p}: Returns true if the basic block is executed exactly once during each iteration of a loop (that is, it
[PATCH v2.1 3/4] aarch64: Consolidate simd type lookup functions
On Wed, Jul 13, 2022 at 05:36:04PM +0100, Richard Sandiford wrote: > I like the part about getting rid of: > > static tree > aarch64_simd_builtin_type (machine_mode mode, > bool unsigned_p, bool poly_p) > > and the flow of the new function. However, I think it's still > slightly more readable if we keep the switch and lookup routines > separate, partly to keep down the size of the main routine and > partly to avoid the goto. I agree. > So how about: > > - aarch64_simd_builtin_std_type becomes aarch64_int_or_fp_element_type > but otherwise stays as-is > > ... I've called it aarch64_int_or_fp_type, because it's sometimes used for an operand that doesn't represent an element of a vector. Updated patch below. --- There were several similarly-named functions, which each built or looked up an operand type using a different subset of valid modes or qualifiers. This change provides a single function to return operand types, which can additionally handle const and pointer qualifiers. For clarity, the existing functionality is kept in separate helper functions. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (aarch64_simd_builtin_std_type): Rename to... (aarch64_int_or_fp_type): ...this, and allow irrelevant qualifiers. (aarch64_lookup_simd_builtin_type): Rename to... (aarch64_simd_builtin_type): ...this. Add const/pointer support, and extract table lookup to... (aarch64_lookup_simd_type_in_table): ...this function. (aarch64_init_crc32_builtins): Update to use aarch64_simd_builtin_type. (aarch64_init_fcmla_laneq_builtins): Ditto. (aarch64_init_simd_builtin_functions): Ditto. 
--- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 55ad2e8b6831d6cc2b039270c8656d429347092d..cd7c2a79d9b4d67adf1d9de1f9b56eb3a0d1ee2b 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -788,12 +788,13 @@ aarch64_general_mangle_builtin_type (const_tree type) return NULL; } +/* Helper function for aarch64_simd_builtin_type. */ static tree -aarch64_simd_builtin_std_type (machine_mode mode, - enum aarch64_type_qualifiers q) +aarch64_int_or_fp_type (machine_mode mode, + enum aarch64_type_qualifiers qualifiers) { -#define QUAL_TYPE(M) \ - ((q == qualifier_none) ? int##M##_type_node : unsigned_int##M##_type_node); +#define QUAL_TYPE(M) ((qualifiers & qualifier_unsigned) \ + ? unsigned_int##M##_type_node : int##M##_type_node); switch (mode) { case E_QImode: @@ -826,16 +827,14 @@ aarch64_simd_builtin_std_type (machine_mode mode, #undef QUAL_TYPE } +/* Helper function for aarch64_simd_builtin_type. */ static tree -aarch64_lookup_simd_builtin_type (machine_mode mode, - enum aarch64_type_qualifiers q) +aarch64_lookup_simd_type_in_table (machine_mode mode, + enum aarch64_type_qualifiers qualifiers) { int i; int nelts = sizeof (aarch64_simd_types) / sizeof (aarch64_simd_types[0]); - - /* Non-poly scalar modes map to standard types not in the table. */ - if (q != qualifier_poly && !VECTOR_MODE_P (mode)) -return aarch64_simd_builtin_std_type (mode, q); + int q = qualifiers & (qualifier_poly | qualifier_unsigned); for (i = 0; i < nelts; i++) { @@ -852,16 +851,32 @@ aarch64_lookup_simd_builtin_type (machine_mode mode, return NULL_TREE; } +/* Return a type for an operand with specified mode and qualifiers. 
*/ static tree aarch64_simd_builtin_type (machine_mode mode, - bool unsigned_p, bool poly_p) + enum aarch64_type_qualifiers qualifiers) { - if (poly_p) -return aarch64_lookup_simd_builtin_type (mode, qualifier_poly); - else if (unsigned_p) -return aarch64_lookup_simd_builtin_type (mode, qualifier_unsigned); + tree type = NULL_TREE; + + /* For pointers, we want a pointer to the basic type of the vector. */ + if ((qualifiers & qualifier_pointer) && VECTOR_MODE_P (mode)) +mode = GET_MODE_INNER (mode); + + /* Non-poly scalar modes map to standard types not in the table. */ + if ((qualifiers & qualifier_poly) || VECTOR_MODE_P (mode)) +type = aarch64_lookup_simd_type_in_table (mode, qualifiers); else -return aarch64_lookup_simd_builtin_type (mode, qualifier_none); +type = aarch64_int_or_fp_type (mode, qualifiers); + + gcc_assert (type != NULL_TREE); + + /* Add qualifiers. */ + if (qualifiers & qualifier_const) +type = build_qualified_type (type, TYPE_QUAL_CONST); + if (qualifiers & qualifier_pointer) +type = build_pointer_type (type); + + return type; } static void @@ -1110,12 +1125,11 @@ aarch64_init_fcmla_laneq_builtins (void) { aarch64_fcmla_laneq_builtin_datum* d = _fcmla_lane_builtin_data[i]; - tree
Re: [PATCH v2 1/2] aarch64: Don't return invalid GIMPLE assign statements
On Wed, Jul 13, 2022 at 02:32:16PM +0200, Richard Biener wrote: > On Wed, Jul 13, 2022 at 12:50 PM Andrew Carlotti > wrote: > > I specifically wanted to avoid not folding the call, because always > > folding means that the builtin doesn't need to be implemented anywhere > > else (which isn't relevant here, but may become relevant when folding > > newly defined builtins in the future). > > > > I considered dropping the statement, but I wasn't sure at the time that > > I could do it safely. I could send a patch to instead replace new_stmt > > with a GIMPLE_NOP. > > If you can be sure there's no side-effect on the RHS then I think > I'd prefer that over allocating an SSA name for something that's > going to be DCEd anyway. > > Richard. I discussed this off-list with Richard Sandiford, and we agreed that it would be better to leave this code as it is. The only time this form is likely to arise is if the statement has side-effects (e.g. reading from volatile memory or triggering floating-point exceptions), in which case we can't just replace it with a nop. On the other hand, in the event that someone has written an entirely redundant statement, then it will quickly get eliminated anyway. Adding code to distinguish between the two cases here, or to handle the hard case, is unnecessary and wouldn't be worthwhile. > > > >> gcc/ChangeLog: > > > >> > > > >> * config/aarch64/aarch64-builtins.cc > > > >> (aarch64_general_gimple_fold_builtin): Add fixup for invalid GIMPLE. > > > >> > > > >> gcc/testsuite/ChangeLog: > > > >> > > > >> * gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c: New test. 
> > > >> > > > >> --- > > > >> > > > >> diff --git a/gcc/config/aarch64/aarch64-builtins.cc > > > >> b/gcc/config/aarch64/aarch64-builtins.cc > > > >> index > > > >> e0a741ac663188713e21f457affa57217d074783..5753988a9964967c27a03aca5fddb9025fd8ed6e > > > >> 100644 > > > >> --- a/gcc/config/aarch64/aarch64-builtins.cc > > > >> +++ b/gcc/config/aarch64/aarch64-builtins.cc > > > >> @@ -3022,6 +3022,16 @@ aarch64_general_gimple_fold_builtin (unsigned > > > >> int fcode, gcall *stmt, > > > >> default: > > > >>break; > > > >> } > > > >> + > > > >> + /* GIMPLE assign statements (unlike calls) require a non-null lhs. > > > >> If we > > > >> + created an assign statement with a null lhs, then fix this by > > > >> assigning > > > >> + to a new (and subsequently unused) variable. */ > > > >> + if (new_stmt && is_gimple_assign (new_stmt) && !gimple_assign_lhs > > > >> (new_stmt)) > > > >> +{ > > > >> + tree new_lhs = make_ssa_name (gimple_call_return_type (stmt)); > > > >> + gimple_assign_set_lhs (new_stmt, new_lhs); > > > >> +} > > > >> + > > > >>return new_stmt; > > > >> } > > > >> > > > >> diff --git > > > >> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c > > > >> > > > >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c > > > >> new file mode 100644 > > > >> index > > > >> ..345307456b175307f5cb22de5e59cfc6254f2737 > > > >> --- /dev/null > > > >> +++ > > > >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c > > > >> @@ -0,0 +1,9 @@ > > > >> +/* { dg-do compile { target { aarch64*-*-* } } } */ > > > >> + > > > >> +#include > > > >> + > > > >> +int8_t *bar(); > > > >> + > > > >> +void foo() { > > > >> + __builtin_aarch64_ld1v16qi(bar()); > > > >> +}
[committed] MAINTAINERS: Add myself to Write After Approval
ChangeLog: * MAINTAINERS: Add myself to Write After Approval. diff --git a/MAINTAINERS b/MAINTAINERS index 7d9aab76dd9676c806bd08abc7542553fcf81928..7a7ad42ced3027f1f7970916b355fd5fc7b0088c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -352,6 +352,7 @@ Kevin Buettner Andrew Burgess Adam Butcher Andrew Cagney +Andrew Carlotti Daniel Carrera Stephane Carrez Gabriel Charette
[PATCH v2 4/4] aarch64: Move vreinterpret definitions into the compiler
This removes a significant number of intrinsic definitions from the arm_neon.h header file, and reduces the amount of code duplication. The new macros and data structures are intended to also facilitate moving other intrinsic definitions out of the header file in future. There is a slight change in the behaviour of the bf16 vreinterpret intrinsics when compiling without bf16 support. Expressions like: b = vreinterpretq_s32_bf16(vreinterpretq_bf16_s64(a)) are now compiled successfully, instead of causing a 'target specific option mismatch' during inlining. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (MODE_d_bf16, MODE_d_f16, MODE_d_f32, MODE_d_f64, MODE_d_s8) (MODE_d_s16, MODE_d_s32, MODE_d_s64, MODE_d_u8, MODE_d_u16) (MODE_d_u32, MODE_d_u64, MODE_d_p8, MODE_d_p16, MODE_d_p64) (MODE_q_bf16, MODE_q_f16, MODE_q_f32, MODE_q_f64, MODE_q_s8) (MODE_q_s16, MODE_q_s32, MODE_q_s64, MODE_q_u8, MODE_q_u16) (MODE_q_u32, MODE_q_u64, MODE_q_p8, MODE_q_p16, MODE_q_p64) (MODE_q_p128): Define macro to map to corresponding mode name. (QUAL_bf16, QUAL_f16, QUAL_f32, QUAL_f64, QUAL_s8, QUAL_s16) (QUAL_s32, QUAL_s64, QUAL_u8, QUAL_u16, QUAL_u32, QUAL_u64) (QUAL_p8, QUAL_p16, QUAL_p64, QUAL_p128): Define macro to map to corresponding qualifier name. (LENGTH_d, LENGTH_q): Define macro to map to "" or "q" suffix. (SIMD_INTR_MODE, SIMD_INTR_QUAL, SIMD_INTR_LENGTH_CHAR): Macro functions for the above mappings (VREINTERPRET_BUILTIN2, VREINTERPRET_BUILTINS1, VREINTERPRET_BUILTINS) (VREINTERPRETQ_BUILTIN2, VREINTERPRETQ_BUILTINS1) (VREINTERPRETQ_BUILTINS, VREINTERPRET_BUILTIN) (AARCH64_SIMD_VREINTERPRET_BUILTINS): New macros to create definitions for all vreinterpret intrinsics (enum aarch64_builtins): Add vreinterpret function codes (aarch64_init_simd_intrinsics): New (handle_arm_neon_h): Improved comment. 
(aarch64_general_fold_builtin): Fold vreinterpret calls * config/aarch64/arm_neon.h (vreinterpret_p8_f16, vreinterpret_p8_f64, vreinterpret_p8_s8) (vreinterpret_p8_s16, vreinterpret_p8_s32, vreinterpret_p8_s64) (vreinterpret_p8_f32, vreinterpret_p8_u8, vreinterpret_p8_u16) (vreinterpret_p8_u32, vreinterpret_p8_u64, vreinterpret_p8_p16) (vreinterpret_p8_p64, vreinterpretq_p8_f64, vreinterpretq_p8_s8) (vreinterpretq_p8_s16, vreinterpretq_p8_s32, vreinterpretq_p8_s64) (vreinterpretq_p8_f16, vreinterpretq_p8_f32, vreinterpretq_p8_u8) (vreinterpretq_p8_u16, vreinterpretq_p8_u32, vreinterpretq_p8_u64) (vreinterpretq_p8_p16, vreinterpretq_p8_p64, vreinterpretq_p8_p128) (vreinterpret_p16_f16, vreinterpret_p16_f64, vreinterpret_p16_s8) (vreinterpret_p16_s16, vreinterpret_p16_s32, vreinterpret_p16_s64) (vreinterpret_p16_f32, vreinterpret_p16_u8, vreinterpret_p16_u16) (vreinterpret_p16_u32, vreinterpret_p16_u64, vreinterpret_p16_p8) (vreinterpret_p16_p64, vreinterpretq_p16_f64, vreinterpretq_p16_s8) (vreinterpretq_p16_s16, vreinterpretq_p16_s32, vreinterpretq_p16_s64) (vreinterpretq_p16_f16, vreinterpretq_p16_f32, vreinterpretq_p16_u8) (vreinterpretq_p16_u16, vreinterpretq_p16_u32, vreinterpretq_p16_u64) (vreinterpretq_p16_p8, vreinterpretq_p16_p64, vreinterpretq_p16_p128) (vreinterpret_p64_f16, vreinterpret_p64_f64, vreinterpret_p64_s8) (vreinterpret_p64_s16, vreinterpret_p64_s32, vreinterpret_p64_s64) (vreinterpret_p64_f32, vreinterpret_p64_u8, vreinterpret_p64_u16) (vreinterpret_p64_u32, vreinterpret_p64_u64, vreinterpret_p64_p8) (vreinterpret_p64_p16, vreinterpretq_p64_f64, vreinterpretq_p64_s8) (vreinterpretq_p64_s16, vreinterpretq_p64_s32, vreinterpretq_p64_s64) (vreinterpretq_p64_f16, vreinterpretq_p64_f32, vreinterpretq_p64_p128) (vreinterpretq_p64_u8, vreinterpretq_p64_u16, vreinterpretq_p64_p16) (vreinterpretq_p64_u32, vreinterpretq_p64_u64, vreinterpretq_p64_p8) (vreinterpretq_p128_p8, vreinterpretq_p128_p16, vreinterpretq_p128_f16) (vreinterpretq_p128_f32, 
vreinterpretq_p128_p64, vreinterpretq_p128_s64) (vreinterpretq_p128_u64, vreinterpretq_p128_s8, vreinterpretq_p128_s16) (vreinterpretq_p128_s32, vreinterpretq_p128_u8, vreinterpretq_p128_u16) (vreinterpretq_p128_u32, vreinterpret_f16_f64, vreinterpret_f16_s8) (vreinterpret_f16_s16): (vreinterpret_f16_s32): (vreinterpret_f16_s64): (vreinterpret_f16_f32, vreinterpret_f16_u8, vreinterpret_f16_u16) (vreinterpret_f16_u32, vreinterpret_f16_u64, vreinterpret_f16_p8) (vreinterpret_f16_p16, vreinterpret_f16_p64, vreinterpretq_f16_f64) (vreinterpretq_f16_s8, vreinterpretq_f16_s16, vreinterpretq_f16_s32) (vreinterpretq_f16_s64, vreinterpretq_f16_f32, vreinterpretq_f16_u8)
[PATCH v2 2/4] aarch64: Remove qualifier_internal
This has been unused since 2014, so there's no reason to retain it. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (enum aarch64_type_qualifiers): Remove qualifier_internal. (aarch64_init_simd_builtin_functions): Remove qualifier_internal check. --- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 52d27c6978990ca3e6c523654fe1cdc952e77ad7..55ad2e8b6831d6cc2b039270c8656d429347092d 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -145,9 +145,7 @@ enum aarch64_type_qualifiers qualifier_maybe_immediate = 0x10, /* 1 << 4 */ /* void foo (...). */ qualifier_void = 0x20, /* 1 << 5 */ - /* Some patterns may have internal operands, this qualifier is an - instruction to the initialisation code to skip this operand. */ - qualifier_internal = 0x40, /* 1 << 6 */ + /* 1 << 6 is now unused */ /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum rather than using the type of the operand. */ qualifier_map_mode = 0x80, /* 1 << 7 */ @@ -1207,10 +1205,6 @@ aarch64_init_simd_builtin_functions (bool called_from_pragma) else type_signature[op_num] = 's'; - /* Skip an internal operand for vget_{low, high}. */ - if (qualifiers & qualifier_internal) - continue; - /* Some builtins have different user-facing types for certain arguments, encoded in d->mode. */ if (qualifiers & qualifier_map_mode)
[PATCH v2 3/4] aarch64: Consolidate simd type lookup functions
There were several similarly-named functions, which each built or looked up a type using a different subset of valid modes or qualifiers. This change combines these all into a single function, which can additionally handle const and pointer qualifiers. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (aarch64_simd_builtin_std_type, aarch64_lookup_simd_builtin_type) (aarch64_simd_builtin_type): Combine and replace with... (aarch64_build_simd_builtin_type): ...this new function. (aarch64_init_fcmla_laneq_builtins): Update to call new function. (aarch64_init_simd_builtin_functions): Ditto. (aarch64_init_crc32_builtins): Ditto. --- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 55ad2e8b6831d6cc2b039270c8656d429347092d..6b413a36a09c7a4ac41b0fe7c414a3247580f222 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -789,79 +789,101 @@ aarch64_general_mangle_builtin_type (const_tree type) } static tree -aarch64_simd_builtin_std_type (machine_mode mode, - enum aarch64_type_qualifiers q) -{ -#define QUAL_TYPE(M) \ - ((q == qualifier_none) ? 
int##M##_type_node : unsigned_int##M##_type_node); - switch (mode) -{ -case E_QImode: - return QUAL_TYPE (QI); -case E_HImode: - return QUAL_TYPE (HI); -case E_SImode: - return QUAL_TYPE (SI); -case E_DImode: - return QUAL_TYPE (DI); -case E_TImode: - return QUAL_TYPE (TI); -case E_OImode: - return aarch64_simd_intOI_type_node; -case E_CImode: - return aarch64_simd_intCI_type_node; -case E_XImode: - return aarch64_simd_intXI_type_node; -case E_HFmode: - return aarch64_fp16_type_node; -case E_SFmode: - return float_type_node; -case E_DFmode: - return double_type_node; -case E_BFmode: - return aarch64_bf16_type_node; -default: - gcc_unreachable (); -} -#undef QUAL_TYPE -} - -static tree -aarch64_lookup_simd_builtin_type (machine_mode mode, - enum aarch64_type_qualifiers q) +aarch64_build_simd_builtin_type (machine_mode mode, +enum aarch64_type_qualifiers qualifiers) { + tree type = NULL_TREE; int i; int nelts = sizeof (aarch64_simd_types) / sizeof (aarch64_simd_types[0]); - /* Non-poly scalar modes map to standard types not in the table. */ - if (q != qualifier_poly && !VECTOR_MODE_P (mode)) -return aarch64_simd_builtin_std_type (mode, q); + /* For pointers, we want a pointer to the basic type of the vector. */ + if ((qualifiers & qualifier_pointer) && VECTOR_MODE_P (mode)) +mode = GET_MODE_INNER (mode); - for (i = 0; i < nelts; i++) + if ((qualifiers & qualifier_poly) || VECTOR_MODE_P (mode)) { - if (aarch64_simd_types[i].mode == mode - && aarch64_simd_types[i].q == q) - return aarch64_simd_types[i].itype; - if (aarch64_simd_tuple_types[i][0] != NULL_TREE) - for (int j = 0; j < 3; j++) - if (aarch64_simd_tuple_modes[i][j] == mode + int q = qualifiers & (qualifier_poly | qualifier_unsigned); + /* Poly or vector modes map to types in the table. 
*/ + for (i = 0; i < nelts; i++) + { + if (aarch64_simd_types[i].mode == mode && aarch64_simd_types[i].q == q) - return aarch64_simd_tuple_types[i][j]; + { + type = aarch64_simd_types[i].itype; + goto finished_type_lookup; + } + if (aarch64_simd_tuple_types[i][0] != NULL_TREE) + { + for (int j = 0; j < 3; j++) + { + if (aarch64_simd_tuple_modes[i][j] == mode + && aarch64_simd_types[i].q == q) + { + type = aarch64_simd_tuple_types[i][j]; + goto finished_type_lookup; + } + } + } + } } + else +{ + /* Non-poly scalar modes map to standard types. */ +#define QUAL_TYPE(M) ((qualifiers & qualifier_unsigned) \ + ? unsigned_int##M##_type_node : int##M##_type_node); + switch (mode) + { + case E_QImode: + type = QUAL_TYPE (QI); + break; + case E_HImode: + type = QUAL_TYPE (HI); + break; + case E_SImode: + type = QUAL_TYPE (SI); + break; + case E_DImode: + type = QUAL_TYPE (DI); + break; + case E_TImode: + type = QUAL_TYPE (TI); + break; + case E_OImode: + type = aarch64_simd_intOI_type_node; + break; + case E_CImode: + type = aarch64_simd_intCI_type_node; + break; + case E_XImode: + type = aarch64_simd_intXI_type_node; + break; + case E_HFmode: + type = aarch64_fp16_type_node; +
[PATCH v2 1/4] aarch64: Add V1DI mode
We already have a V1DF mode, so this makes the vector modes more consistent. Additionally, this allows us to recognise uint64x1_t and int64x1_t types given only the mode and type qualifiers (e.g. in aarch64_lookup_simd_builtin_type). gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (v1di_UP): Add V1DI mode to _UP macros. * config/aarch64/aarch64-modes.def (VECTOR_MODE): Add V1DI mode * config/aarch64/aarch64-simd-builtin-types.def: Use V1DI mode * config/aarch64/aarch64-simd.md (vec_extractv2dfv1df): Replace with... (vec_extract): ...this. * config/aarch64/aarch64.cc (aarch64_classify_vector_mode): Add V1DI mode * config/aarch64/iterators.md (VQ_2E, V1HALF, V1half): New. (nunits): Add V1DI mode. --- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index c21476d7ae963450b12efa24418ce4004a3c74bf..52d27c6978990ca3e6c523654fe1cdc952e77ad7 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -55,6 +55,7 @@ #define v2si_UP E_V2SImode #define v2sf_UP E_V2SFmode #define v1df_UP E_V1DFmode +#define v1di_UP E_V1DImode #define di_UPE_DImode #define df_UPE_DFmode #define v16qi_UP E_V16QImode diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index 8f399225a8048d93108e33e9d49c736aeb5612ce..d3c9b74434cd2c0d0cb1a2fd26af8c0bf38a4cfa 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ -70,6 +70,7 @@ VECTOR_MODES (INT, 8);/* V8QI V4HI V2SI. */ VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI. */ VECTOR_MODES (FLOAT, 8); /* V2SF. */ VECTOR_MODES (FLOAT, 16); /*V4SF V2DF. */ +VECTOR_MODE (INT, DI, 1); /* V1DI. */ VECTOR_MODE (FLOAT, DF, 1); /* V1DF. */ VECTOR_MODE (FLOAT, HF, 2); /* V2HF. 
*/ diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def b/gcc/config/aarch64/aarch64-simd-builtin-types.def index 248e51e96549fb640817d79c099a3f5e62c71317..40545581408e2ee2be84f08abb5801058c4ea42e 100644 --- a/gcc/config/aarch64/aarch64-simd-builtin-types.def +++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def @@ -24,7 +24,7 @@ ENTRY (Int16x8_t, V8HI, none, 11) ENTRY (Int32x2_t, V2SI, none, 11) ENTRY (Int32x4_t, V4SI, none, 11) - ENTRY (Int64x1_t, DI, none, 11) + ENTRY (Int64x1_t, V1DI, none, 11) ENTRY (Int64x2_t, V2DI, none, 11) ENTRY (Uint8x8_t, V8QI, unsigned, 11) ENTRY (Uint8x16_t, V16QI, unsigned, 12) @@ -32,7 +32,7 @@ ENTRY (Uint16x8_t, V8HI, unsigned, 12) ENTRY (Uint32x2_t, V2SI, unsigned, 12) ENTRY (Uint32x4_t, V4SI, unsigned, 12) - ENTRY (Uint64x1_t, DI, unsigned, 12) + ENTRY (Uint64x1_t, V1DI, unsigned, 12) ENTRY (Uint64x2_t, V2DI, unsigned, 12) ENTRY (Poly8_t, QI, poly, 9) ENTRY (Poly16_t, HI, poly, 10) @@ -42,7 +42,7 @@ ENTRY (Poly8x16_t, V16QI, poly, 12) ENTRY (Poly16x4_t, V4HI, poly, 12) ENTRY (Poly16x8_t, V8HI, poly, 12) - ENTRY (Poly64x1_t, DI, poly, 12) + ENTRY (Poly64x1_t, V1DI, poly, 12) ENTRY (Poly64x2_t, V2DI, poly, 12) ENTRY (Float16x4_t, V4HF, none, 13) ENTRY (Float16x8_t, V8HF, none, 13) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index a00e1c6ef8d6b43d8b1a0fe4701e6b8c1f0f622f..587a45d77721e1b39accbad7dbeca4d741eccb10 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -8026,16 +8026,16 @@ }) ;; Extract a single-element 64-bit vector from one half of a 128-bit vector. 
-(define_expand "vec_extractv2dfv1df" - [(match_operand:V1DF 0 "register_operand") - (match_operand:V2DF 1 "register_operand") +(define_expand "vec_extract<mode><V1half>" + [(match_operand:<V1HALF> 0 "register_operand") + (match_operand:VQ_2E 1 "register_operand") (match_operand 2 "immediate_operand")] "TARGET_SIMD" { - /* V1DF is rarely used by other patterns, so it should be better to hide - it in a subreg destination of a normal DF op. */ - rtx scalar0 = gen_lowpart (DFmode, operands[0]); - emit_insn (gen_vec_extractv2dfdf (scalar0, operands[1], operands[2])); + /* V1DI and V1DF are rarely used by other patterns, so it should be better + to hide it in a subreg destination of a normal DI or DF op. */ + rtx scalar0 = gen_lowpart (<VHALF>mode, operands[0]); + emit_insn (gen_vec_extract<mode><Vhalf> (scalar0, operands[1], operands[2])); DONE; }) diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc index f650abbc4ce49cf0947049931f86bad1130c3428..278910af0a38c0203a962d34c6792191f0fe9e31 100644 --- a/gcc/config/aarch64/aarch64.cc +++ b/gcc/config/aarch64/aarch64.cc @@ -3568,7 +3568,7 @@ aarch64_classify_vector_mode (machine_mode mode) case E_V8QImode: case E_V4HImode: case E_V2SImode: -/* ...E_V1DImode doesn't exist. */ +case E_V1DImode:
Re: [PATCH v2 1/2] aarch64: Don't return invalid GIMPLE assign statements
On Wed, Jul 13, 2022 at 09:10:25AM +0100, Richard Sandiford wrote: > Richard Biener via Gcc-patches writes: > > On Tue, Jul 12, 2022 at 4:38 PM Andrew Carlotti > > wrote: > >> > >> aarch64_general_gimple_fold_builtin doesn't check whether the LHS of a > >> function call is null before converting it to an assign statement. To avoid > >> returning an invalid GIMPLE statement in this case, we instead assign the > >> expression result to a new (unused) variable. > >> > >> This change only affects code that: > >> 1) Calls an intrinsic function that has no side effects; > >> 2) Does not use or store the value returned by the intrinsic; > >> 3) Uses parameters that prevent the front-end eliminating the call prior to > >> gimplification. > >> > >> The ICE is unlikely to have occurred in the wild, as it relies on the > >> presence > >> of a redundant intrinsic call. > > > > Other targets usually simply refrain from folding intrinsic calls with no > > LHS. > > Another option is to just drop it on the floor if it does not have any > > side-effects which for the gimple_fold_builtin hook means folding it to > > a GIMPLE_NOP (gimple_build_nop ()). > > Sorry, I just pushed the patch before seeing this. > > I guess the problem with refraining from folding calls with no lhs > is that it has to be done on a per-function basis. (E.g. stores > should still be folded.) It then becomes something that we need > to remember for each individual call. E.g. ix86_gimple_fold_builtin > seems to have three different pieces of code for handling null lhses, > even with its heavy use of gotos. > > So a nice thing about the current patch is that it handles all this > in one place only. > > Thanks, > Richard I specifically wanted to avoid not folding the call, because always folding means that the builtin doesn't need to be implemented anywhere else (which isn't relevant here, but may become relevant when folding newly defined builtins in the future). 
I considered dropping the statement, but I wasn't sure at the time that I could do it safely. I could send a patch to instead replace new_stmt with a GIMPLE_NOP. > >> gcc/ChangeLog: > >> > >> * config/aarch64/aarch64-builtins.cc > >> (aarch64_general_gimple_fold_builtin): Add fixup for invalid GIMPLE. > >> > >> gcc/testsuite/ChangeLog: > >> > >> * gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c: New test. > >> > >> --- > >> > >> diff --git a/gcc/config/aarch64/aarch64-builtins.cc > >> b/gcc/config/aarch64/aarch64-builtins.cc > >> index > >> e0a741ac663188713e21f457affa57217d074783..5753988a9964967c27a03aca5fddb9025fd8ed6e > >> 100644 > >> --- a/gcc/config/aarch64/aarch64-builtins.cc > >> +++ b/gcc/config/aarch64/aarch64-builtins.cc > >> @@ -3022,6 +3022,16 @@ aarch64_general_gimple_fold_builtin (unsigned int > >> fcode, gcall *stmt, > >> default: > >>break; > >> } > >> + > >> + /* GIMPLE assign statements (unlike calls) require a non-null lhs. If we > >> + created an assign statement with a null lhs, then fix this by > >> assigning > >> + to a new (and subsequently unused) variable. */ > >> + if (new_stmt && is_gimple_assign (new_stmt) && !gimple_assign_lhs > >> (new_stmt)) > >> +{ > >> + tree new_lhs = make_ssa_name (gimple_call_return_type (stmt)); > >> + gimple_assign_set_lhs (new_stmt, new_lhs); > >> +} > >> + > >>return new_stmt; > >> } > >> > >> diff --git > >> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c > >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c > >> new file mode 100644 > >> index > >> ..345307456b175307f5cb22de5e59cfc6254f2737 > >> --- /dev/null > >> +++ > >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c > >> @@ -0,0 +1,9 @@ > >> +/* { dg-do compile { target { aarch64*-*-* } } } */ > >> + > >> +#include > >> + > >> +int8_t *bar(); > >> + > >> +void foo() { > >> + __builtin_aarch64_ld1v16qi(bar()); > >> +}
[PATCH v2 2/2] aarch64: Lower vcombine to GIMPLE
This lowers vcombine intrinsics to a GIMPLE vector constructor, which enables better optimisation during GIMPLE passes. gcc/ * config/aarch64/aarch64-builtins.c (aarch64_general_gimple_fold_builtin): Add combine. gcc/testsuite/ * gcc.target/aarch64/advsimd-intrinsics/combine.c: New test. diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 5753988a9964967c27a03aca5fddb9025fd8ed6e..a25756cfed5fab3a98ebf3e2ee29a5e117cbd2aa 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -2857,6 +2857,28 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); break; + BUILTIN_VDC (BINOP, combine, 0, AUTO_FP) + BUILTIN_VD_I (BINOPU, combine, 0, NONE) + BUILTIN_VDC_P (BINOPP, combine, 0, NONE) + { + tree first_part, second_part; + if (BYTES_BIG_ENDIAN) + { + second_part = args[0]; + first_part = args[1]; + } + else + { + first_part = args[0]; + second_part = args[1]; + } + tree ret_type = TREE_TYPE (gimple_call_lhs (stmt)); + tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, first_part, + NULL_TREE, second_part); + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor); + } + break; + /*lower store and load neon builtins to gimple. 
*/ BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c new file mode 100644 index ..d08faf7a4a160a1e83428ed9b270731bbf7b8c8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c @@ -0,0 +1,18 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include <arm_neon.h> + +/* +** foo: +** umov w0, v1\.s\[1\] +** ret +*/ + +int32_t foo (int32x2_t a, int32x2_t b) +{ + int32x4_t c = vcombine_s32(a, b); + return vgetq_lane_s32(c, 3); +} +
[PATCH v2 1/2] aarch64: Don't return invalid GIMPLE assign statements
aarch64_general_gimple_fold_builtin doesn't check whether the LHS of a function call is null before converting it to an assign statement. To avoid returning an invalid GIMPLE statement in this case, we instead assign the expression result to a new (unused) variable. This change only affects code that: 1) Calls an intrinsic function that has no side effects; 2) Does not use or store the value returned by the intrinsic; 3) Uses parameters that prevent the front-end eliminating the call prior to gimplification. The ICE is unlikely to have occurred in the wild, as it relies on the presence of a redundant intrinsic call. gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (aarch64_general_gimple_fold_builtin): Add fixup for invalid GIMPLE. gcc/testsuite/ChangeLog: * gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c: New test. --- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index e0a741ac663188713e21f457affa57217d074783..5753988a9964967c27a03aca5fddb9025fd8ed6e 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -3022,6 +3022,16 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, default: break; } + + /* GIMPLE assign statements (unlike calls) require a non-null lhs. If we + created an assign statement with a null lhs, then fix this by assigning + to a new (and subsequently unused) variable. 
*/ + if (new_stmt && is_gimple_assign (new_stmt) && !gimple_assign_lhs (new_stmt)) +{ + tree new_lhs = make_ssa_name (gimple_call_return_type (stmt)); + gimple_assign_set_lhs (new_stmt, new_lhs); +} + return new_stmt; } diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c new file mode 100644 index ..345307456b175307f5cb22de5e59cfc6254f2737 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c @@ -0,0 +1,9 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ + +#include <arm_neon.h> + +int8_t *bar(); + +void foo() { + __builtin_aarch64_ld1v16qi(bar()); +}
Re: [PATCH] aarch64: Fix pure/const function attributes for intrinsics
On Fri, Jul 01, 2022 at 08:42:15AM +0200, Richard Biener wrote: > On Thu, Jun 30, 2022 at 6:04 PM Andrew Carlotti via Gcc-patches > wrote: > > diff --git a/gcc/config/aarch64/aarch64-builtins.cc > > b/gcc/config/aarch64/aarch64-builtins.cc > > index > > e0a741ac663188713e21f457affa57217d074783..877f54aab787862794413259cd36ca0fb7bd49c5 > > 100644 > > --- a/gcc/config/aarch64/aarch64-builtins.cc > > +++ b/gcc/config/aarch64/aarch64-builtins.cc > > @@ -1085,9 +1085,9 @@ aarch64_get_attributes (unsigned int f, machine_mode > > mode) > >if (!aarch64_modifies_global_state_p (f, mode)) > > { > >if (aarch64_reads_global_state_p (f, mode)) > > - attrs = aarch64_add_attribute ("pure", attrs); > > - else > > attrs = aarch64_add_attribute ("const", attrs); > > + else > > + attrs = aarch64_add_attribute ("pure", attrs); > > that looks backwards. 'pure' allows read of global memory while > 'const' does not. Is > aarch64_reads_global_state_p really backwards? Oh - the thing that's backwards is my understanding of what "pure" and "const" mean. Their meanings as GCC function attributes seem to be approximately the opposite way round to their meanings in general usage.
[PATCH] aarch64: Fix pure/const function attributes for intrinsics
No testcase for this, since I haven't found a way to turn the incorrect attribute into incorrect codegen. Bootstrapped and tested on aarch64-none-linux-gnu. gcc/ * config/aarch64/aarch64-builtins.c (aarch64_get_attributes): Fix choice of pure/const attributes. --- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index e0a741ac663188713e21f457affa57217d074783..877f54aab787862794413259cd36ca0fb7bd49c5 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -1085,9 +1085,9 @@ aarch64_get_attributes (unsigned int f, machine_mode mode) if (!aarch64_modifies_global_state_p (f, mode)) { if (aarch64_reads_global_state_p (f, mode)) - attrs = aarch64_add_attribute ("pure", attrs); - else attrs = aarch64_add_attribute ("const", attrs); + else + attrs = aarch64_add_attribute ("pure", attrs); } if (!flag_non_call_exceptions || !aarch64_could_trap_p (f, mode))
[PATCH] aarch64: Move vreinterpret definitions into the compiler
Hi, This removes a significant number of intrinsic definitions from the arm_neon.h header file, and reduces the amount of code duplication. The new macros and data structures are intended to also facilitate moving other intrinsic definitions out of the header file in future. There is a a slight change in the behaviour of the bf16 vreinterpret intrinsics when compiling without bf16 support. Expressions like: b = vreinterpretq_s32_bf16(vreinterpretq_bf16_s64(a)); are now compiled successfully, instead of causing a 'target specific option mismatch' during inlining. Bootstrapped and tested on aarch64-none-linux-gnu gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (v1di_UP): Add V1DI mode. (MODE_d_bf16, MODE_d_f16, MODE_d_f32, MODE_d_f64, MODE_d_s8) (MODE_d_s16, MODE_d_s32, MODE_d_s64, MODE_d_u8, MODE_d_u16) (MODE_d_u32, MODE_d_u64, MODE_d_p8, MODE_d_p16, MODE_d_p64) (MODE_q_bf16, MODE_q_f16, MODE_q_f32, MODE_q_f64, MODE_q_s8) (MODE_q_s16, MODE_q_s32, MODE_q_s64, MODE_q_u8, MODE_q_u16) (MODE_q_u32, MODE_q_u64, MODE_q_p8, MODE_q_p16, MODE_q_p64) (MODE_q_p128): Define macro to map to corresponding mode name. (QUAL_bf16, QUAL_f16, QUAL_f32, QUAL_f64, QUAL_s8, QUAL_s16) (QUAL_s32, QUAL_s64, QUAL_u8, QUAL_u16, QUAL_u32, QUAL_u64) (QUAL_p8, QUAL_p16, QUAL_p64, QUAL_p128): Define macro to map to corresponding qualifier name. (LENGTH_d, LENGTH_q): Define macro to map to "" or "q" suffix. (SIMD_INTR_MODE, SIMD_INTR_QUAL, SIMD_INTR_LENGTH_CHAR): Macro functions for the above mappings (VREINTERPRET_BUILTIN2, VREINTERPRET_BUILTINS1, VREINTERPRET_BUILTINS) (VREINTERPRETQ_BUILTIN2, VREINTERPRETQ_BUILTINS1) (VREINTERPRETQ_BUILTINS, VREINTERPRET_BUILTIN) (AARCH64_SIMD_VREINTERPRET_BUILTINS): New macros to create definitions for all vreinterpret intrinsics (enum aarch64_builtins): Add vreinterpret function codes (aarch64_init_simd_intrinsics): New (handle_arm_neon_h): Improved comment. 
(aarch64_general_fold_builtin): Fold vreinterpret calls * config/aarch64/aarch64-modes.def (VECTOR_MODE): Add V1DI mode * config/aarch64/aarch64-simd-builtin-types.def: Use V1DI mode * config/aarch64/aarch64-simd.md (vec_extractv2div1di): New * config/aarch64/aarch64.cc (aarch64_classify_vector_mode): Add V1DI mode * config/aarch64/arm_neon.h (vreinterpret_p8_f16, vreinterpret_p8_f64, vreinterpret_p8_s8) (vreinterpret_p8_s16, vreinterpret_p8_s32, vreinterpret_p8_s64) (vreinterpret_p8_f32, vreinterpret_p8_u8, vreinterpret_p8_u16) (vreinterpret_p8_u32, vreinterpret_p8_u64, vreinterpret_p8_p16) (vreinterpret_p8_p64, vreinterpretq_p8_f64, vreinterpretq_p8_s8) (vreinterpretq_p8_s16, vreinterpretq_p8_s32, vreinterpretq_p8_s64) (vreinterpretq_p8_f16, vreinterpretq_p8_f32, vreinterpretq_p8_u8) (vreinterpretq_p8_u16, vreinterpretq_p8_u32, vreinterpretq_p8_u64) (vreinterpretq_p8_p16, vreinterpretq_p8_p64, vreinterpretq_p8_p128) (vreinterpret_p16_f16, vreinterpret_p16_f64, vreinterpret_p16_s8) (vreinterpret_p16_s16, vreinterpret_p16_s32, vreinterpret_p16_s64) (vreinterpret_p16_f32, vreinterpret_p16_u8, vreinterpret_p16_u16) (vreinterpret_p16_u32, vreinterpret_p16_u64, vreinterpret_p16_p8) (vreinterpret_p16_p64, vreinterpretq_p16_f64, vreinterpretq_p16_s8) (vreinterpretq_p16_s16, vreinterpretq_p16_s32, vreinterpretq_p16_s64) (vreinterpretq_p16_f16, vreinterpretq_p16_f32, vreinterpretq_p16_u8) (vreinterpretq_p16_u16, vreinterpretq_p16_u32, vreinterpretq_p16_u64) (vreinterpretq_p16_p8, vreinterpretq_p16_p64, vreinterpretq_p16_p128) (vreinterpret_p64_f16, vreinterpret_p64_f64, vreinterpret_p64_s8) (vreinterpret_p64_s16, vreinterpret_p64_s32, vreinterpret_p64_s64) (vreinterpret_p64_f32, vreinterpret_p64_u8, vreinterpret_p64_u16) (vreinterpret_p64_u32, vreinterpret_p64_u64, vreinterpret_p64_p8) (vreinterpret_p64_p16, vreinterpretq_p64_f64, vreinterpretq_p64_s8) (vreinterpretq_p64_s16, vreinterpretq_p64_s32, vreinterpretq_p64_s64) (vreinterpretq_p64_f16, vreinterpretq_p64_f32, 
vreinterpretq_p64_p128) (vreinterpretq_p64_u8, vreinterpretq_p64_u16, vreinterpretq_p64_p16) (vreinterpretq_p64_u32, vreinterpretq_p64_u64, vreinterpretq_p64_p8) (vreinterpretq_p128_p8, vreinterpretq_p128_p16, vreinterpretq_p128_f16) (vreinterpretq_p128_f32, vreinterpretq_p128_p64, vreinterpretq_p128_s64) (vreinterpretq_p128_u64, vreinterpretq_p128_s8, vreinterpretq_p128_s16) (vreinterpretq_p128_s32, vreinterpretq_p128_u8, vreinterpretq_p128_u16) (vreinterpretq_p128_u32, vreinterpret_f16_f64, vreinterpret_f16_s8) (vreinterpret_f16_s16): (vreinterpret_f16_s32): (vreinterpret_f16_s64):
[PATCH] aarch64: Lower vcombine to GIMPLE
Hi all, This lowers vcombine intrinsics to a GIMPLE vector constructor, which enables better optimisation during GIMPLE passes. Bootstrapped and tested on aarch64-none-linux-gnu, and tested for aarch64_be-none-linux-gnu via cross-compilation. gcc/ * config/aarch64/aarch64-builtins.c (aarch64_general_gimple_fold_builtin): Add combine. gcc/testsuite/ * gcc.target/aarch64/advsimd-intrinsics/combine.c: New test. --- diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 5217dbdb2ac78bba0a669d22af6d769d1fe91a3d..9d52fb8c5a48c9b743defb340a85fb20a1c8f014 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -2827,6 +2827,18 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); break; + BUILTIN_VDC (BINOP, combine, 0, AUTO_FP) + BUILTIN_VD_I (BINOPU, combine, 0, NONE) + BUILTIN_VDC_P (BINOPP, combine, 0, NONE) + { + if (BYTES_BIG_ENDIAN) + std::swap(args[0], args[1]); + tree ret_type = TREE_TYPE (gimple_call_lhs (stmt)); + tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, args[0], NULL_TREE, args[1]); + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor); + } + break; + /*lower store and load neon builtins to gimple. 
*/ BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c new file mode 100644 index ..d08faf7a4a160a1e83428ed9b270731bbf7b8c8a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c @@ -0,0 +1,18 @@ +/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ + +#include <arm_neon.h> + +/* +** foo: +** umov w0, v1\.s\[1\] +** ret +*/ + +int32_t foo (int32x2_t a, int32x2_t b) +{ + int32x4_t c = vcombine_s32(a, b); + return vgetq_lane_s32(c, 3); +} +