Re: [GCC 13 PATCH] aarch64: Remove architecture dependencies from intrinsics

2023-07-20 Thread Andrew Carlotti via Gcc-patches
On Thu, Jul 20, 2023 at 09:37:14AM +0200, Richard Biener wrote:
> On Thu, Jul 20, 2023 at 8:49 AM Richard Sandiford via Gcc-patches
>  wrote:
> >
> > Andrew Carlotti  writes:
> > > Updated patch to fix the fp16 intrinsic pragmas, and pushed to master.
> > > OK to backport to GCC 13?
> >
> > OK, thanks.
> 
> In case you want it in 13.2 please push it really soon, we want to do 13.2 RC1
> today.
> 
> Richard.

Pushed, thanks.

> 
> > Richard
> >
> > > Many intrinsics currently depend on both an architecture version and a
> > > feature, despite the corresponding instructions being available within
> > > GCC at lower architecture versions.
> > >
> > > LLVM has already removed these explicit architecture version
> > > dependences; this patch does the same for GCC. Note that +fp16 does not
> > > imply +simd, so we need to add an explicit +simd for the Neon fp16
> > > intrinsics.
> > >
> > > Binutils did not previously support all of these architecture+feature
> > > combinations, but this problem is already reachable from GCC.  For
> > > example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c
> > > with -O3 -march=armv8-a+dotprod has resulted in an assembler error since
> > > GCC 10.  This is fixed in Binutils 2.41.
> > >
> > > This patch retains explicit architecture version dependencies for
> > > features that do not currently have a separate feature flag.
> > >
> > > gcc/ChangeLog:
> > >
> > >  * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5
> > >  dependency.
> > >  * config/aarch64/arm_acle.h: Remove unnecessary armv8.x
> > >  dependencies from target pragmas.
> > >  * config/aarch64/arm_fp16.h (target): Likewise.
> > >  * config/aarch64/arm_neon.h (target): Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >  * gcc.target/aarch64/feature-bf16-backport.c: New test.
> > >  * gcc.target/aarch64/feature-dotprod-backport.c: New test.
> > >  * gcc.target/aarch64/feature-fp16-backport.c: New test.
> > >  * gcc.target/aarch64/feature-fp16-scalar-backport.c: New test.
> > >  * gcc.target/aarch64/feature-fp16fml-backport.c: New test.
> > >  * gcc.target/aarch64/feature-i8mm-backport.c: New test.
> > >  * gcc.target/aarch64/feature-memtag-backport.c: New test.
> > >  * gcc.target/aarch64/feature-sha3-backport.c: New test.
> > >  * gcc.target/aarch64/feature-sm4-backport.c: New test.
> > >
> > > ---
> > >
> > > diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> > > index 
> > > a01f1ee99d85917941ffba55bc3b4dcac87b41f6..2b0fc97bb71e9d560ae26035c7d7142682e46c38
> > >  100644
> > > --- a/gcc/config/aarch64/aarch64.h
> > > +++ b/gcc/config/aarch64/aarch64.h
> > > @@ -292,7 +292,7 @@ enum class aarch64_feature : unsigned char {
> > >  #define TARGET_RNG (AARCH64_ISA_RNG)
> > >
> > >  /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
> > > -#define TARGET_MEMTAG (AARCH64_ISA_V8_5A && AARCH64_ISA_MEMTAG)
> > > +#define TARGET_MEMTAG (AARCH64_ISA_MEMTAG)
> > >
> > >  /* I8MM instructions are enabled through +i8mm.  */
> > >  #define TARGET_I8MM (AARCH64_ISA_I8MM)
> > > diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
> > > index 
> > > 3b6b63e6805432b5f1686745f987c52d2967c7c1..7599a32301dadf80760d3cb40a8685d2e6a476fb
> > >  100644
> > > --- a/gcc/config/aarch64/arm_acle.h
> > > +++ b/gcc/config/aarch64/arm_acle.h
> > > @@ -292,7 +292,7 @@ __rndrrs (uint64_t *__res)
> > >  #pragma GCC pop_options
> > >
> > >  #pragma GCC push_options
> > > -#pragma GCC target ("arch=armv8.5-a+memtag")
> > > +#pragma GCC target ("+nothing+memtag")
> > >
> > >  #define __arm_mte_create_random_tag(__ptr, __u64_mask) \
> > >__builtin_aarch64_memtag_irg(__ptr, __u64_mask)
> > > diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h
> > > index 
> > > 350f8cc33d99e16137e9d70fa7958b10924dc67f..c10f9dcf7e097ded1740955addcd73348649dc56
> > >  100644
> > > --- a/gcc/config/aarch64/arm_fp16.h
> > > +++ b/gcc/config/aarch64/arm_fp16.h
> > > @@ -30,7 +30,7 @@
> > >  #include <stdint.h>
> > >
> > >  #pragma GCC push_options
> > > -#pragma GCC target ("arch=armv8.2-a+fp16")
> > > +#pragma GCC target ("+nothing+fp16")
> > >
> > >  typedef __fp16 float16_t;
> > >
> > > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> > > index 
> > > 0ace1eeddb97443433c091d2363403fcf2907654..349f3167699447eb397af482eaeadf8a07617025
> > >  100644
> > > --- a/gcc/config/aarch64/arm_neon.h
> > > +++ b/gcc/config/aarch64/arm_neon.h
> > > @@ -25590,7 +25590,7 @@ __INTERLEAVE_LIST (zip)
> > >  #include "arm_fp16.h"
> > >
> > >  #pragma GCC push_options
> > > -#pragma GCC target ("arch=armv8.2-a+fp16")
> > > +#pragma GCC target ("+nothing+simd+fp16")
> > >
> > >  /* ARMv8.2-A FP16 one operand vector intrinsics.  */
> > >
> > > @@ -26753,7 +26753,7 @@ vminnmvq_f16 (float16x8_t __a)
> > >  /* AdvSIMD Dot Product intrinsics.  */
> > >
> > >  #pragma GCC push_options
> > > -#pragma GCC target ("arch=armv8.2-a+dotprod")
> > > 

Re: [GCC 13 PATCH] aarch64: Remove architecture dependencies from intrinsics

2023-07-19 Thread Andrew Carlotti via Gcc-patches
On Wed, Jul 19, 2023 at 07:35:26PM +0100, Ramana Radhakrishnan wrote:
> On Wed, Jul 19, 2023 at 5:44 PM Andrew Carlotti via Gcc-patches
>  wrote:
> >
> > Updated patch to fix the fp16 intrinsic pragmas, and pushed to master.
> > OK to backport to GCC 13?
> >
> >
> > Many intrinsics currently depend on both an architecture version and a
> > feature, despite the corresponding instructions being available within
> > GCC at lower architecture versions.
> >
> > LLVM has already removed these explicit architecture version
> > dependences; this patch does the same for GCC. Note that +fp16 does not
> > imply +simd, so we need to add an explicit +simd for the Neon fp16
> > intrinsics.
> >
> > Binutils did not previously support all of these architecture+feature
> > combinations, but this problem is already reachable from GCC.  For
> > example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c
> > with -O3 -march=armv8-a+dotprod has resulted in an assembler error since
> > GCC 10.  This is fixed in Binutils 2.41.
> 
> Are there any implementations that actually implement v8-a + dotprod?
> As far as I'm aware this was v8.2-A as the base architecture where
> this was allowed. Has this changed recently?
> 
> 
> regards
> Ramana

I don't recall whether there are any physical implementations of DotProd
without Armv8.2, but similar situations have already occurred with other
features.

There are also situations where developers wish to enable only a subset of
available features.  For example, the existing restrictions in GCC have forced
Chromium to disable their memtag support when building with GCC [1]; with this
patch, they will be able to reenable memtag support from GCC 14 (and GCC 13.x
when this is backported).

I don't see any advantages to trying to enforce minimum architecture versions
for features in GCC, except perhaps maintaining the status quo.  But the status
quo is already rather inconsistent, and these changes only make GCC more
permissive (and only for options that currently don't work).


[1] https://chromium-review.googlesource.com/c/chromium/src/+/3238466
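
As a concrete example (a sketch of mine, not code from the patch), this is
the kind of intrinsic use that can now be compiled with just
-march=armv8-a+memtag:

  #include <arm_acle.h>

  void *
  make_tagged (void *p)
  {
    /* Insert a random tag (IRG), excluding no tags; previously this
       also required arch=armv8.5-a.  */
    return __arm_mte_create_random_tag (p, 0);
  }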


[GCC 13 PATCH] aarch64: Remove architecture dependencies from intrinsics

2023-07-19 Thread Andrew Carlotti via Gcc-patches
Updated patch to fix the fp16 intrinsic pragmas, and pushed to master.
OK to backport to GCC 13?


Many intrinsics currently depend on both an architecture version and a
feature, despite the corresponding instructions being available within
GCC at lower architecture versions.

LLVM has already removed these explicit architecture version
dependences; this patch does the same for GCC. Note that +fp16 does not
imply +simd, so we need to add an explicit +simd for the Neon fp16
intrinsics.

Binutils did not previously support all of these architecture+feature
combinations, but this problem is already reachable from GCC.  For
example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c
with -O3 -march=armv8-a+dotprod has resulted in an assembler error since
GCC 10.  This is fixed in Binutils 2.41.

This patch retains explicit architecture version dependencies for
features that do not currently have a separate feature flag.
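
To illustrate the user-visible effect (my sketch, not a testcase from the
patch): the following now compiles with -march=armv8-a+dotprod, without
raising the base architecture to armv8.2-a:

  #include <arm_neon.h>

  uint32x4_t
  dot (uint32x4_t acc, uint8x16_t a, uint8x16_t b)
  {
    /* Guarded by "+nothing+dotprod" after this patch.  */
    return vdotq_u32 (acc, a, b);
  }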

gcc/ChangeLog:

 * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5
 dependency.
 * config/aarch64/arm_acle.h: Remove unnecessary armv8.x
 dependencies from target pragmas.
 * config/aarch64/arm_fp16.h (target): Likewise.
 * config/aarch64/arm_neon.h (target): Likewise.

gcc/testsuite/ChangeLog:

 * gcc.target/aarch64/feature-bf16-backport.c: New test.
 * gcc.target/aarch64/feature-dotprod-backport.c: New test.
 * gcc.target/aarch64/feature-fp16-backport.c: New test.
 * gcc.target/aarch64/feature-fp16-scalar-backport.c: New test.
 * gcc.target/aarch64/feature-fp16fml-backport.c: New test.
 * gcc.target/aarch64/feature-i8mm-backport.c: New test.
 * gcc.target/aarch64/feature-memtag-backport.c: New test.
 * gcc.target/aarch64/feature-sha3-backport.c: New test.
 * gcc.target/aarch64/feature-sm4-backport.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
a01f1ee99d85917941ffba55bc3b4dcac87b41f6..2b0fc97bb71e9d560ae26035c7d7142682e46c38
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -292,7 +292,7 @@ enum class aarch64_feature : unsigned char {
 #define TARGET_RNG (AARCH64_ISA_RNG)
 
 /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
-#define TARGET_MEMTAG (AARCH64_ISA_V8_5A && AARCH64_ISA_MEMTAG)
+#define TARGET_MEMTAG (AARCH64_ISA_MEMTAG)
 
 /* I8MM instructions are enabled through +i8mm.  */
 #define TARGET_I8MM (AARCH64_ISA_I8MM)
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 
3b6b63e6805432b5f1686745f987c52d2967c7c1..7599a32301dadf80760d3cb40a8685d2e6a476fb
 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -292,7 +292,7 @@ __rndrrs (uint64_t *__res)
 #pragma GCC pop_options
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.5-a+memtag")
+#pragma GCC target ("+nothing+memtag")
 
 #define __arm_mte_create_random_tag(__ptr, __u64_mask) \
   __builtin_aarch64_memtag_irg(__ptr, __u64_mask)
diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h
index 
350f8cc33d99e16137e9d70fa7958b10924dc67f..c10f9dcf7e097ded1740955addcd73348649dc56
 100644
--- a/gcc/config/aarch64/arm_fp16.h
+++ b/gcc/config/aarch64/arm_fp16.h
@@ -30,7 +30,7 @@
 #include <stdint.h>
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+fp16")
+#pragma GCC target ("+nothing+fp16")
 
 typedef __fp16 float16_t;
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 
0ace1eeddb97443433c091d2363403fcf2907654..349f3167699447eb397af482eaeadf8a07617025
 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -25590,7 +25590,7 @@ __INTERLEAVE_LIST (zip)
 #include "arm_fp16.h"
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+fp16")
+#pragma GCC target ("+nothing+simd+fp16")
 
 /* ARMv8.2-A FP16 one operand vector intrinsics.  */
 
@@ -26753,7 +26753,7 @@ vminnmvq_f16 (float16x8_t __a)
 /* AdvSIMD Dot Product intrinsics.  */
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+dotprod")
+#pragma GCC target ("+nothing+dotprod")
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -26844,7 +26844,7 @@ vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, 
int8x16_t __b, const int __index)
 #pragma GCC pop_options
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+sm4")
+#pragma GCC target ("+nothing+sm4")
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -26911,7 +26911,7 @@ vsm4ekeyq_u32 (uint32x4_t __a, uint32x4_t __b)
 #pragma GCC pop_options
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+sha3")
+#pragma GCC target ("+nothing+sha3")
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -27547,7 +27547,7 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t 
__a, float32x4_t __b,
 #pragma GCC pop_options
 
 #pragma GCC 

Re: [PATCH] aarch64: Remove architecture dependencies from intrinsics

2023-06-29 Thread Andrew Carlotti via Gcc-patches
On Tue, Jun 27, 2023 at 07:23:32AM +0100, Richard Sandiford wrote:
> Andrew Carlotti via Gcc-patches  writes:
> > Many intrinsics currently depend on both an architecture version and a
> > feature, despite the corresponding instructions being available within
> > GCC at lower architecture versions.
> >
> > LLVM has already removed these explicit architecture version
> > dependences; this patch does the same for GCC, as well as removing an
> > unnecessary simd dependency for the scalar fp16 intrinsics.
> >
> > Binutils does not support all of these architecture+feature combinations
> > yet, but this is an existing problem that is already reachable from GCC.
> > For example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c
> > with -O3 -march=armv8-a+dotprod has resulted in an assembler error since
> > GCC 10. I intend to patch this in binutils.
> >
> > This patch retains explicit architecture version dependencies for
> > features that do not currently have a separate feature flag.
> >
> > Ok for master, and backport to GCC 13?
> >
> > gcc/ChangeLog:
> >
> >  * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5
> >  dependency.
> >  * config/aarch64/arm_acle.h: Remove unnecessary armv8.x
> >  dependencies from target pragmas.
> >  * config/aarch64/arm_fp16.h (target): Likewise.
> 
> The change to this file is a bit different from the others,
> since it's removing an implicit dependency on +simd, rather
> than a dependency on an architecture level.  I think it'd be
> worth mentioning that explicitly in the changelog.
> 
> OK with that change, thanks.
> 
> (Arguably we should add +nosimd to many of the other pragmas in
> arm_acle.h, but that's logically a separate patch.)
> 
> Richard

Actually, I think I should just remove the +nosimd from the patch, because
+fp16 doesn't enable simd (unlike +bf16, which has simd as an 'explicit on'
implication).

Aside from +bf16, the only other feature with simd as an 'explicit on' is
+rdma. However, there appear to be no non-simd rdma instructions, so
+nothing+rdma+nosimd is effectively the same as +nothing.
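
To make the distinction concrete (my sketch, not part of the patch): the
scalar intrinsics guarded by "+nothing+fp16" need no Advanced SIMD at all,
so code like this compiles with only the scalar fp16 feature enabled:

  #include <arm_fp16.h>

  float16_t
  scalar_add (float16_t a, float16_t b)
  {
    /* vaddh_f16 uses only the scalar +fp16 instructions.  */
    return vaddh_f16 (a, b);
  }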

> > ...
> >
> > diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h
> > index 
> > a8fa4dbbdfe1bab4aa604bb311ef66d4e1de18ac..84b2ed66f9ba19fba6ccd8be33940d7239bfa22e
> >  100644
> > --- a/gcc/config/aarch64/arm_fp16.h
> > +++ b/gcc/config/aarch64/arm_fp16.h
> > @@ -30,7 +30,7 @@
> >  #include <stdint.h>
> >  
> >  #pragma GCC push_options
> > -#pragma GCC target ("arch=armv8.2-a+fp16")
> > +#pragma GCC target ("+nothing+fp16+nosimd")
> >  
> >  typedef __fp16 float16_t;
> >  


[committed] docs: Fix typo

2023-06-26 Thread Andrew Carlotti via Gcc-patches
gcc/ChangeLog:

 * doc/optinfo.texi: Fix "steam" -> "stream".


diff --git a/gcc/doc/optinfo.texi b/gcc/doc/optinfo.texi
index 
b91bba7bd10470b17ca5190688beee06ad3b87ab..5e8c97ef118786e68b7e46f3c802154cb9b57b83
 100644
--- a/gcc/doc/optinfo.texi
+++ b/gcc/doc/optinfo.texi
@@ -100,7 +100,7 @@ that one could also use special file names @code{stdout} and
 respectively.
 
 @item @code{alt_stream}
-This steam is used for printing optimization specific output in
+This stream is used for printing optimization specific output in
 response to the @option{-fopt-info}. Again a file name can be given. If
 the file name is not given, it defaults to @code{stderr}.
 @end table


[PATCH] aarch64: Remove architecture dependencies from intrinsics

2023-06-26 Thread Andrew Carlotti via Gcc-patches
Many intrinsics currently depend on both an architecture version and a
feature, despite the corresponding instructions being available within
GCC at lower architecture versions.

LLVM has already removed these explicit architecture version
dependences; this patch does the same for GCC, as well as removing an
unnecessary simd dependency for the scalar fp16 intrinsics.

Binutils does not support all of these architecture+feature combinations
yet, but this is an existing problem that is already reachable from GCC.
For example, compiling the test gcc.target/aarch64/usadv16qi-dotprod.c
with -O3 -march=armv8-a+dotprod has resulted in an assembler error since
GCC 10. I intend to patch this in binutils.

This patch retains explicit architecture version dependencies for
features that do not currently have a separate feature flag.

Ok for master, and backport to GCC 13?

gcc/ChangeLog:

 * config/aarch64/aarch64.h (TARGET_MEMTAG): Remove armv8.5
 dependency.
 * config/aarch64/arm_acle.h: Remove unnecessary armv8.x
 dependencies from target pragmas.
 * config/aarch64/arm_fp16.h (target): Likewise.
 * config/aarch64/arm_neon.h (target): Likewise.

gcc/testsuite/ChangeLog:

 * gcc.target/aarch64/feature-bf16-backport.c: New test.
 * gcc.target/aarch64/feature-dotprod-backport.c: New test.
 * gcc.target/aarch64/feature-fp16-backport.c: New test.
 * gcc.target/aarch64/feature-fp16-scalar-backport.c: New test.
 * gcc.target/aarch64/feature-fp16fml-backport.c: New test.
 * gcc.target/aarch64/feature-i8mm-backport.c: New test.
 * gcc.target/aarch64/feature-memtag-backport.c: New test.
 * gcc.target/aarch64/feature-sha3-backport.c: New test.
 * gcc.target/aarch64/feature-sm4-backport.c: New test.


diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
7129ed1ff370d597895b3f46b56b1250da7fa190..cdb664eb8f7db820b6b06b2667bfad6dc14cb7a2
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -292,7 +292,7 @@ enum class aarch64_feature : unsigned char {
 #define TARGET_RNG (AARCH64_ISA_RNG)
 
 /* Memory Tagging instructions optional to Armv8.5 enabled through +memtag.  */
-#define TARGET_MEMTAG (AARCH64_ISA_V8_5A && AARCH64_ISA_MEMTAG)
+#define TARGET_MEMTAG (AARCH64_ISA_MEMTAG)
 
 /* I8MM instructions are enabled through +i8mm.  */
 #define TARGET_I8MM (AARCH64_ISA_I8MM)
diff --git a/gcc/config/aarch64/arm_acle.h b/gcc/config/aarch64/arm_acle.h
index 
e0ac591d2c8d6c4c4c8a074b2d9881c47b1db1ab..87fb42f47c5821adecbb0ea441e0a38c63972e77
 100644
--- a/gcc/config/aarch64/arm_acle.h
+++ b/gcc/config/aarch64/arm_acle.h
@@ -325,7 +325,7 @@ __rndrrs (uint64_t *__res)
 #pragma GCC pop_options
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.5-a+memtag")
+#pragma GCC target ("+nothing+memtag")
 
 #define __arm_mte_create_random_tag(__ptr, __u64_mask) \
   __builtin_aarch64_memtag_irg(__ptr, __u64_mask)
diff --git a/gcc/config/aarch64/arm_fp16.h b/gcc/config/aarch64/arm_fp16.h
index 
a8fa4dbbdfe1bab4aa604bb311ef66d4e1de18ac..84b2ed66f9ba19fba6ccd8be33940d7239bfa22e
 100644
--- a/gcc/config/aarch64/arm_fp16.h
+++ b/gcc/config/aarch64/arm_fp16.h
@@ -30,7 +30,7 @@
 #include <stdint.h>
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+fp16")
+#pragma GCC target ("+nothing+fp16+nosimd")
 
 typedef __fp16 float16_t;
 
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 
eeec9f162e223df8cf7803b3227aef22e94227ac..a078674376af121c36bbebef76631c25a6815b1b
 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -25590,7 +25590,7 @@ __INTERLEAVE_LIST (zip)
 #include "arm_fp16.h"
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+fp16")
+#pragma GCC target ("+nothing+fp16")
 
 /* ARMv8.2-A FP16 one operand vector intrinsics.  */
 
@@ -26753,7 +26753,7 @@ vminnmvq_f16 (float16x8_t __a)
 /* AdvSIMD Dot Product intrinsics.  */
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+dotprod")
+#pragma GCC target ("+nothing+dotprod")
 
 __extension__ extern __inline uint32x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -26844,7 +26844,7 @@ vdotq_laneq_s32 (int32x4_t __r, int8x16_t __a, 
int8x16_t __b, const int __index)
 #pragma GCC pop_options
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+sm4")
+#pragma GCC target ("+nothing+sm4")
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -26911,7 +26911,7 @@ vsm4ekeyq_u32 (uint32x4_t __a, uint32x4_t __b)
 #pragma GCC pop_options
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+sha3")
+#pragma GCC target ("+nothing+sha3")
 
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -27547,7 +27547,7 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, float32x4_t 
__a, float32x4_t __b,
 #pragma GCC pop_options
 
 #pragma GCC push_options
-#pragma GCC target ("arch=armv8.2-a+fp16fml")
+#pragma GCC 

Re: [PATCH 1/2]middle-end: Fix wrong overmatching of div-bitmask by using new optabs [PR108583]

2023-03-01 Thread Andrew Carlotti via Gcc-patches
On Thu, Feb 23, 2023 at 11:39:51AM -0500, Andrew MacLeod via Gcc-patches wrote:
> 
> 
> Inheriting from operator_mult is also going to be hazardous because it also
> has an op1_range and op2_range... you should at least define those and
> return VARYING to avoid other issues.  Same thing applies to widen_plus I
> think, and it has relation processing and other things as well.  Your widen
> operands are not what those classes expect, so I think you probably just
> want a fresh range operator.
> 
> It also looks like the mult operation is sign/zero extending both upper
> bounds, and neither lower bound...  I think that should be the LH upper and
> lower bound?
> 
> I've attached a second patch (newversion.patch) which incorporates my fix,
> the fix to the sign of only op1's bounds, as well as a simplification of
> the classes to not inherit from operator_mult/plus...  I think this still
> does what you want? and it won't get you into unexpected trouble later :-)
> 
> let me know if this is still doing what you are expecting...
> 
> Andrew
> 

Hi,

This patch still uses the wrong signedness for some of the extensions in
WIDEN_MULT_EXPR. It currently bases its promotion decisions on whether there
is any signed argument, and whether the result is signed - i.e.:

Patch extends as:
UUU UU
UUS -> USU
USU SU
USS SU  wrong
SUU US  wrong
SUS -> SSU
SSU SS  wrong
SSS SS

The documentation in tree.def is unclear about whether the output signedness is
linked to the input signedness, but at least the SSU case seems valid, and is
mishandled here.
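
A small standalone illustration of the SSU case (mine, not from the patch):
with signed operands and an unsigned widened result, the operands must
still be sign-extended before the multiplication; zero-extending them
computes a different product.

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    int16_t a = -3, b = -4;
    /* Correct SSU widening: sign-extend the signed operands, then view
       the wide product as unsigned.  */
    uint32_t sign_ext = (uint32_t) ((int32_t) a * (int32_t) b);
    /* Zero-extending the same operands multiplies different values.  */
    uint32_t zero_ext = (uint32_t) (uint16_t) a * (uint32_t) (uint16_t) b;
    printf ("%u %u\n", sign_ext, zero_ext);  /* prints: 12 4294508556 */
    return 0;
  }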

I think it would be clearer and simpler to have four (or three) different
versions for each combnation of signedness of the input operands. This could be
implemented without extra code duplication by creating four different instances
of an operator_widen_mult class (perhaps extending a range_operator_mixed_sign
class), with the signedness indicated by two additional class members.

The documentation for WIDEN_PLUS_EXPR (and several other expressions added in
the same commit) is completely missing. If the signs are required to be
matching, then this should be clarified; otherwise it would need the same
special handling as WIDEN_MULT_EXPR.

Andrew

> diff --git a/gcc/gimple-range-op.cc b/gcc/gimple-range-op.cc
> index d9dfdc56939..824e0338f34 100644
> --- a/gcc/gimple-range-op.cc
> +++ b/gcc/gimple-range-op.cc
> @@ -179,6 +179,8 @@ gimple_range_op_handler::gimple_range_op_handler (gimple 
> *s)
>// statements.
>   if (is_a <gcall *> (m_stmt))
>  maybe_builtin_call ();
> +  else
> +maybe_non_standard ();
>  }
>  
>  // Calculate what we can determine of the range of this unary
> @@ -764,6 +766,36 @@ public:
>}
>  } op_cfn_parity;
>  
> +// Set up a gimple_range_op_handler for any nonstandard function which can be
> +// supported via range-ops.
> +
> +void
> +gimple_range_op_handler::maybe_non_standard ()
> +{
> +  if (gimple_code (m_stmt) == GIMPLE_ASSIGN)
> +switch (gimple_assign_rhs_code (m_stmt))
> +  {
> + case WIDEN_MULT_EXPR:
> + {
> +   m_valid = true;
> +   m_op1 = gimple_assign_rhs1 (m_stmt);
> +   m_op2 = gimple_assign_rhs2 (m_stmt);
> +   bool signed1 = TYPE_SIGN (TREE_TYPE (m_op1)) == SIGNED;
> +   bool signed2 = TYPE_SIGN (TREE_TYPE (m_op2)) == SIGNED;
> +   if (signed2 && !signed1)
> + std::swap (m_op1, m_op2);
> +
> +   if (signed1 || signed2)
> + m_int = ptr_op_widen_mult_signed;
> +   else
> + m_int = ptr_op_widen_mult_unsigned;
> +   break;
> + }
> + default:
> +   break;
> +  }
> +}
> +
>  // Set up a gimple_range_op_handler for any built in function which can be
>  // supported via range-ops.
>  
> diff --git a/gcc/gimple-range-op.h b/gcc/gimple-range-op.h
> index 743b858126e..1bf63c5ce6f 100644
> --- a/gcc/gimple-range-op.h
> +++ b/gcc/gimple-range-op.h
> @@ -41,6 +41,7 @@ public:
>relation_trio = TRIO_VARYING);
>  private:
>void maybe_builtin_call ();
> +  void maybe_non_standard ();
>gimple *m_stmt;
>tree m_op1, m_op2;
>  };
> diff --git a/gcc/range-op.cc b/gcc/range-op.cc
> index 5c67bce6d3a..7cd19a92d00 100644
> --- a/gcc/range-op.cc
> +++ b/gcc/range-op.cc
> @@ -1556,6 +1556,34 @@ operator_plus::op2_range (irange &r, tree type,
>return op1_range (r, type, lhs, op1, rel.swap_op1_op2 ());
>  }
>  
> +class operator_widen_plus : public range_operator
> +{
> +public:
> +  virtual void wi_fold (irange &r, tree type,
> +			const wide_int &lh_lb,
> +			const wide_int &lh_ub,
> +			const wide_int &rh_lb,
> +			const wide_int &rh_ub) const;
> +} op_widen_plus;
> +
> +void
> +operator_widen_plus::wi_fold (irange &r, tree type,
> +			      const wide_int &lh_lb, const wide_int &lh_ub,
> +			      const wide_int &rh_lb, const wide_int &rh_ub) const

Re: [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN

2023-01-16 Thread Andrew Carlotti via Gcc-patches
Erm, ignore this - I just rediscovered the approval in a different mail
folder. I forgot that Outlook's automatic email deduplication meant that
messages CC'd to me end up in one of two different folders at random
when I want them in both.


On Mon, Jan 16, 2023 at 02:03:29PM +, Andrew Carlotti via Gcc-patches wrote:
> Hi Richard
> 
> I accidentally pushed this patch earlier in the mistaken belief that
> you'd already approved it. It looks uncontroversial to me - it just adds
> IFN support to build_popcount_expr, analogous to the changes you
> suggested and approved for build_cltz_expr (and adjusts testcases
> accordingly). I might have incorporated it into an earlier patch in this
> series, if I hadn't already pushed that earlier patch.
> 
> Is this OK to leave in master now?
> 
> Thanks,
> Andrew
> 
> On Thu, Dec 22, 2022 at 05:43:21PM +, Andrew Carlotti via Gcc-patches 
> wrote:
> > Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
> > x86_64-pc-linux-gnu - ok to merge?
> > 
> > gcc/ChangeLog:
> > 
> > * tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > * g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex.
> > * gcc.dg/tree-ssa/popcount.c: Likewise.
> > * gcc.dg/tree-ssa/popcount2.c: Likewise.
> > * gcc.dg/tree-ssa/popcount3.c: Likewise.
> > * gcc.target/aarch64/popcount4.c: Likewise.
> > * gcc.target/i386/pr95771.c: Likewise, and...
> > * gcc.target/i386/pr95771-2.c: ...split int128 test from above,
> > since this would emit just a single IFN if a TI optab is added.
> > 
> > ---
> > 
> > diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C 
> > b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> > index 
> > ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c
> >  100644
> > --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> > +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> > @@ -12,5 +12,5 @@ int PopCount (long b) {
> >  return c;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } 
> > } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
> > "optimized" } } */
> >  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> > index 
> > b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6
> >  100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> > @@ -39,4 +39,4 @@ void PopCount3 (long b1) {
> >}
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } 
> > } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 
> > "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> > index 
> > ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be
> >  100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> > @@ -26,4 +26,4 @@ int main()
> >return 0;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } 
> > } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
> > "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> > index 
> > ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c
> >  100644
> > --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> > @@ -12,5 +12,5 @@ int PopCount (long b) {
> >  return c;
> >  }
> >  
> > -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } 
> > } */
> > +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
> > "optimized" } } */
> >  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c 
> > b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> > index 

Re: [PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN

2023-01-16 Thread Andrew Carlotti via Gcc-patches
Hi Richard

I accidentally pushed this patch earlier in the mistaken belief that
you'd already approved it. It looks uncontroversial to me - it just adds
IFN support to build_popcount_expr, analogous to the changes you
suggested and approved for build_cltz_expr (and adjusts testcases
accordingly). I might have incorporated it into an earlier patch in this
series, if I hadn't already pushed that earlier patch.
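
For context (my illustration, not code from the patch), the idiom that
build_popcount_expr replaces is the usual bit-clearing loop:

  int
  count_bits (unsigned long x)
  {
    int i = 0;
    while (x)
      {
        x &= x - 1;  /* clear the lowest set bit each iteration */
        ++i;
      }
    return i;
  }

With IFN support, the replacement can be the type-agnostic .POPCOUNT
internal function whenever the target has a suitable popcount optab,
rather than being limited to the fixed-type __builtin_popcount* built-ins.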

Is this OK to leave in master now?

Thanks,
Andrew

On Thu, Dec 22, 2022 at 05:43:21PM +, Andrew Carlotti via Gcc-patches wrote:
> Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
> x86_64-pc-linux-gnu - ok to merge?
> 
> gcc/ChangeLog:
> 
>   * tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex.
>   * gcc.dg/tree-ssa/popcount.c: Likewise.
>   * gcc.dg/tree-ssa/popcount2.c: Likewise.
>   * gcc.dg/tree-ssa/popcount3.c: Likewise.
>   * gcc.target/aarch64/popcount4.c: Likewise.
>   * gcc.target/i386/pr95771.c: Likewise, and...
>   * gcc.target/i386/pr95771-2.c: ...split int128 test from above,
>   since this would emit just a single IFN if a TI optab is added.
> 
> ---
> 
> diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C 
> b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> index 
> ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c
>  100644
> --- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> +++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
> @@ -12,5 +12,5 @@ int PopCount (long b) {
>  return c;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } 
> */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
> "optimized" } } */
>  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> index 
> b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6
>  100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
> @@ -39,4 +39,4 @@ void PopCount3 (long b1) {
>}
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } 
> */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 
> "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> index 
> ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be
>  100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
> @@ -26,4 +26,4 @@ int main()
>return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } 
> */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
> "optimized" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> index 
> ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c
>  100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
> @@ -12,5 +12,5 @@ int PopCount (long b) {
>  return c;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } 
> */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
> "optimized" } } */
>  /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c 
> b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> index 
> ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0
>  100644
> --- a/gcc/testsuite/gcc.target/aarch64/popcount4.c
> +++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c
> @@ -11,4 +11,4 @@ int PopCount (long b) {
>  return c;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } 
> */
> +/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 
> "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c 
> b/gcc/testsuite/gcc.target/i386/pr95771-2.c
> new file mode 100644
> index 
> ..1db9dc94d0b66477667624012221d6844c141a26
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i

[PATCH 9/8] middle-end: Allow build_popcount_expr to use an IFN

2022-12-22 Thread Andrew Carlotti via Gcc-patches
Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
x86_64-pc-linux-gnu - ok to merge?

gcc/ChangeLog:

* tree-ssa-loop-niter.cc (build_popcount_expr): Add IFN support.

gcc/testsuite/ChangeLog:

* g++.dg/tree-ssa/pr86544.C: Add .POPCOUNT to tree scan regex.
* gcc.dg/tree-ssa/popcount.c: Likewise.
* gcc.dg/tree-ssa/popcount2.c: Likewise.
* gcc.dg/tree-ssa/popcount3.c: Likewise.
* gcc.target/aarch64/popcount4.c: Likewise.
* gcc.target/i386/pr95771.c: Likewise, and...
* gcc.target/i386/pr95771-2.c: ...split int128 test from above,
since this would emit just a single IFN if a TI optab is added.

---

diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
index 
ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c
 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr86544.C
@@ -12,5 +12,5 @@ int PopCount (long b) {
 return c;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
"optimized" } } */
 /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c 
b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
index 
b4694109411a4631697463519acbe7d9df65bf6e..efd906a0f5447f0beb3752eded3756999b02e6e6
 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount.c
@@ -39,4 +39,4 @@ void PopCount3 (long b1) {
   }
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 3 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 3 
"optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
index 
ef73e345573de721833e98e89c252640a55f7c60..ae38a329bd4d868a762300d3218d68864c0fc4be
 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount2.c
@@ -26,4 +26,4 @@ int main()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
"optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c 
b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
index 
ef438916a8019320564f444ace08e2f4b4190684..50befb36bac75de1cfa282e38358278b3288bd1c
 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount3.c
@@ -12,5 +12,5 @@ int PopCount (long b) {
 return c;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 1 
"optimized" } } */
 /* { dg-final { scan-tree-dump-times "if" 0 "phiopt4" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/popcount4.c 
b/gcc/testsuite/gcc.target/aarch64/popcount4.c
index 
ee55b2e335223053ca024e95b7a13aa4af32550e..8aa15ff018d4b5fc6bb59e52af20d5c33cea2ee0
 100644
--- a/gcc/testsuite/gcc.target/aarch64/popcount4.c
+++ b/gcc/testsuite/gcc.target/aarch64/popcount4.c
@@ -11,4 +11,4 @@ int PopCount (long b) {
 return c;
 }
 
-/* { dg-final { scan-tree-dump-times "__builtin_popcount" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_popcount|\\.POPCOUNT" 0 
"optimized" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr95771-2.c 
b/gcc/testsuite/gcc.target/i386/pr95771-2.c
new file mode 100644
index 
..1db9dc94d0b66477667624012221d6844c141a26
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr95771-2.c
@@ -0,0 +1,17 @@
+/* PR tree-optimization/95771 */
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump " = __builtin_popcount| = \\.POPCOUNT" 
"optimized" } } */
+
+int
+corge (unsigned __int128 x)
+{
+  int i = 0;
+  while (x)
+{
+  x &= x - 1;
+  ++i;
+}
+  return i;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr95771.c 
b/gcc/testsuite/gcc.target/i386/pr95771.c
index 
d7b67017800b705b9854f561916c20901ea76803..d41be445f4a68613a082b8956fea3ceaf33d7e0f
 100644
--- a/gcc/testsuite/gcc.target/i386/pr95771.c
+++ b/gcc/testsuite/gcc.target/i386/pr95771.c
@@ -1,8 +1,7 @@
 /* PR tree-optimization/95771 */
 /* { dg-do compile } */
 /* { dg-options "-O2 -mpopcnt -fdump-tree-optimized" } */
-/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 6 "optimized" { 
target int128 } } } */
-/* { dg-final { scan-tree-dump-times " = __builtin_popcount" 4 "optimized" { 
target { ! int128 } } } } */
+/* { dg-final { scan-tree-dump-times " = __builtin_popcount| = \\.POPCOUNT" 4 
"optimized" } } */
 
 int
 foo (unsigned char x)
@@ -51,17 +50,3 @@ qux (unsigned long long x)
 }
   return i;
 }
-
-#ifdef __SIZEOF_INT128__
-int
-corge (unsigned 

[PATCH 6/8 v2] docs: Add popcount, clz and ctz target attributes

2022-12-22 Thread Andrew Carlotti via Gcc-patches
Updated to reflect Sphinx revert; I'll commit this once the
cltz_complement patch is merged.
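
For reference, tests select on these keywords with dejagnu directives such
as (an illustrative line, matching the tests added earlier in this series):

  /* { dg-require-effective-target ctzll } */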

gcc/ChangeLog:

* doc/sourcebuild.texi: Add missing target attributes.

---

diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index 
ffe69d6fcb9c46cf97ba570e85b56e586a0c9b99..1036b185ee289bbf7883bd14956a41da9a6d677b
 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2511,6 +2511,24 @@ Target supports the execution of @code{amx-fp16} 
instructions.
 @item cell_hw
 Test system can execute AltiVec and Cell PPU instructions.
 
+@item clz
+Target supports a clz optab on int.
+
+@item clzl
+Target supports a clz optab on long.
+
+@item clzll
+Target supports a clz optab on long long.
+
+@item ctz
+Target supports a ctz optab on int.
+
+@item ctzl
+Target supports a ctz optab on long.
+
+@item ctzll
+Target supports a ctz optab on long long.
+
 @item cmpccxadd
 Target supports the execution of @code{cmpccxadd} instructions.
 
@@ -2532,6 +2550,15 @@ Target does not require strict alignment.
 @item pie_copyreloc
 The x86-64 target linker supports PIE with copy reloc.
 
+@item popcount
+Target supports a popcount optab on int.
+
+@item popcountl
+Target supports a popcount optab on long.
+
+@item popcountll
+Target supports a popcount optab on long long.
+
 @item prefetchi
 Target supports the execution of @code{prefetchi} instructions.
 


[PATCH 5/8 v2] middle-end: Add cltz_complement idiom recognition

2022-12-22 Thread Andrew Carlotti via Gcc-patches
On Thu, Nov 24, 2022 at 11:41:31AM +0100, Richard Biener wrote:
> Note we do have CTZ and CLZ
> optabs and internal functions - in case there's a HImode CLZ this
> could be elided.  More general you can avoid using the __builtin_
> functions with their fixed types in favor of using IFN_C[TL]Z which
> are type agnostic (but require optab support - you should be able
> to check this via direct_internal_fn_supported_p).

IFN support added. I've also renamed the defined_at_zero parameter to
define_at_zero, since this is a request for the expression to define it,
rather than a guarantee that it is already defined.

New patch below, bootstrapped and regression tested on
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu - ok to merge?

---

This recognises patterns of the form:
while (n) { n >>= 1 }

This patch results in improved (but still suboptimal) codegen:

foo (unsigned int b) {
int c = 0;

while (b) {
b >>= 1;
c++;
}

return c;
}

foo:
.LFB11:
.cfi_startproc
cbz w0, .L3
clz w1, w0
tst x0, 1
mov w0, 32
sub w0, w0, w1
csel    w0, w0, wzr, ne
ret

The conditional is unnecessary. phiopt could recognise a redundant csel
(using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
clz call, but it cannot recognise the redundancy when the input is (e.g.)
(32 - clz).

I could perhaps extend this function to recognise this pattern in a later
patch, if this is a good place to recognise more patterns.
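
Put another way (my summary of the codegen above): for a 32-bit input the
loop computes

  b == 0 ? 0 : 32 - __builtin_clz (b)

and since the cbz has already peeled off the b == 0 case, the csel can
only ever select the nonzero arm.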

gcc/ChangeLog:

PR tree-optimization/94793
* tree-scalar-evolution.cc (expression_expensive_p): Add checks
for c[lt]z optabs.
* tree-ssa-loop-niter.cc (build_cltz_expr): New.
(number_of_iterations_cltz_complement): New.
(number_of_iterations_bitcount): Add call to the above.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp (check_effective_target_clz)
(check_effective_target_clzl, check_effective_target_clzll)
(check_effective_target_ctz, check_effective_target_clzl)
(check_effective_target_ctzll): New.
* gcc.dg/tree-ssa/cltz-complement-max.c: New test.
* gcc.dg/tree-ssa/clz-complement-char.c: New test.
* gcc.dg/tree-ssa/clz-complement-int.c: New test.
* gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
* gcc.dg/tree-ssa/clz-complement-long.c: New test.
* gcc.dg/tree-ssa/ctz-complement-char.c: New test.
* gcc.dg/tree-ssa/ctz-complement-int.c: New test.
* gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
* gcc.dg/tree-ssa/ctz-complement-long.c: New test.

---

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c 
b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
new file mode 100644
index 
..1a29ca52e42e50822e4e3213b2cb008b766d0318
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int clz_complement_count1 (unsigned char b) {
+int c = 0;
+
+while (b) {
+   b >>= 1;
+   c++;
+}
+if (c <= PREC)
+  return 0;
+else
+  return 34567;
+}
+
+int clz_complement_count2 (unsigned char b) {
+int c = 0;
+
+while (b) {
+   b >>= 1;
+   c++;
+}
+if (c <= PREC - 1)
+  return 0;
+else
+  return 76543;
+}
+
+int ctz_complement_count1 (unsigned char b) {
+int c = 0;
+
+while (b) {
+   b <<= 1;
+   c++;
+}
+if (c <= PREC)
+  return 0;
+else
+  return 23456;
+}
+
+int ctz_complement_count2 (unsigned char b) {
+int c = 0;
+
+while (b) {
+   b <<= 1;
+   c++;
+}
+if (c <= PREC - 1)
+  return 0;
+else
+  return 65432;
+}
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c 
b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
new file mode 100644
index 
..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+int c = 0;
+
+while (b) {
+   b >>= 1;
+   c++;
+}
+
+return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+__builtin_abort ();
+  if (foo(5) != 3)
+__builtin_abort ();
+  if (foo(255) != 8)
+__builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { 

Re: [committed] docs: Fix peephole paragraph ordering

2022-12-22 Thread Andrew Carlotti via Gcc-patches
Patches attached to the wrong email - this patch was actually:

On Thu, Dec 22, 2022 at 05:06:13PM +, Andrew Carlotti via Gcc-patches wrote:
> The documentation for the DONE and FAIL macros was incorrectly inserted
> between example code, and a remark attached to that example.
> 
> Committed as obvious.
> 
> gcc/ChangeLog:
> 
>   * doc/md.texi: Move example code remark next to its code block.
> 
> ---

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
cc28f868fc85b5148450548a54d69a39ecc4f03a..c1d3ae2060d800bbaa9751fcf841d7417af1e37d
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -9321,6 +9321,11 @@ so here's a silly made-up example:
   "")
 @end smallexample
 
+@noindent
+If we had not added the @code{(match_dup 4)} in the middle of the input
+sequence, it might have been the case that the register we chose at the
+beginning of the sequence is killed by the first or second @code{set}.
+
 There are two special macros defined for use in the preparation statements:
 @code{DONE} and @code{FAIL}.  Use them with a following semicolon,
 as a statement.
@@ -9348,11 +9353,6 @@ If the preparation falls through (invokes neither 
@code{DONE} nor
 @code{FAIL}), then the @code{define_peephole2} uses the replacement
 template.
 
-@noindent
-If we had not added the @code{(match_dup 4)} in the middle of the input
-sequence, it might have been the case that the register we chose at the
-beginning of the sequence is killed by the first or second @code{set}.
-
 @end ifset
 @ifset INTERNALS
 @node Insn Attributes


Re: [committed] docs: Link to correct section for constraint modifiers

2022-12-22 Thread Andrew Carlotti via Gcc-patches
Patches attached to the wrong emails - this patch was actually:

On Thu, Dec 22, 2022 at 05:05:51PM +, Andrew Carlotti via Gcc-patches wrote:
> Committed as obvious.
> 
> gcc/ChangeLog:
> 
>   * doc/md.texi: Fix incorrect pxref.
> 
> ---

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
482e86f15d8b312c67d4962510ce879fb5cbc541..78dc6d720700ca409677e44a34a60d4b7fceb046
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -1511,7 +1511,7 @@ operand 1 (meaning it must match operand 0), and 
@samp{dKs} for operand
 2.  The second alternative has @samp{d} (data register) for operand 0,
 @samp{0} for operand 1, and @samp{dmKs} for operand 2.  The @samp{=} and
 @samp{%} in the constraints apply to all the alternatives; their
-meaning is explained in the next section (@pxref{Class Preferences}).
+meaning is explained in a later section (@pxref{Modifiers}).
 
 If all the operands fit any one alternative, the instruction is valid.
 Otherwise, for each alternative, the compiler counts how many instructions


[committed] docs: Fix peephole paragraph ordering

2022-12-22 Thread Andrew Carlotti via Gcc-patches
The documentation for the DONE and FAIL macros was incorrectly inserted
between example code, and a remark attached to that example.

Committed as obvious.

gcc/ChangeLog:

* doc/md.texi: Move example code remark next to its code block.

---

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
482e86f15d8b312c67d4962510ce879fb5cbc541..78dc6d720700ca409677e44a34a60d4b7fceb046
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -1511,7 +1511,7 @@ operand 1 (meaning it must match operand 0), and 
@samp{dKs} for operand
 2.  The second alternative has @samp{d} (data register) for operand 0,
 @samp{0} for operand 1, and @samp{dmKs} for operand 2.  The @samp{=} and
 @samp{%} in the constraints apply to all the alternatives; their
-meaning is explained in the next section (@pxref{Class Preferences}).
+meaning is explained in a later section (@pxref{Modifiers}).
 
 If all the operands fit any one alternative, the instruction is valid.
 Otherwise, for each alternative, the compiler counts how many instructions


[committed] docs: Fix inconsistent example predicate name

2022-12-22 Thread Andrew Carlotti via Gcc-patches
It is unclear why the example C function was renamed to
`commutative_integer_operator` as part of ec8e098d in 2004, while the
text and the example md were both left as `commutative_operator`. The
latter name appears to be more accurate, so revert the 2004 change.

Committed as obvious.

gcc/ChangeLog:

* doc/md.texi: Fix inconsistent example name.

---

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
78dc6d720700ca409677e44a34a60d4b7fceb046..cc28f868fc85b5148450548a54d69a39ecc4f03a
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -377,7 +377,7 @@ commutative arithmetic operators of RTL and whose mode is 
@var{mode}:
 
 @smallexample
 int
-commutative_integer_operator (x, mode)
+commutative_operator (x, mode)
  rtx x;
  machine_mode mode;
 @{


[committed] docs: Link to correct section for constraint modifiers

2022-12-22 Thread Andrew Carlotti via Gcc-patches
Committed as obvious.

gcc/ChangeLog:

* doc/md.texi: Fix incorrect pxref.

---

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 
cc28f868fc85b5148450548a54d69a39ecc4f03a..c1d3ae2060d800bbaa9751fcf841d7417af1e37d
 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -9321,6 +9321,11 @@ so here's a silly made-up example:
   "")
 @end smallexample
 
+@noindent
+If we had not added the @code{(match_dup 4)} in the middle of the input
+sequence, it might have been the case that the register we chose at the
+beginning of the sequence is killed by the first or second @code{set}.
+
 There are two special macros defined for use in the preparation statements:
 @code{DONE} and @code{FAIL}.  Use them with a following semicolon,
 as a statement.
@@ -9348,11 +9353,6 @@ If the preparation falls through (invokes neither 
@code{DONE} nor
 @code{FAIL}), then the @code{define_peephole2} uses the replacement
 template.
 
-@noindent
-If we had not added the @code{(match_dup 4)} in the middle of the input
-sequence, it might have been the case that the register we chose at the
-beginning of the sequence is killed by the first or second @code{set}.
-
 @end ifset
 @ifset INTERNALS
 @node Insn Attributes


Re: [PATCH 5/8] middle-end: Add cltz_complement idiom recognition

2022-11-21 Thread Andrew Carlotti via Gcc-patches
On Mon, Nov 14, 2022 at 04:10:22PM +0100, Richard Biener wrote:
> On Fri, Nov 11, 2022 at 7:53 PM Andrew Carlotti via Gcc-patches
>  wrote:
> >
> > This recognises patterns of the form:
> > while (n) { n >>= 1 }
> >
> > This patch results in improved (but still suboptimal) codegen:
> >
> > foo (unsigned int b) {
> > int c = 0;
> >
> > while (b) {
> > b >>= 1;
> > c++;
> > }
> >
> > return c;
> > }
> >
> > foo:
> > .LFB11:
> > .cfi_startproc
> > cbz w0, .L3
> > clz w1, w0
> > tst x0, 1
> > mov w0, 32
> > sub w0, w0, w1
> > csel    w0, w0, wzr, ne
> > ret
> >
> > The conditional is unnecessary. phiopt could recognise a redundant csel
> > (using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
> > clz call, but it cannot recognise the redundancy when the input is (e.g.)
> > (32 - clz).
> >
> > I could perhaps extend this function to recognise this pattern in a later
> > patch, if this is a good place to recognise more patterns.
> >
> > gcc/ChangeLog:
> >
+   PR tree-optimization/94793
> > * tree-scalar-evolution.cc (expression_expensive_p): Add checks
> > for c[lt]z optabs.
> > * tree-ssa-loop-niter.cc (build_cltz_expr): New.
> > (number_of_iterations_cltz_complement): New.
> > (number_of_iterations_bitcount): Add call to the above.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * lib/target-supports.exp (check_effective_target_clz)
> > (check_effective_target_clzl, check_effective_target_clzll)
> > (check_effective_target_ctz, check_effective_target_clzl)
> > (check_effective_target_ctzll): New.
> > * gcc.dg/tree-ssa/cltz-complement-max.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-char.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-int.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
> > * gcc.dg/tree-ssa/clz-complement-long.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-char.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-int.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
> > * gcc.dg/tree-ssa/ctz-complement-long.c: New test.
> >
> >
> > --
> >
> >

[snip test diffs]

> > diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> > index 
> > 7e2a3e986619de87e4ae9daf16198be1f13b917c..1ac9526c69b5fe80c26022f2fa1176d222e2dfb9
> >  100644
> > --- a/gcc/tree-scalar-evolution.cc
> > +++ b/gcc/tree-scalar-evolution.cc
> > @@ -3406,12 +3406,21 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> >  library call for popcount when backend does not have an instruction
> >  to do so.  We consider this to be expensive and generate
> >  __builtin_popcount only when backend defines it.  */
> > +  optab optab;
> >combined_fn cfn = get_call_combined_fn (expr);
> >switch (cfn)
> > {
> > CASE_CFN_POPCOUNT:
> > + optab = popcount_optab;
> > + goto bitcount_call;
> > +   CASE_CFN_CLZ:
> > + optab = clz_optab;
> > + goto bitcount_call;
> > +   CASE_CFN_CTZ:
> > + optab = ctz_optab;
> > +bitcount_call:
> >   /* Check if opcode for popcount is available in the mode 
> > required.  */
> > - if (optab_handler (popcount_optab,
> > + if (optab_handler (optab,
> >  TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (expr, 
> > 0
> >   == CODE_FOR_nothing)
> > {
> > @@ -3424,7 +3433,7 @@ expression_expensive_p (tree expr, hash_map<tree, uint64_t> &cache,
> >  instructions.  */
> >   if (is_a <scalar_int_mode> (mode, &int_mode)
> >   && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> > - && (optab_handler (popcount_optab, word_mode)
> > + && (optab_handler (optab, word_mode)
> >   != CODE_FOR_nothing))
> >   break;
> >   return true;
> > diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
> > index 
> > fece876099c1687569d6351e7d2416ea6acae5b5..16e8e25919d808cea27adbd72f0be01ae2f0e547
> >  100644
> > -

[PATCH 8/8] middle-end: Expand comment for tree_niter_desc.max

2022-11-11 Thread Andrew Carlotti via Gcc-patches
This requirement is enforced by a gcc_checking_assert in
record_estimate.

gcc/ChangeLog:

* tree-ssa-loop.h (tree_niter_desc): Update comment.


--


diff --git a/gcc/tree-ssa-loop.h b/gcc/tree-ssa-loop.h
index 
6c70f795d171f22b3ed75873fec4920fea75255b..c24215be8822c31a05eaedcf4d3a26db0feab6cf
 100644
--- a/gcc/tree-ssa-loop.h
+++ b/gcc/tree-ssa-loop.h
@@ -52,7 +52,8 @@ public:
   may_be_zero == false), more precisely the number
   of executions of the latch of the loop.  */
   widest_int max;  /* The upper bound on the number of iterations of
-  the loop.  */
+  the loop.  If niter is constant, then these values
+  must agree.  */
 
   /* The simplified shape of the exit condition.  This information is used by
  loop unrolling.  If CMP is ERROR_MARK, then the loop cannot be unrolled.


[PATCH 7/8] middle-end: Add c[lt]z idiom recognition

2022-11-11 Thread Andrew Carlotti via Gcc-patches
This recognises the patterns of the form:
  while (!(n & 1)) { n >>= 1 }

Unfortunately there are currently two issues relating to this patch.

Firstly, simplify_using_initial_conditions does not recognise that
(n != 0) and ((n & 1) == 0) implies that ((n >> 1) != 0).

These preconditions arise following the loop copy-header pass, and the
assumptions returned by number_of_iterations_exit_assumptions then
prevent final value replacement from using the niter result.

I'm not sure what is the best way to fix this - one approach could be to
modify simplify_using_initial_conditions to handle this sort of case,
but it seems that it basically wants the information that ranger could
give anway, so would something like that be a better option?
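
(A hedged sketch of the missing deduction, with hypothetical SSA names:
after copy-header, the loop body is dominated by the conditions
n_5 != 0 and (n_5 & 1) == 0, and the niter assumption to discharge is
(n_5 >> 1) != 0 - which holds, because shifting out a low bit that is
known to be zero cannot turn a non-zero value into zero.)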

The second issue arises in the vectoriser, which is able to determine
that the niter->assumptions are always true.
When building with -march=armv8.4-a+sve -S -O3, we get this codegen:

/* PREC is 32 here (the precision of unsigned int).  */
int foo (unsigned int b) {
  int c = 0;

  if (b == 0)
    return PREC;

  while (!(b & (1 << (PREC - 1)))) {
    b <<= 1;
    c++;
  }

  return c;
}

foo:
.LFB0:
.cfi_startproc
cmp w0, 0
cbz w0, .L6
blt .L7
lsl w1, w0, 1
clz w2, w1
cmp w2, 14
bls .L8
mov x0, 0
cntw    x3
add w1, w2, 1
index   z1.s, #0, #1
whilelo p0.s, wzr, w1
.L4:
add x0, x0, x3
mov p1.b, p0.b
mov z0.d, z1.d
whilelo p0.s, w0, w1
incw    z1.s
b.any   .L4
add z0.s, z0.s, #1
lastb   w0, p1, z0.s
ret
.p2align 2,,3
.L8:
mov w0, 0
b   .L3
.p2align 2,,3
.L13:
lsl w1, w1, 1
.L3:
add w0, w0, 1
tbz w1, #31, .L13
ret
.p2align 2,,3
.L6:
mov w0, 32
ret
.p2align 2,,3
.L7:
mov w0, 0
ret
.cfi_endproc

In essence, the vectoriser uses the niter information to determine
exactly how many iterations of the loop it needs to run. It then uses
SVE whilelo instructions to run this number of iterations. The original
loop counter is also vectorised, despite only being used in the final
iteration, and then the final value of this counter is used as the
return value (which is the same as the number of iterations it computed
in the first place).
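
(For reference, a minimal sketch of the scalar code that final value
replacement should ideally produce here, assuming PREC is 32:

int foo_expected (unsigned int b)
{
  if (b == 0)
    return PREC;
  return __builtin_clz (b); /* iterations until the top bit is set */
}

with no loop and no vector code at all.)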

This vectorisation is obviously bad, and I think it exposes a latent
bug in the vectoriser, rather than being an issue caused by this
specific patch.

gcc/ChangeLog:

* tree-ssa-loop-niter.cc (number_of_iterations_cltz): New.
(number_of_iterations_bitcount): Add call to the above.
(number_of_iterations_exit_assumptions): Add EQ_EXPR case for
c[lt]z idiom recognition.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/cltz-max.c: New test.
* gcc.dg/tree-ssa/clz-char.c: New test.
* gcc.dg/tree-ssa/clz-int.c: New test.
* gcc.dg/tree-ssa/clz-long-long.c: New test.
* gcc.dg/tree-ssa/clz-long.c: New test.
* gcc.dg/tree-ssa/ctz-char.c: New test.
* gcc.dg/tree-ssa/ctz-int.c: New test.
* gcc.dg/tree-ssa/ctz-long-long.c: New test.
* gcc.dg/tree-ssa/ctz-long.c: New test.


--


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c 
b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c
new file mode 100644
index 
..a6bea3d338940efee2e7e1c95a5941525945af9e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-max.c
@@ -0,0 +1,72 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int clz_count1 (unsigned char b) {
+  int c = 0;
+
+  if (b == 0)
+    return 0;
+
+  while (!(b & (1 << (PREC - 1)))) {
+    b <<= 1;
+    c++;
+  }
+  if (c <= PREC - 1)
+    return 0;
+  else
+    return 34567;
+}
+
+int clz_count2 (unsigned char b) {
+  int c = 0;
+
+  if (b == 0)
+    return 0;
+
+  while (!(b & (1 << PREC - 1))) {
+    b <<= 1;
+    c++;
+  }
+  if (c <= PREC - 2)
+    return 0;
+  else
+    return 76543;
+}
+
+int ctz_count1 (unsigned char b) {
+  int c = 0;
+
+  if (b == 0)
+    return 0;
+
+  while (!(b & 1)) {
+    b >>= 1;
+    c++;
+  }
+  if (c <= PREC - 1)
+    return 0;
+  else
+    return 23456;
+}
+
+int ctz_count2 (unsigned char b) {
+  int c = 0;
+
+  if (b == 0)
+    return 0;
+
+  while (!(b & 1)) {
+    b >>= 1;
+    c++;
+  }
+  if (c <= PREC - 2)
+    return 0;
+  else
+    return 65432;
+}
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c b/gcc/testsuite/gcc.dg/tree-ssa/clz-char.c

[PATCH 6/8] docs: Add popcount, clz and ctz target attributes

2022-11-11 Thread Andrew Carlotti via Gcc-patches
gcc/ChangeLog:

* 
doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst:
Add missing target attributes.
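
For example, a runtime test that needs a clz optab on int (as in the
tests added by patch 5) can state its requirement with:

  /* { dg-require-effective-target clz } */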


--


diff --git 
a/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst
 
b/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst
index 
709e4ea2b903cfad4faed40899020b29bc9b5811..8410c40d38fceb83ea8c6ba3bbf0fba5db7929e5
 100644
--- 
a/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst
+++ 
b/gcc/doc/gccint/testsuites/directives-used-within-dejagnu-tests/keywords-describing-target-attributes.rst
@@ -1075,6 +1075,24 @@ Other hardware attributes
 ``cell_hw``
   Test system can execute AltiVec and Cell PPU instructions.
 
+``clz``
+  Target supports a clz optab on int.
+
+``clzl``
+  Target supports a clz optab on long.
+
+``clzll``
+  Target supports a clz optab on long long.
+
+``ctz``
+  Target supports a ctz optab on int.
+
+``ctzl``
+  Target supports a ctz optab on long.
+
+``ctzll``
+  Target supports a ctz optab on long long.
+
 ``cmpccxadd``
   Target supports the execution of ``cmpccxadd`` instructions.
 
@@ -1096,6 +1114,15 @@ Other hardware attributes
 ``pie_copyreloc``
   The x86-64 target linker supports PIE with copy reloc.
 
+``popcount``
+  Target supports a popcount optab on int.
+
+``popcountl``
+  Target supports a popcount optab on long.
+
+``popcountll``
+  Target supports a popcount optab on long long.
+
 ``prefetchi``
   Target supports the execution of ``prefetchi`` instructions.
 


[PATCH 5/8] middle-end: Add cltz_complement idiom recognition

2022-11-11 Thread Andrew Carlotti via Gcc-patches
This recognises patterns of the form:
while (n) { n >>= 1 }

This patch results in improved (but still suboptimal) codegen:

foo (unsigned int b) {
int c = 0;

while (b) {
b >>= 1;
c++;
}

return c;
}

foo:
.LFB11:
.cfi_startproc
cbz w0, .L3
clz w1, w0
tst x0, 1
mov w0, 32
sub w0, w0, w1
csel    w0, w0, wzr, ne
ret

The conditional is unnecessary. phiopt could recognise a redundant csel
(using cond_removal_in_builtin_zero_pattern) when one of the inputs is a
clz call, but it cannot recognise the redundancy when the input is (e.g.)
(32 - clz).
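
To illustrate with a hedged source-level sketch (not taken from the
patch), the replacement for the loop above is roughly:

  c = b ? 32 - __builtin_clz (b) : 0;

and since the AArch64 clz instruction returns 32 for a zero input,
32 - clz (0) is already 0, so the csel is redundant at the machine
level even though __builtin_clz (0) is undefined at the source level.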

I could perhaps extend this function to recognise this pattern in a later
patch, if this is a good place to recognise more patterns.

gcc/ChangeLog:

* tree-scalar-evolution.cc (expression_expensive_p): Add checks
for c[lt]z optabs.
* tree-ssa-loop-niter.cc (build_cltz_expr): New.
(number_of_iterations_cltz_complement): New.
(number_of_iterations_bitcount): Add call to the above.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp (check_effective_target_clz)
(check_effective_target_clzl, check_effective_target_clzll)
(check_effective_target_ctz, check_effective_target_ctzl)
(check_effective_target_ctzll): New.
* gcc.dg/tree-ssa/cltz-complement-max.c: New test.
* gcc.dg/tree-ssa/clz-complement-char.c: New test.
* gcc.dg/tree-ssa/clz-complement-int.c: New test.
* gcc.dg/tree-ssa/clz-complement-long-long.c: New test.
* gcc.dg/tree-ssa/clz-complement-long.c: New test.
* gcc.dg/tree-ssa/ctz-complement-char.c: New test.
* gcc.dg/tree-ssa/ctz-complement-int.c: New test.
* gcc.dg/tree-ssa/ctz-complement-long-long.c: New test.
* gcc.dg/tree-ssa/ctz-complement-long.c: New test.


--


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c 
b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
new file mode 100644
index 
..1a29ca52e42e50822e4e3213b2cb008b766d0318
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cltz-complement-max.c
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int clz_complement_count1 (unsigned char b) {
+  int c = 0;
+
+  while (b) {
+    b >>= 1;
+    c++;
+  }
+  if (c <= PREC)
+    return 0;
+  else
+    return 34567;
+}
+
+int clz_complement_count2 (unsigned char b) {
+  int c = 0;
+
+  while (b) {
+    b >>= 1;
+    c++;
+  }
+  if (c <= PREC - 1)
+    return 0;
+  else
+    return 76543;
+}
+
+int ctz_complement_count1 (unsigned char b) {
+  int c = 0;
+
+  while (b) {
+    b <<= 1;
+    c++;
+  }
+  if (c <= PREC)
+    return 0;
+  else
+    return 23456;
+}
+
+int ctz_complement_count2 (unsigned char b) {
+  int c = 0;
+
+  while (b) {
+    b <<= 1;
+    c++;
+  }
+  if (c <= PREC - 1)
+    return 0;
+  else
+    return 65432;
+}
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "23456" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "65432" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c 
b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
new file mode 100644
index 
..2ebe8fabcaf0ce88f3a6a46e9ba4ba79b7d3672e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-char.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned char b) {
+  int c = 0;
+
+  while (b) {
+    b >>= 1;
+    c++;
+  }
+
+  return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)
+    __builtin_abort ();
+  if (foo(255) != 8)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_clz|\\.CLZ" 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c 
b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
new file mode 100644
index 
..f2c5c23f6a7d84ecb637c6961698b0fc30d7426b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/clz-complement-int.c
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+/* { dg-require-effective-target clz } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__ * __SIZEOF_INT__)
+
+int
+__attribute__ ((noinline, noclone))
+foo (unsigned int b) {
+  int c = 0;
+
+  while (b) {
+    b >>= 1;
+    c++;
+  }
+
+  return c;
+}
+
+int main()
+{
+  if (foo(0) != 0)
+    __builtin_abort ();
+  if (foo(5) != 3)

[PATCH 4/8] Modify test, to prevent the next patch breaking it

2022-11-11 Thread Andrew Carlotti via Gcc-patches
The upcoming c[lt]z idiom recognition patch eliminates the need for a
brute force computation of the iteration count of these loops. The test
is intended to verify that ivcanon can determine the loop count when the
condition is given by a chain of constant computations.

We replace the constant operations with a more complicated chain that should
resist future idiom recognition.
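
(As a quick sanity check of the new chain, tracing foo's loop by hand:
a starts at 8, then a = (8 + 5) & 44 = 12, then a = (12 + 5) & 44 = 0,
so the loop still terminates after a small constant number of
iterations that ivcanon can find by brute force, while no bit-counting
idiom matches the add-and-mask recurrence.)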

gcc/testsuite/ChangeLog:

* gcc.dg/pr77975.c: Make tests more robust.


--


diff --git a/gcc/testsuite/gcc.dg/pr77975.c b/gcc/testsuite/gcc.dg/pr77975.c
index 
148cebdded964da7fce148abdf2a430c55650513..a187ce2b50c2821841e71b5b6cb243a37a66fb57
 100644
--- a/gcc/testsuite/gcc.dg/pr77975.c
+++ b/gcc/testsuite/gcc.dg/pr77975.c
@@ -7,10 +7,11 @@
 unsigned int
 foo (unsigned int *b)
 {
-  unsigned int a = 3;
+  unsigned int a = 8;
   while (a)
 {
-  a >>= 1;
+  a += 5;
+  a &= 44;
   *b += a;
 }
   return a; 
@@ -21,10 +22,11 @@ foo (unsigned int *b)
 unsigned int
 bar (unsigned int *b)
 {
-  unsigned int a = 7;
+  unsigned int a = 3;
   while (a)
 {
-  a >>= 1;
+  a += 5;
+  a &= 44;
   *b += a;
 }
   return a; 


[PATCH 3/8] middle-end: Refactor number_of_iterations_popcount

2022-11-11 Thread Andrew Carlotti via Gcc-patches
This includes various changes to improve clarity, and to enable the code
to be more similar to the clz and ctz idiom recognition added in
subsequent patches.

We create a new number_of_iterations_bitcount function, which will be used
to call the other bit-counting recognition functions added in subsequent
patches, as well as a generic comment describing the loop structures
that are common to each idiom. Some of the variables in
number_of_iterations_popcount are given more descriptive names, and the
popcount expression builder is extracted into a separate function.

As part of the refactoring, we also fix a bug where the max loop count
for modes shorter than an integer would be incorrectly computed as if
the input mode were actually an integer.

We also ensure that niter->max takes into account the final value for
niter->niter (after any folding and simplifying), since if the latter is a
constant, then record_estimate mandates that the two values are equivalent.
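
(A concrete illustration of the bound fix: for the unsigned char loops
in the new popcount-max.c test below, niter->max must be 8 - the
precision of the input - rather than 32, which is what lets the
"c <= PREC" test fold away while "c <= PREC - 1" does not.)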

gcc/ChangeLog:

* tree-ssa-loop-niter.cc
(number_of_iterations_exit_assumptions): Modify to call...
(number_of_iterations_bitcount): ...this new function.
(number_of_iterations_popcount): Now called by the above.
Refactor, and extract popcount expression builder to...
(build_popcount_expr): ...this new function.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/popcount-max.c: New test.


--


diff --git a/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c 
b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c
new file mode 100644
index 
..ca7204cbc3cea636183408e24d7dd36d702ffdb2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/popcount-max.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-tree-loop-optimize -fdump-tree-optimized" } */
+
+#define PREC (__CHAR_BIT__)
+
+int count1 (unsigned char b) {
+  int c = 0;
+
+  while (b) {
+    b &= b - 1;
+    c++;
+  }
+  if (c <= PREC)
+    return 0;
+  else
+    return 34567;
+}
+
+int count2 (unsigned char b) {
+  int c = 0;
+
+  while (b) {
+    b &= b - 1;
+    c++;
+  }
+  if (c <= PREC - 1)
+    return 0;
+  else
+    return 76543;
+}
+
+/* { dg-final { scan-tree-dump-times "34567" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "76543" 1 "optimized" } } */
diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 
0af34e46580bb9a6f9b40e09c9f29b8454a4aaf6..fece876099c1687569d6351e7d2416ea6acae5b5
 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -2026,6 +2026,48 @@ number_of_iterations_cond (class loop *loop,
   return ret;
 }
 
+/* Return an expression that computes the popcount of src.  */
+
+static tree
+build_popcount_expr (tree src)
+{
+  tree fn;
+  int prec = TYPE_PRECISION (TREE_TYPE (src));
+  int i_prec = TYPE_PRECISION (integer_type_node);
+  int li_prec = TYPE_PRECISION (long_integer_type_node);
+  int lli_prec = TYPE_PRECISION (long_long_integer_type_node);
+  if (prec <= i_prec)
+fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
+  else if (prec == li_prec)
+fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
+  else if (prec == lli_prec || prec == 2 * lli_prec)
+fn = builtin_decl_implicit (BUILT_IN_POPCOUNTLL);
+  else
+return NULL_TREE;
+
+  tree utype = unsigned_type_for (TREE_TYPE (src));
+  src = fold_convert (utype, src);
+  if (prec < i_prec)
+src = fold_convert (unsigned_type_node, src);
+  tree call;
+  if (prec == 2 * lli_prec)
+{
+  tree src1 = fold_convert (long_long_unsigned_type_node,
+                            fold_build2 (RSHIFT_EXPR, TREE_TYPE (src),
+                                         unshare_expr (src),
+                                         build_int_cst (integer_type_node,
+                                                        lli_prec)));
+  tree src2 = fold_convert (long_long_unsigned_type_node, src);
+  tree call1 = build_call_expr (fn, 1, src1);
+  tree call2 = build_call_expr (fn, 1, src2);
+  call = fold_build2 (PLUS_EXPR, integer_type_node, call1, call2);
+}
+  else
+call = build_call_expr (fn, 1, src);
+
+  return call;
+}
+
 /* Utility function to check if OP is defined by a stmt
that is a val - 1.  */
 
@@ -2041,45 +2083,18 @@ ssa_defined_by_minus_one_stmt_p (tree op, tree val)
  && integer_minus_onep (gimple_assign_rhs2 (stmt)));
 }
 
-/* See if LOOP is a popcount implementation, determine NITER for the loop
+/* See comment below for number_of_iterations_bitcount.
+   For popcount, we have:
 
-   We match:
-   <bb 2>
-   goto <bb 4>
+   modify:
+   _1 = iv_1 + -1
+   iv_2 = iv_1 & _1

-   <bb 3>
-   _1 = b_11 + -1
-   b_6 = _1 & b_11
-
-   <bb 4>
-   b_11 = PHI <b_5(D)(2), b_6(3)>
+   test:
+   if (iv != 0)

-   exit block
-   if (b_11 != 0)
-   goto <bb 3>
-   else
-   goto <bb 5>
-
-   OR we match copy-header version:
-   if (b_5 != 0)
-   goto <bb 3>
-   else
-   goto <bb 4>
-
-   <bb 3>
-   b_11 = PHI <b_5(2), b_6(3)>
-   _1 = b_11 + -1
-   b_6 = _1 & b_11

[PATCH 2/8] middle-end: Remove prototype for number_of_iterations_popcount

2022-11-11 Thread Andrew Carlotti via Gcc-patches
gcc/ChangeLog:

* tree-ssa-loop-niter.cc (ssa_defined_by_minus_one_stmt_p): Move.
(number_of_iterations_popcount): Move, and remove separate prototype.


--


diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 
cdbb924216243ebcabe6c695698a4aee71882c49..c23643fd9dd8b27ff11549e1f28f585534e84cd3
 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -63,11 +63,6 @@ struct bounds
   mpz_t below, up;
 };
 
-static bool number_of_iterations_popcount (loop_p loop, edge exit,
-  enum tree_code code,
-  class tree_niter_desc *niter);
-
-
 /* Splits expression EXPR to a variable part VAR and constant OFFSET.  */
 
 static void
@@ -2031,6 +2026,200 @@ number_of_iterations_cond (class loop *loop,
   return ret;
 }
 
+/* Utility function to check if OP is defined by a stmt
+   that is a val - 1.  */
+
+static bool
+ssa_defined_by_minus_one_stmt_p (tree op, tree val)
+{
+  gimple *stmt;
+  return (TREE_CODE (op) == SSA_NAME
+ && (stmt = SSA_NAME_DEF_STMT (op))
+ && is_gimple_assign (stmt)
+ && (gimple_assign_rhs_code (stmt) == PLUS_EXPR)
+ && val == gimple_assign_rhs1 (stmt)
+ && integer_minus_onep (gimple_assign_rhs2 (stmt)));
+}
+
+/* See if LOOP is a popcount implementation, determine NITER for the loop
+
+   We match:
+   <bb 2>
+   goto <bb 4>
+
+   <bb 3>
+   _1 = b_11 + -1
+   b_6 = _1 & b_11
+
+   <bb 4>
+   b_11 = PHI <b_5(D)(2), b_6(3)>
+
+   exit block
+   if (b_11 != 0)
+   goto <bb 3>
+   else
+   goto <bb 5>
+
+   OR we match copy-header version:
+   if (b_5 != 0)
+   goto <bb 3>
+   else
+   goto <bb 4>
+
+   <bb 3>
+   b_11 = PHI <b_5(2), b_6(3)>
+   _1 = b_11 + -1
+   b_6 = _1 & b_11
+
+   exit block
+   if (b_6 != 0)
+   goto <bb 3>
+   else
+   goto <bb 4>
+
+   If popcount pattern, update NITER accordingly.
+   i.e., set NITER to  __builtin_popcount (b)
+   return true if we did, false otherwise.
+
+ */
+
+static bool
+number_of_iterations_popcount (loop_p loop, edge exit,
+  enum tree_code code,
+  class tree_niter_desc *niter)
+{
+  bool adjust = true;
+  tree iter;
+  HOST_WIDE_INT max;
+  adjust = true;
+  tree fn = NULL_TREE;
+
+  /* Check loop terminating branch is like
+ if (b != 0).  */
+  gimple *stmt = last_stmt (exit->src);
+  if (!stmt
+  || gimple_code (stmt) != GIMPLE_COND
+  || code != NE_EXPR
+  || !integer_zerop (gimple_cond_rhs (stmt))
+  || TREE_CODE (gimple_cond_lhs (stmt)) != SSA_NAME)
+return false;
+
+  gimple *and_stmt = SSA_NAME_DEF_STMT (gimple_cond_lhs (stmt));
+
+  /* Depending on copy-header is performed, feeding PHI stmts might be in
+ the loop header or loop latch, handle this.  */
+  if (gimple_code (and_stmt) == GIMPLE_PHI
+  && gimple_bb (and_stmt) == loop->header
+  && gimple_phi_num_args (and_stmt) == 2
+  && (TREE_CODE (gimple_phi_arg_def (and_stmt,
+loop_latch_edge (loop)->dest_idx))
+ == SSA_NAME))
+{
+  /* SSA used in exit condition is defined by PHI stmt
+   b_11 = PHI <b_5(D)(2), b_6(3)>
+   from the PHI stmt, get the and_stmt
+   b_6 = _1 & b_11.  */
+  tree t = gimple_phi_arg_def (and_stmt, loop_latch_edge (loop)->dest_idx);
+  and_stmt = SSA_NAME_DEF_STMT (t);
+  adjust = false;
+}
+
+  /* Make sure it is indeed an and stmt (b_6 = _1 & b_11).  */
+  if (!is_gimple_assign (and_stmt)
+  || gimple_assign_rhs_code (and_stmt) != BIT_AND_EXPR)
+return false;
+
+  tree b_11 = gimple_assign_rhs1 (and_stmt);
+  tree _1 = gimple_assign_rhs2 (and_stmt);
+
+  /* Check that _1 is defined by b_11 + -1 (_1 = b_11 + -1).
+     Also make sure that b_11 is the same in and_stmt and _1 defining stmt.
+     Also canonicalize if _1 and b_11 are reversed.  */
+  if (ssa_defined_by_minus_one_stmt_p (b_11, _1))
+std::swap (b_11, _1);
+  else if (ssa_defined_by_minus_one_stmt_p (_1, b_11))
+;
+  else
+return false;
+  /* Check the recurrence:
+   ... = PHI <b_5(D)(2), b_6(3)>.  */
+  gimple *phi = SSA_NAME_DEF_STMT (b_11);
+  if (gimple_code (phi) != GIMPLE_PHI
+  || (gimple_bb (phi) != loop_latch_edge (loop)->dest)
+  || (gimple_assign_lhs (and_stmt)
+ != gimple_phi_arg_def (phi, loop_latch_edge (loop)->dest_idx)))
+return false;
+
+  /* We found a match. Get the corresponding popcount builtin.  */
+  tree src = gimple_phi_arg_def (phi, loop_preheader_edge (loop)->dest_idx);
+  if (TYPE_PRECISION (TREE_TYPE (src)) <= TYPE_PRECISION (integer_type_node))
+fn = builtin_decl_implicit (BUILT_IN_POPCOUNT);
+  else if (TYPE_PRECISION (TREE_TYPE (src))
+  == TYPE_PRECISION (long_integer_type_node))
+fn = builtin_decl_implicit (BUILT_IN_POPCOUNTL);
+  else if (TYPE_PRECISION (TREE_TYPE (src))
+  == TYPE_PRECISION (long_long_integer_type_node)
+  || (TYPE_PRECISION (TREE_TYPE (src))
+  == 2 * TYPE_PRECISION (long_long_integer_type_node)))
+fn = 

[PATCH 1/8] middle-end: Ensure at_stmt is defined before an early exit

2022-11-11 Thread Andrew Carlotti via Gcc-patches
This prevents a null dereference error when outputting debug information
following an early exit from number_of_iterations_exit_assumptions.

gcc/ChangeLog:

* tree-ssa-loop-niter.cc (number_of_iterations_exit_assumptions):
Move at_stmt assignment.


--


diff --git a/gcc/tree-ssa-loop-niter.cc b/gcc/tree-ssa-loop-niter.cc
index 
4ffcef4f4ff2fe182fbe711553c8e4575560ab07..cdbb924216243ebcabe6c695698a4aee71882c49
 100644
--- a/gcc/tree-ssa-loop-niter.cc
+++ b/gcc/tree-ssa-loop-niter.cc
@@ -2537,6 +2537,9 @@ number_of_iterations_exit_assumptions (class loop *loop, 
edge exit,
   if (!stmt)
 return false;
 
+  if (at_stmt)
+*at_stmt = stmt;
+
   /* We want the condition for staying inside loop.  */
   code = gimple_cond_code (stmt);
   if (exit->flags & EDGE_TRUE_VALUE)
@@ -2642,9 +2645,6 @@ number_of_iterations_exit_assumptions (class loop *loop, 
edge exit,
   if (TREE_CODE (niter->niter) == INTEGER_CST)
 niter->max = wi::to_widest (niter->niter);
 
-  if (at_stmt)
-*at_stmt = stmt;
-
   return (!integer_zerop (niter->assumptions));
 }


[PATCH 0/8] middle-end: Popcount and clz/ctz idiom recognition improvements

2022-11-11 Thread Andrew Carlotti via Gcc-patches
This is a series of patches to improve recognition of popcount and
clz/ctz idioms, along with some related fixes.

- Patches 1 and 8 are independent fixes or improvements.
- Patch 4 is a dependency of patch 5, as it improves the robustness of a
  test that would otherwise begin failing.
- Patches 2, 3, 5 and 7 form the main dependent sequence.
- Patch 6 is a documentation update, covering attributes in patch 5 and
  existing code.
- Patch 7 may require other work before it can be merged, as it seems to
  expose a latent issue in the vectoriser.

Each patch has been bootstrapped and regression tested on
aarch64-none-linux-gnu.


[committed] Improve comment for tree_niter_desc.{control,bound,cmp}

2022-08-12 Thread Andrew Carlotti via Gcc-patches
Fix typos and explain ERROR_MARK usage.

gcc/ChangeLog:

* tree-ssa-loop.h: Improve comment.

---

diff --git a/gcc/tree-ssa-loop.h b/gcc/tree-ssa-loop.h
index 
415f461c37e4cd7df0b49f6104f796c49cc830fa..6c70f795d171f22b3ed75873fec4920fea75255b
 100644
--- a/gcc/tree-ssa-loop.h
+++ b/gcc/tree-ssa-loop.h
@@ -54,11 +54,11 @@ public:
   widest_int max;  /* The upper bound on the number of iterations of
   the loop.  */
 
-  /* The simplified shape of the exit condition.  The loop exits if
- CONTROL CMP BOUND is false, where CMP is one of NE_EXPR,
- LT_EXPR, or GT_EXPR, and step of CONTROL is positive if CMP is
- LE_EXPR and negative if CMP is GE_EXPR.  This information is used
- by loop unrolling.  */
+  /* The simplified shape of the exit condition.  This information is used by
+ loop unrolling.  If CMP is ERROR_MARK, then the loop cannot be unrolled.
+ Otherwise, the loop exits if CONTROL CMP BOUND is false, where CMP is one
+ of NE_EXPR, LT_EXPR, or GT_EXPR, and CONTROL.STEP is positive if CMP is
+ LT_EXPR and negative if CMP is GT_EXPR.  */
   affine_iv control;
   tree bound;
   enum tree_code cmp;


[committed] docs: Fix outdated reference to LOOPS_HAVE_MARKED_SINGLE_EXITS

2022-07-27 Thread Andrew Carlotti via Gcc-patches
This reference has been wrong since 2007; committed as an obvious fix.

gcc/ChangeLog:

 * doc/loop.texi: Refer to LOOPS_HAVE_RECORDED_EXITS instead.

diff --git a/gcc/doc/loop.texi b/gcc/doc/loop.texi
index 
d7b71a24dbfed284b13da702bd5037691a515535..6e8657a074d2447db7ae9b75cbfbb71282b84287
 100644
--- a/gcc/doc/loop.texi
+++ b/gcc/doc/loop.texi
@@ -210,7 +210,7 @@ loop in depth-first search order in reversed CFG, ordered 
by dominance
 relation, and breath-first search order, respectively.
 @item @code{single_exit}: Returns the single exit edge of the loop, or
 @code{NULL} if the loop has more than one exit.  You can only use this
-function if LOOPS_HAVE_MARKED_SINGLE_EXITS property is used.
+function if @code{LOOPS_HAVE_RECORDED_EXITS} is used.
 @item @code{get_loop_exit_edges}: Enumerates the exit edges of a loop.
 @item @code{just_once_each_iteration_p}: Returns true if the basic block
 is executed exactly once during each iteration of a loop (that is, it


[PATCH v2.1 3/4] aarch64: Consolidate simd type lookup functions

2022-07-19 Thread Andrew Carlotti via Gcc-patches
On Wed, Jul 13, 2022 at 05:36:04PM +0100, Richard Sandiford wrote:
> I like the part about getting rid of:
> 
> static tree
> aarch64_simd_builtin_type (machine_mode mode,
>  bool unsigned_p, bool poly_p)
> 
> and the flow of the new function.  However, I think it's still
> slightly more readable if we keep the switch and lookup routines
> separate, partly to keep down the size of the main routine and
> partly to avoid the goto.

I agree.

> So how about:
> 
> - aarch64_simd_builtin_std_type becomes aarch64_int_or_fp_element_type
>   but otherwise stays as-is
>
> ...

I've called it aarch64_int_or_fp_type, because it's sometimes used for
an operand that doesn't represent an element of a vector.


Updated patch below.

---

There were several similarly-named functions, which each built or looked up an
operand type using a different subset of valid modes or qualifiers.

This change provides a single function to return operand types, which can
additionally handle const and pointer qualifiers. For clarity, the existing
functionality is kept in separate helper functions.
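
As a hedged usage sketch (not part of the patch), a const-pointer
operand type can now be obtained in a single call:

  /* Roughly "const int8_t *": the pointer qualifier first reduces the
     vector mode to its inner mode, then the const and pointer
     qualifiers wrap the looked-up type.  */
  tree t = aarch64_simd_builtin_type
    (E_V16QImode,
     (aarch64_type_qualifiers) (qualifier_const | qualifier_pointer));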

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc
(aarch64_simd_builtin_std_type): Rename to...
(aarch64_int_or_fp_type): ...this, and allow irrelevant qualifiers.
(aarch64_lookup_simd_builtin_type): Rename to...
(aarch64_simd_builtin_type): ...this. Add const/pointer
support, and extract table lookup to...
(aarch64_lookup_simd_type_in_table): ...this function.
(aarch64_init_crc32_builtins): Update to use aarch64_simd_builtin_type.
(aarch64_init_fcmla_laneq_builtins): Ditto.
(aarch64_init_simd_builtin_functions): Ditto.

---


diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
55ad2e8b6831d6cc2b039270c8656d429347092d..cd7c2a79d9b4d67adf1d9de1f9b56eb3a0d1ee2b
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -788,12 +788,13 @@ aarch64_general_mangle_builtin_type (const_tree type)
   return NULL;
 }
 
+/* Helper function for aarch64_simd_builtin_type. */
 static tree
-aarch64_simd_builtin_std_type (machine_mode mode,
-  enum aarch64_type_qualifiers q)
+aarch64_int_or_fp_type (machine_mode mode,
+  enum aarch64_type_qualifiers qualifiers)
 {
-#define QUAL_TYPE(M)  \
-  ((q == qualifier_none) ? int##M##_type_node : unsigned_int##M##_type_node);
+#define QUAL_TYPE(M) ((qualifiers & qualifier_unsigned) \
+  ? unsigned_int##M##_type_node : int##M##_type_node);
   switch (mode)
 {
 case E_QImode:
@@ -826,16 +827,14 @@ aarch64_simd_builtin_std_type (machine_mode mode,
 #undef QUAL_TYPE
 }
 
+/* Helper function for aarch64_simd_builtin_type. */
 static tree
-aarch64_lookup_simd_builtin_type (machine_mode mode,
- enum aarch64_type_qualifiers q)
+aarch64_lookup_simd_type_in_table (machine_mode mode,
+  enum aarch64_type_qualifiers qualifiers)
 {
   int i;
   int nelts = sizeof (aarch64_simd_types) / sizeof (aarch64_simd_types[0]);
-
-  /* Non-poly scalar modes map to standard types not in the table.  */
-  if (q != qualifier_poly && !VECTOR_MODE_P (mode))
-return aarch64_simd_builtin_std_type (mode, q);
+  int q = qualifiers & (qualifier_poly | qualifier_unsigned);
 
   for (i = 0; i < nelts; i++)
 {
@@ -852,16 +851,32 @@ aarch64_lookup_simd_builtin_type (machine_mode mode,
   return NULL_TREE;
 }
 
+/* Return a type for an operand with specified mode and qualifiers. */
 static tree
 aarch64_simd_builtin_type (machine_mode mode,
-  bool unsigned_p, bool poly_p)
+  enum aarch64_type_qualifiers qualifiers)
 {
-  if (poly_p)
-return aarch64_lookup_simd_builtin_type (mode, qualifier_poly);
-  else if (unsigned_p)
-return aarch64_lookup_simd_builtin_type (mode, qualifier_unsigned);
+  tree type = NULL_TREE;
+
+  /* For pointers, we want a pointer to the basic type of the vector.  */
+  if ((qualifiers & qualifier_pointer) && VECTOR_MODE_P (mode))
+mode = GET_MODE_INNER (mode);
+
+  /* Non-poly scalar modes map to standard types not in the table.  */
+  if ((qualifiers & qualifier_poly) || VECTOR_MODE_P (mode))
+type = aarch64_lookup_simd_type_in_table (mode, qualifiers);
   else
-return aarch64_lookup_simd_builtin_type (mode, qualifier_none);
+type = aarch64_int_or_fp_type (mode, qualifiers);
+
+  gcc_assert (type != NULL_TREE);
+
+  /* Add qualifiers.  */
+  if (qualifiers & qualifier_const)
+type = build_qualified_type (type, TYPE_QUAL_CONST);
+  if (qualifiers & qualifier_pointer)
+type = build_pointer_type (type);
+
+  return type;
 }
  
 static void
@@ -1110,12 +1125,11 @@ aarch64_init_fcmla_laneq_builtins (void)
 {
   aarch64_fcmla_laneq_builtin_datum* d
= _fcmla_lane_builtin_data[i];
-  tree 

Re: [PATCH v2 1/2] aarch64: Don't return invalid GIMPLE assign statements

2022-07-15 Thread Andrew Carlotti via Gcc-patches
On Wed, Jul 13, 2022 at 02:32:16PM +0200, Richard Biener wrote:
> On Wed, Jul 13, 2022 at 12:50 PM Andrew Carlotti
>  wrote:
> > I specifically wanted to avoid not folding the call, because always
> > folding means that the builtin doesn't need to be implemented anywhere
> > else (which isn't relevant here, but may become relevant when folding
> > newly defined builtins in the future).
> >
> > I considered dropping the statement, but I wasn't sure at the time that
> > I could do it safely. I could send a patch to instead replace new_stmt
> > with a GIMPLE_NOP.
> 
> If you can be sure there's no side-effect on the RHS then I think
> I'd prefer that over allocating an SSA name for something that's
> going to be DCEd anyway.
> 
> Richard.

I discussed this off-list with Richard Sandiford, and we agreed that it
would be better to leave this code as it is.

The only time this form is likely to arise is if the statement has
side-effects (e.g. reading from volatile memory or triggering
floating-point exceptions), in which case we can't just replace it with
a nop. On the other hand, in the event that someone has written an
entirely redundant statement, then it will quickly get eliminated
anyway.

Adding code to distinguish between the two cases here, or to handle
the hard case, is unnecessary and wouldn't be worthwhile.
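
(A hedged example of the awkward case, modelled on the test below: in

  void f (int8_t *p)
  {
    __builtin_aarch64_ld1v16qi (p); /* result unused */
  }

the folded assign has no lhs, but the load may still have observable
effects - e.g. reading from volatile memory, or trapping under
-fnon-call-exceptions - so the statement cannot blindly become a
GIMPLE_NOP.)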

> > > >> gcc/ChangeLog:
> > > >>
> > > >>  * config/aarch64/aarch64-builtins.cc
> > > >>  (aarch64_general_gimple_fold_builtin): Add fixup for invalid GIMPLE.
> > > >>
> > > >> gcc/testsuite/ChangeLog:
> > > >>
> > > >>  * gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c: New test.
> > > >>
> > > >> ---
> > > >>
> > > >> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> > > >> b/gcc/config/aarch64/aarch64-builtins.cc
> > > >> index 
> > > >> e0a741ac663188713e21f457affa57217d074783..5753988a9964967c27a03aca5fddb9025fd8ed6e
> > > >>  100644
> > > >> --- a/gcc/config/aarch64/aarch64-builtins.cc
> > > >> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > > >> @@ -3022,6 +3022,16 @@ aarch64_general_gimple_fold_builtin (unsigned 
> > > >> int fcode, gcall *stmt,
> > > >>  default:
> > > >>break;
> > > >>  }
> > > >> +
> > > >> +  /* GIMPLE assign statements (unlike calls) require a non-null lhs. 
> > > >> If we
> > > >> + created an assign statement with a null lhs, then fix this by 
> > > >> assigning
> > > >> + to a new (and subsequently unused) variable. */
> > > >> +  if (new_stmt && is_gimple_assign (new_stmt) && !gimple_assign_lhs 
> > > >> (new_stmt))
> > > >> +{
> > > >> +  tree new_lhs = make_ssa_name (gimple_call_return_type (stmt));
> > > >> +  gimple_assign_set_lhs (new_stmt, new_lhs);
> > > >> +}
> > > >> +
> > > >>return new_stmt;
> > > >>  }
> > > >>
> > > >> diff --git 
> > > >> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c
> > > >>  
> > > >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c
> > > >> new file mode 100644
> > > >> index 
> > > >> ..345307456b175307f5cb22de5e59cfc6254f2737
> > > >> --- /dev/null
> > > >> +++ 
> > > >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c
> > > >> @@ -0,0 +1,9 @@
> > > >> +/* { dg-do compile { target { aarch64*-*-* } } } */
> > > >> +
> > > >> +#include 
> > > >> +
> > > >> +int8_t *bar();
> > > >> +
> > > >> +void foo() {
> > > >> +  __builtin_aarch64_ld1v16qi(bar());
> > > >> +}


[committed] MAINTAINERS: Add myself to Write After Approval

2022-07-15 Thread Andrew Carlotti via Gcc-patches
ChangeLog:

* MAINTAINERS: Add myself to Write After Approval.

diff --git a/MAINTAINERS b/MAINTAINERS
index 
7d9aab76dd9676c806bd08abc7542553fcf81928..7a7ad42ced3027f1f7970916b355fd5fc7b0088c
 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -352,6 +352,7 @@ Kevin Buettner  

 Andrew Burgess 
 Adam Butcher   
 Andrew Cagney  
+Andrew Carlotti

 Daniel Carrera 
 Stephane Carrez
 Gabriel Charette   


[PATCH v2 4/4] aarch64: Move vreinterpret definitions into the compiler

2022-07-13 Thread Andrew Carlotti via Gcc-patches
This removes a significant number of intrinsic definitions from the arm_neon.h
header file, and reduces the amount of code duplication. The new macros and
data structures are intended to also facilitate moving other intrinsic
definitions out of the header file in future.

There is a slight change in the behaviour of the bf16 vreinterpret intrinsics
when compiling without bf16 support. Expressions like:

b = vreinterpretq_s32_bf16(vreinterpretq_bf16_s64(a))

are now compiled successfully, instead of causing a 'target specific option
mismatch' during inlining.
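
For example (a hedged sketch, not from the patch), a function such as

  int32x4_t roundtrip (int64x2_t a)
  {
    return vreinterpretq_s32_bf16 (vreinterpretq_bf16_s64 (a));
  }

now compiles without +bf16, because the folded calls no longer pass
through target-gated inline functions.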

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc
(MODE_d_bf16, MODE_d_f16, MODE_d_f32, MODE_d_f64, MODE_d_s8)
(MODE_d_s16, MODE_d_s32, MODE_d_s64, MODE_d_u8, MODE_d_u16)
(MODE_d_u32, MODE_d_u64, MODE_d_p8, MODE_d_p16, MODE_d_p64)
(MODE_q_bf16, MODE_q_f16, MODE_q_f32, MODE_q_f64, MODE_q_s8)
(MODE_q_s16, MODE_q_s32, MODE_q_s64, MODE_q_u8, MODE_q_u16)
(MODE_q_u32, MODE_q_u64, MODE_q_p8, MODE_q_p16, MODE_q_p64)
(MODE_q_p128): Define macro to map to corresponding mode name.
(QUAL_bf16, QUAL_f16, QUAL_f32, QUAL_f64, QUAL_s8, QUAL_s16)
(QUAL_s32, QUAL_s64, QUAL_u8, QUAL_u16, QUAL_u32, QUAL_u64)
(QUAL_p8, QUAL_p16, QUAL_p64, QUAL_p128): Define macro to map to
corresponding qualifier name.
(LENGTH_d, LENGTH_q): Define macro to map to "" or "q" suffix.
(SIMD_INTR_MODE, SIMD_INTR_QUAL, SIMD_INTR_LENGTH_CHAR): Macro
functions for the above mappings.
(VREINTERPRET_BUILTIN2, VREINTERPRET_BUILTINS1, VREINTERPRET_BUILTINS)
(VREINTERPRETQ_BUILTIN2, VREINTERPRETQ_BUILTINS1)
(VREINTERPRETQ_BUILTINS, VREINTERPRET_BUILTIN)
(AARCH64_SIMD_VREINTERPRET_BUILTINS): New macros to create definitions
for all vreinterpret intrinsics.
(enum aarch64_builtins): Add vreinterpret function codes.
(aarch64_init_simd_intrinsics): New.
(handle_arm_neon_h): Improved comment.
(aarch64_general_fold_builtin): Fold vreinterpret calls.
* config/aarch64/arm_neon.h
(vreinterpret_p8_f16, vreinterpret_p8_f64, vreinterpret_p8_s8)
(vreinterpret_p8_s16, vreinterpret_p8_s32, vreinterpret_p8_s64)
(vreinterpret_p8_f32, vreinterpret_p8_u8, vreinterpret_p8_u16)
(vreinterpret_p8_u32, vreinterpret_p8_u64, vreinterpret_p8_p16)
(vreinterpret_p8_p64, vreinterpretq_p8_f64, vreinterpretq_p8_s8)
(vreinterpretq_p8_s16, vreinterpretq_p8_s32, vreinterpretq_p8_s64)
(vreinterpretq_p8_f16, vreinterpretq_p8_f32, vreinterpretq_p8_u8)
(vreinterpretq_p8_u16, vreinterpretq_p8_u32, vreinterpretq_p8_u64)
(vreinterpretq_p8_p16, vreinterpretq_p8_p64, vreinterpretq_p8_p128)
(vreinterpret_p16_f16, vreinterpret_p16_f64, vreinterpret_p16_s8)
(vreinterpret_p16_s16, vreinterpret_p16_s32, vreinterpret_p16_s64)
(vreinterpret_p16_f32, vreinterpret_p16_u8, vreinterpret_p16_u16)
(vreinterpret_p16_u32, vreinterpret_p16_u64, vreinterpret_p16_p8)
(vreinterpret_p16_p64, vreinterpretq_p16_f64, vreinterpretq_p16_s8)
(vreinterpretq_p16_s16, vreinterpretq_p16_s32, vreinterpretq_p16_s64)
(vreinterpretq_p16_f16, vreinterpretq_p16_f32, vreinterpretq_p16_u8)
(vreinterpretq_p16_u16, vreinterpretq_p16_u32, vreinterpretq_p16_u64)
(vreinterpretq_p16_p8, vreinterpretq_p16_p64, vreinterpretq_p16_p128)
(vreinterpret_p64_f16, vreinterpret_p64_f64, vreinterpret_p64_s8)
(vreinterpret_p64_s16, vreinterpret_p64_s32, vreinterpret_p64_s64)
(vreinterpret_p64_f32, vreinterpret_p64_u8, vreinterpret_p64_u16)
(vreinterpret_p64_u32, vreinterpret_p64_u64, vreinterpret_p64_p8)
(vreinterpret_p64_p16, vreinterpretq_p64_f64, vreinterpretq_p64_s8)
(vreinterpretq_p64_s16, vreinterpretq_p64_s32, vreinterpretq_p64_s64)
(vreinterpretq_p64_f16, vreinterpretq_p64_f32, vreinterpretq_p64_p128)
(vreinterpretq_p64_u8, vreinterpretq_p64_u16, vreinterpretq_p64_p16)
(vreinterpretq_p64_u32, vreinterpretq_p64_u64, vreinterpretq_p64_p8)
(vreinterpretq_p128_p8, vreinterpretq_p128_p16, vreinterpretq_p128_f16)
(vreinterpretq_p128_f32, vreinterpretq_p128_p64, vreinterpretq_p128_s64)
(vreinterpretq_p128_u64, vreinterpretq_p128_s8, vreinterpretq_p128_s16)
(vreinterpretq_p128_s32, vreinterpretq_p128_u8, vreinterpretq_p128_u16)
(vreinterpretq_p128_u32, vreinterpret_f16_f64, vreinterpret_f16_s8)
(vreinterpret_f16_s16, vreinterpret_f16_s32, vreinterpret_f16_s64)
(vreinterpret_f16_f32, vreinterpret_f16_u8, vreinterpret_f16_u16)
(vreinterpret_f16_u32, vreinterpret_f16_u64, vreinterpret_f16_p8)
(vreinterpret_f16_p16, vreinterpret_f16_p64, vreinterpretq_f16_f64)
(vreinterpretq_f16_s8, vreinterpretq_f16_s16, vreinterpretq_f16_s32)
(vreinterpretq_f16_s64, vreinterpretq_f16_f32, vreinterpretq_f16_u8)

[PATCH v2 2/4] aarch64: Remove qualifier_internal

2022-07-13 Thread Andrew Carlotti via Gcc-patches
This has been unused since 2014, so there's no reason to retain it.

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc
(enum aarch64_type_qualifiers): Remove qualifier_internal.
(aarch64_init_simd_builtin_functions): Remove qualifier_internal check.

---

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
52d27c6978990ca3e6c523654fe1cdc952e77ad7..55ad2e8b6831d6cc2b039270c8656d429347092d
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -145,9 +145,7 @@ enum aarch64_type_qualifiers
   qualifier_maybe_immediate = 0x10, /* 1 << 4  */
   /* void foo (...).  */
   qualifier_void = 0x20, /* 1 << 5  */
-  /* Some patterns may have internal operands, this qualifier is an
- instruction to the initialisation code to skip this operand.  */
-  qualifier_internal = 0x40, /* 1 << 6  */
+  /* 1 << 6 is now unused */
   /* Some builtins should use the T_*mode* encoded in a simd_builtin_datum
  rather than using the type of the operand.  */
   qualifier_map_mode = 0x80, /* 1 << 7  */
@@ -1207,10 +1205,6 @@ aarch64_init_simd_builtin_functions (bool 
called_from_pragma)
  else
type_signature[op_num] = 's';
 
- /* Skip an internal operand for vget_{low, high}.  */
- if (qualifiers & qualifier_internal)
-   continue;
-
  /* Some builtins have different user-facing types
 for certain arguments, encoded in d->mode.  */
  if (qualifiers & qualifier_map_mode)


[PATCH v2 3/4] aarch64: Consolidate simd type lookup functions

2022-07-13 Thread Andrew Carlotti via Gcc-patches
There were several similarly-named functions, which each built or looked up a
type using a different subset of valid modes or qualifiers.

This change combines these all into a single function, which can additionally
handle const and pointer qualifiers.

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc
(aarch64_simd_builtin_std_type, aarch64_lookup_simd_builtin_type)
(aarch64_simd_builtin_type): Combine and replace with...
(aarch64_build_simd_builtin_type): ...this new function.
(aarch64_init_fcmla_laneq_builtins): Update to call new function.
(aarch64_init_simd_builtin_functions): Ditto.
(aarch64_init_crc32_builtins): Ditto.

---

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
55ad2e8b6831d6cc2b039270c8656d429347092d..6b413a36a09c7a4ac41b0fe7c414a3247580f222
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -789,79 +789,101 @@ aarch64_general_mangle_builtin_type (const_tree type)
 }
 
 static tree
-aarch64_simd_builtin_std_type (machine_mode mode,
-  enum aarch64_type_qualifiers q)
-{
-#define QUAL_TYPE(M)  \
-  ((q == qualifier_none) ? int##M##_type_node : unsigned_int##M##_type_node);
-  switch (mode)
-{
-case E_QImode:
-  return QUAL_TYPE (QI);
-case E_HImode:
-  return QUAL_TYPE (HI);
-case E_SImode:
-  return QUAL_TYPE (SI);
-case E_DImode:
-  return QUAL_TYPE (DI);
-case E_TImode:
-  return QUAL_TYPE (TI);
-case E_OImode:
-  return aarch64_simd_intOI_type_node;
-case E_CImode:
-  return aarch64_simd_intCI_type_node;
-case E_XImode:
-  return aarch64_simd_intXI_type_node;
-case E_HFmode:
-  return aarch64_fp16_type_node;
-case E_SFmode:
-  return float_type_node;
-case E_DFmode:
-  return double_type_node;
-case E_BFmode:
-  return aarch64_bf16_type_node;
-default:
-  gcc_unreachable ();
-}
-#undef QUAL_TYPE
-}
-
-static tree
-aarch64_lookup_simd_builtin_type (machine_mode mode,
- enum aarch64_type_qualifiers q)
+aarch64_build_simd_builtin_type (machine_mode mode,
+enum aarch64_type_qualifiers qualifiers)
 {
+  tree type = NULL_TREE;
   int i;
   int nelts = sizeof (aarch64_simd_types) / sizeof (aarch64_simd_types[0]);
 
-  /* Non-poly scalar modes map to standard types not in the table.  */
-  if (q != qualifier_poly && !VECTOR_MODE_P (mode))
-return aarch64_simd_builtin_std_type (mode, q);
+  /* For pointers, we want a pointer to the basic type of the vector.  */
+  if ((qualifiers & qualifier_pointer) && VECTOR_MODE_P (mode))
+mode = GET_MODE_INNER (mode);
 
-  for (i = 0; i < nelts; i++)
+  if ((qualifiers & qualifier_poly) || VECTOR_MODE_P (mode))
 {
-  if (aarch64_simd_types[i].mode == mode
- && aarch64_simd_types[i].q == q)
-   return aarch64_simd_types[i].itype;
-  if (aarch64_simd_tuple_types[i][0] != NULL_TREE)
-   for (int j = 0; j < 3; j++)
- if (aarch64_simd_tuple_modes[i][j] == mode
+  int q = qualifiers & (qualifier_poly | qualifier_unsigned);
+  /* Poly or vector modes map to types in the table.  */
+  for (i = 0; i < nelts; i++)
+   {
+ if (aarch64_simd_types[i].mode == mode
  && aarch64_simd_types[i].q == q)
-   return aarch64_simd_tuple_types[i][j];
+   {
+ type = aarch64_simd_types[i].itype;
+ goto finished_type_lookup;
+   }
+ if (aarch64_simd_tuple_types[i][0] != NULL_TREE)
+   {
+ for (int j = 0; j < 3; j++)
+   {
+ if (aarch64_simd_tuple_modes[i][j] == mode
+   && aarch64_simd_types[i].q == q)
+   {
+ type = aarch64_simd_tuple_types[i][j];
+ goto finished_type_lookup;
+   }
+   }
+   }
+   }
 }
+  else
+{
+  /* Non-poly scalar modes map to standard types.  */
+#define QUAL_TYPE(M) ((qualifiers & qualifier_unsigned) \
+  ? unsigned_int##M##_type_node : int##M##_type_node);
+  switch (mode)
+   {
+   case E_QImode:
+ type = QUAL_TYPE (QI);
+ break;
+   case E_HImode:
+ type = QUAL_TYPE (HI);
+ break;
+   case E_SImode:
+ type = QUAL_TYPE (SI);
+ break;
+   case E_DImode:
+ type = QUAL_TYPE (DI);
+ break;
+   case E_TImode:
+ type = QUAL_TYPE (TI);
+ break;
+   case E_OImode:
+ type = aarch64_simd_intOI_type_node;
+ break;
+   case E_CImode:
+ type = aarch64_simd_intCI_type_node;
+ break;
+   case E_XImode:
+ type = aarch64_simd_intXI_type_node;
+ break;
+   case E_HFmode:
+ type = aarch64_fp16_type_node;
+ 

[PATCH v2 1/4] aarch64: Add V1DI mode

2022-07-13 Thread Andrew Carlotti via Gcc-patches
We already have a V1DF mode, so this makes the vector modes more consistent.

Additionally, this allows us to recognise uint64x1_t and int64x1_t types given
only the mode and type qualifiers (e.g. in aarch64_lookup_simd_builtin_type).
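
(For illustration: previously int64_t and int64x1_t both mapped to
DImode and were indistinguishable by mode alone; with this patch,

  int64_t   s;  /* DImode scalar */
  int64x1_t v;  /* V1DImode single-element vector */

carry distinct modes.)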

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc
(v1di_UP): Add V1DI mode to _UP macros.
* config/aarch64/aarch64-modes.def (VECTOR_MODE): Add V1DI mode.
* config/aarch64/aarch64-simd-builtin-types.def: Use V1DI mode.
* config/aarch64/aarch64-simd.md
(vec_extractv2dfv1df): Replace with...
(vec_extract<mode><V1half>): ...this.
* config/aarch64/aarch64.cc (aarch64_classify_vector_mode): Add V1DI
mode.
* config/aarch64/iterators.md
(VQ_2E, V1HALF, V1half): New.
(nunits): Add V1DI mode.

---

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
c21476d7ae963450b12efa24418ce4004a3c74bf..52d27c6978990ca3e6c523654fe1cdc952e77ad7
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -55,6 +55,7 @@
 #define v2si_UP  E_V2SImode
 #define v2sf_UP  E_V2SFmode
 #define v1df_UP  E_V1DFmode
+#define v1di_UP  E_V1DImode
#define di_UP    E_DImode
#define df_UP    E_DFmode
 #define v16qi_UP E_V16QImode
diff --git a/gcc/config/aarch64/aarch64-modes.def 
b/gcc/config/aarch64/aarch64-modes.def
index 
8f399225a8048d93108e33e9d49c736aeb5612ce..d3c9b74434cd2c0d0cb1a2fd26af8c0bf38a4cfa
 100644
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -70,6 +70,7 @@ VECTOR_MODES (INT, 8);/*   V8QI V4HI V2SI.  */
 VECTOR_MODES (INT, 16);   /* V16QI V8HI V4SI V2DI.  */
 VECTOR_MODES (FLOAT, 8);  /* V2SF.  */
 VECTOR_MODES (FLOAT, 16); /*V4SF V2DF.  */
+VECTOR_MODE (INT, DI, 1); /* V1DI.  */
 VECTOR_MODE (FLOAT, DF, 1);   /* V1DF.  */
 VECTOR_MODE (FLOAT, HF, 2);   /* V2HF.  */
 
diff --git a/gcc/config/aarch64/aarch64-simd-builtin-types.def 
b/gcc/config/aarch64/aarch64-simd-builtin-types.def
index 
248e51e96549fb640817d79c099a3f5e62c71317..40545581408e2ee2be84f08abb5801058c4ea42e
 100644
--- a/gcc/config/aarch64/aarch64-simd-builtin-types.def
+++ b/gcc/config/aarch64/aarch64-simd-builtin-types.def
@@ -24,7 +24,7 @@
   ENTRY (Int16x8_t, V8HI, none, 11)
   ENTRY (Int32x2_t, V2SI, none, 11)
   ENTRY (Int32x4_t, V4SI, none, 11)
-  ENTRY (Int64x1_t, DI, none, 11)
+  ENTRY (Int64x1_t, V1DI, none, 11)
   ENTRY (Int64x2_t, V2DI, none, 11)
   ENTRY (Uint8x8_t, V8QI, unsigned, 11)
   ENTRY (Uint8x16_t, V16QI, unsigned, 12)
@@ -32,7 +32,7 @@
   ENTRY (Uint16x8_t, V8HI, unsigned, 12)
   ENTRY (Uint32x2_t, V2SI, unsigned, 12)
   ENTRY (Uint32x4_t, V4SI, unsigned, 12)
-  ENTRY (Uint64x1_t, DI, unsigned, 12)
+  ENTRY (Uint64x1_t, V1DI, unsigned, 12)
   ENTRY (Uint64x2_t, V2DI, unsigned, 12)
   ENTRY (Poly8_t, QI, poly, 9)
   ENTRY (Poly16_t, HI, poly, 10)
@@ -42,7 +42,7 @@
   ENTRY (Poly8x16_t, V16QI, poly, 12)
   ENTRY (Poly16x4_t, V4HI, poly, 12)
   ENTRY (Poly16x8_t, V8HI, poly, 12)
-  ENTRY (Poly64x1_t, DI, poly, 12)
+  ENTRY (Poly64x1_t, V1DI, poly, 12)
   ENTRY (Poly64x2_t, V2DI, poly, 12)
   ENTRY (Float16x4_t, V4HF, none, 13)
   ENTRY (Float16x8_t, V8HF, none, 13)
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
a00e1c6ef8d6b43d8b1a0fe4701e6b8c1f0f622f..587a45d77721e1b39accbad7dbeca4d741eccb10
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -8026,16 +8026,16 @@
 })
 
 ;; Extract a single-element 64-bit vector from one half of a 128-bit vector.
-(define_expand "vec_extractv2dfv1df"
-  [(match_operand:V1DF 0 "register_operand")
-   (match_operand:V2DF 1 "register_operand")
+(define_expand "vec_extract"
+  [(match_operand: 0 "register_operand")
+   (match_operand:VQ_2E 1 "register_operand")
(match_operand 2 "immediate_operand")]
   "TARGET_SIMD"
 {
-  /* V1DF is rarely used by other patterns, so it should be better to hide
- it in a subreg destination of a normal DF op.  */
-  rtx scalar0 = gen_lowpart (DFmode, operands[0]);
-  emit_insn (gen_vec_extractv2dfdf (scalar0, operands[1], operands[2]));
+  /* V1DI and V1DF are rarely used by other patterns, so it should be better
+ to hide it in a subreg destination of a normal DI or DF op.  */
+  rtx scalar0 = gen_lowpart (<VEL>mode, operands[0]);
+  emit_insn (gen_vec_extract<mode><Vel> (scalar0, operands[1], operands[2]));
   DONE;
 })
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
f650abbc4ce49cf0947049931f86bad1130c3428..278910af0a38c0203a962d34c6792191f0fe9e31
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -3568,7 +3568,7 @@ aarch64_classify_vector_mode (machine_mode mode)
 case E_V8QImode:
 case E_V4HImode:
 case E_V2SImode:
-/* ...E_V1DImode doesn't exist.  */
+case E_V1DImode:
 

Re: [PATCH v2 1/2] aarch64: Don't return invalid GIMPLE assign statements

2022-07-13 Thread Andrew Carlotti via Gcc-patches
On Wed, Jul 13, 2022 at 09:10:25AM +0100, Richard Sandiford wrote:
> Richard Biener via Gcc-patches  writes:
> > On Tue, Jul 12, 2022 at 4:38 PM Andrew Carlotti  
> > wrote:
> >>
> >> aarch64_general_gimple_fold_builtin doesn't check whether the LHS of a
> >> function call is null before converting it to an assign statement. To avoid
> >> returning an invalid GIMPLE statement in this case, we instead assign the
> >> expression result to a new (unused) variable.
> >>
> >> This change only affects code that:
> >> 1) Calls an intrinsic function that has no side effects;
> >> 2) Does not use or store the value returned by the intrinsic;
> >> 3) Uses parameters that prevent the front-end eliminating the call prior to
> >> gimplification.
> >>
> >> The ICE is unlikely to have occurred in the wild, as it relies on the 
> >> presence
> >> of a redundant intrinsic call.
> >
> > Other targets usually simply refrain from folding intrinsic calls with no 
> > LHS.
> > Another option is to just drop it on the floor if it does not have any
> > side-effects which for the gimple_fold_builtin hook means folding it to
> > a GIMPLE_NOP (gimple_build_nop ()).
> 
> Sorry, I just pushed the patch before seeing this.
> 
> I guess the problem with refraining from folding calls with no lhs
> is that it has to be done on a per-function basis.  (E.g. stores
> should still be folded.)  It then becomes something that we need
> to remember for each individual call.  E.g. ix86_gimple_fold_builtin
> seems to have three different pieces of code for handling null lhses,
> even with its heavy use of gotos.
> 
> So a nice thing about the current patch is that it handles all this
> in one place only.
> 
> Thanks,
> Richard

I specifically wanted to avoid not folding the call, because always
folding means that the builtin doesn't need to be implemented anywhere
else (which isn't relevant here, but may become relevant when folding
newly defined builtins in the future).

I considered dropping the statement, but I wasn't sure at the time that
I could do it safely. I could send a patch to instead replace new_stmt
with a GIMPLE_NOP.

> >> gcc/ChangeLog:
> >>
> >>  * config/aarch64/aarch64-builtins.cc
> >>  (aarch64_general_gimple_fold_builtin): Add fixup for invalid GIMPLE.
> >>
> >> gcc/testsuite/ChangeLog:
> >>
> >>  * gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c: New test.
> >>
> >> ---
> >>
> >> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> >> b/gcc/config/aarch64/aarch64-builtins.cc
> >> index 
> >> e0a741ac663188713e21f457affa57217d074783..5753988a9964967c27a03aca5fddb9025fd8ed6e
> >>  100644
> >> --- a/gcc/config/aarch64/aarch64-builtins.cc
> >> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> >> @@ -3022,6 +3022,16 @@ aarch64_general_gimple_fold_builtin (unsigned int 
> >> fcode, gcall *stmt,
> >>  default:
> >>break;
> >>  }
> >> +
> >> +  /* GIMPLE assign statements (unlike calls) require a non-null lhs. If we
> >> + created an assign statement with a null lhs, then fix this by 
> >> assigning
> >> + to a new (and subsequently unused) variable. */
> >> +  if (new_stmt && is_gimple_assign (new_stmt) && !gimple_assign_lhs 
> >> (new_stmt))
> >> +{
> >> +  tree new_lhs = make_ssa_name (gimple_call_return_type (stmt));
> >> +  gimple_assign_set_lhs (new_stmt, new_lhs);
> >> +}
> >> +
> >>return new_stmt;
> >>  }
> >>
> >> diff --git 
> >> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c 
> >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c
> >> new file mode 100644
> >> index 
> >> ..345307456b175307f5cb22de5e59cfc6254f2737
> >> --- /dev/null
> >> +++ 
> >> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c
> >> @@ -0,0 +1,9 @@
> >> +/* { dg-do compile { target { aarch64*-*-* } } } */
> >> +
> >> +#include 
> >> +
> >> +int8_t *bar();
> >> +
> >> +void foo() {
> >> +  __builtin_aarch64_ld1v16qi(bar());
> >> +}


[PATCH v2 2/2] aarch64: Lower vcombine to GIMPLE

2022-07-12 Thread Andrew Carlotti via Gcc-patches
This lowers vcombine intrinsics to a GIMPLE vector constructor, which enables
better optimisation during GIMPLE passes.
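
A hedged sketch of the resulting GIMPLE for the little-endian case:

  c_3 = {a_1(D), b_2(D)};  /* vector constructor replacing the call */

with the two halves swapped on big-endian targets, as in the hunk below.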

gcc/

* config/aarch64/aarch64-builtins.cc
(aarch64_general_gimple_fold_builtin): Add combine.

gcc/testsuite/

* gcc.target/aarch64/advsimd-intrinsics/combine.c:
New test.

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
5753988a9964967c27a03aca5fddb9025fd8ed6e..a25756cfed5fab3a98ebf3e2ee29a5e117cbd2aa
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2857,6 +2857,28 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, 
gcall *stmt,
gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
break;
 
+ BUILTIN_VDC (BINOP, combine, 0, AUTO_FP)
+ BUILTIN_VD_I (BINOPU, combine, 0, NONE)
+ BUILTIN_VDC_P (BINOPP, combine, 0, NONE)
+   {
+ tree first_part, second_part;
+ if (BYTES_BIG_ENDIAN)
+   {
+ second_part = args[0];
+ first_part = args[1];
+   }
+ else
+   {
+ first_part = args[0];
+ second_part = args[1];
+   }
+ tree ret_type = TREE_TYPE (gimple_call_lhs (stmt));
+ tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, first_part,
+   NULL_TREE, second_part);
+ new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor);
+   }
+   break;
+
  /*lower store and load neon builtins to gimple.  */
  BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
  BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c 
b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c
new file mode 100644
index 
..d08faf7a4a160a1e83428ed9b270731bbf7b8c8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include 
+
+/*
+** foo:
** umov    w0, v1\.s\[1\]
+** ret
+*/
+
+int32_t foo (int32x2_t a, int32x2_t b)
+{
+  int32x4_t c = vcombine_s32(a, b);
+  return vgetq_lane_s32(c, 3);
+}
+


[PATCH v2 1/2] aarch64: Don't return invalid GIMPLE assign statements

2022-07-12 Thread Andrew Carlotti via Gcc-patches
aarch64_general_gimple_fold_builtin doesn't check whether the LHS of a
function call is null before converting it to an assign statement. To avoid
returning an invalid GIMPLE statement in this case, we instead assign the
expression result to a new (unused) variable.

This change only affects code that:
1) Calls an intrinsic function that has no side effects;
2) Does not use or store the value returned by the intrinsic;
3) Uses parameters that prevent the front-end eliminating the call prior to
gimplification.

The ICE is unlikely to have occurred in the wild, as it relies on the presence
of a redundant intrinsic call.

gcc/ChangeLog:

 * config/aarch64/aarch64-builtins.cc
 (aarch64_general_gimple_fold_builtin): Add fixup for invalid GIMPLE.

gcc/testsuite/ChangeLog:

 * gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c: New test.

---

diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
e0a741ac663188713e21f457affa57217d074783..5753988a9964967c27a03aca5fddb9025fd8ed6e
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -3022,6 +3022,16 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, 
gcall *stmt,
 default:
   break;
 }
+
+  /* GIMPLE assign statements (unlike calls) require a non-null lhs. If we
+ created an assign statement with a null lhs, then fix this by assigning
+ to a new (and subsequently unused) variable. */
+  if (new_stmt && is_gimple_assign (new_stmt) && !gimple_assign_lhs (new_stmt))
+{
+  tree new_lhs = make_ssa_name (gimple_call_return_type (stmt));
+  gimple_assign_set_lhs (new_stmt, new_lhs);
+}
+
   return new_stmt;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..345307456b175307f5cb22de5e59cfc6254f2737
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/ignored_return_1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+
+#include <arm_neon.h>
+
+int8_t *bar();
+
+void foo() {
+  __builtin_aarch64_ld1v16qi(bar());
+}


Re: [PATCH] aarch64: Fix pure/const function attributes for intrinsics

2022-07-01 Thread Andrew Carlotti via Gcc-patches
On Fri, Jul 01, 2022 at 08:42:15AM +0200, Richard Biener wrote:
> On Thu, Jun 30, 2022 at 6:04 PM Andrew Carlotti via Gcc-patches
>  wrote:
> > diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> > index e0a741ac663188713e21f457affa57217d074783..877f54aab787862794413259cd36ca0fb7bd49c5 100644
> > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > @@ -1085,9 +1085,9 @@ aarch64_get_attributes (unsigned int f, machine_mode mode)
> >if (!aarch64_modifies_global_state_p (f, mode))
> >  {
> >if (aarch64_reads_global_state_p (f, mode))
> > -   attrs = aarch64_add_attribute ("pure", attrs);
> > -  else
> > attrs = aarch64_add_attribute ("const", attrs);
> > +  else
> > +   attrs = aarch64_add_attribute ("pure", attrs);
> 
> that looks backwards.  'pure' allows read of global memory while
> 'const' does not.  Is
> aarch64_reads_global_state_p really backwards?

Oh - the thing that's backwards is my understanding of what "pure" and
"const" mean. Their meanings as GCC function attributes seem to be
approximately the opposite way round to their meanings in general usage.
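
For reference, a minimal sketch of the GCC attribute semantics:

/* GCC "const": the result depends only on the arguments; the function
   must not read or write global memory.  */
__attribute__ ((const)) int square (int x);

/* GCC "pure": the function may read (but not write) global state, such
   as a lookup table, so calls can be CSEd between stores.  */
extern int table[256];
__attribute__ ((pure)) int lookup (int i);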


[PATCH] aarch64: Fix pure/const function attributes for intrinsics

2022-06-30 Thread Andrew Carlotti via Gcc-patches
No testcase for this, since I haven't found a way to turn the incorrect
attribute into incorrect codegen.

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/

* config/aarch64/aarch64-builtins.cc
(aarch64_get_attributes): Fix choice of pure/const attributes.

---

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index e0a741ac663188713e21f457affa57217d074783..877f54aab787862794413259cd36ca0fb7bd49c5 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -1085,9 +1085,9 @@ aarch64_get_attributes (unsigned int f, machine_mode mode)
   if (!aarch64_modifies_global_state_p (f, mode))
 {
   if (aarch64_reads_global_state_p (f, mode))
-   attrs = aarch64_add_attribute ("pure", attrs);
-  else
attrs = aarch64_add_attribute ("const", attrs);
+  else
+   attrs = aarch64_add_attribute ("pure", attrs);
 }

   if (!flag_non_call_exceptions || !aarch64_could_trap_p (f, mode))


[PATCH] aarch64: Move vreinterpret definitions into the compiler

2022-06-29 Thread Andrew Carlotti via Gcc-patches
Hi,

This removes a significant number of intrinsic definitions from the arm_neon.h
header file, and reduces the amount of code duplication. The new macros and
data structures are intended to also facilitate moving other intrinsic
definitions out of the header file in future.
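
(For context, each removed definition was an inline wrapper in arm_neon.h
along roughly these lines; a sketch, not the verbatim header text:)

__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_s32_s64 (int64x2_t __a)
{
  return (int32x4_t) __a;
}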

There is a slight change in the behaviour of the bf16 vreinterpret intrinsics
when compiling without bf16 support. Expressions like:

b = vreinterpretq_s32_bf16(vreinterpretq_bf16_s64(a));

are now compiled successfully, instead of causing a 'target specific option
mismatch' during inlining.
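
For instance, a round-trip like the following (a minimal sketch; the
function name is illustrative) now compiles when built without +bf16:

#include <arm_neon.h>

int32x4_t
roundtrip (int64x2_t a)
{
  return vreinterpretq_s32_bf16 (vreinterpretq_bf16_s64 (a));
}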

Bootstrapped and tested on aarch64-none-linux-gnu

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc
(v1di_UP): Add V1DI mode.
(MODE_d_bf16, MODE_d_f16, MODE_d_f32, MODE_d_f64, MODE_d_s8)
(MODE_d_s16, MODE_d_s32, MODE_d_s64, MODE_d_u8, MODE_d_u16)
(MODE_d_u32, MODE_d_u64, MODE_d_p8, MODE_d_p16, MODE_d_p64)
(MODE_q_bf16, MODE_q_f16, MODE_q_f32, MODE_q_f64, MODE_q_s8)
(MODE_q_s16, MODE_q_s32, MODE_q_s64, MODE_q_u8, MODE_q_u16)
(MODE_q_u32, MODE_q_u64, MODE_q_p8, MODE_q_p16, MODE_q_p64)
(MODE_q_p128): Define macro to map to corresponding mode name.
(QUAL_bf16, QUAL_f16, QUAL_f32, QUAL_f64, QUAL_s8, QUAL_s16)
(QUAL_s32, QUAL_s64, QUAL_u8, QUAL_u16, QUAL_u32, QUAL_u64)
(QUAL_p8, QUAL_p16, QUAL_p64, QUAL_p128): Define macro to map to
corresponding qualifier name.
(LENGTH_d, LENGTH_q): Define macro to map to "" or "q" suffix.
(SIMD_INTR_MODE, SIMD_INTR_QUAL, SIMD_INTR_LENGTH_CHAR): Macro
functions for the above mappings.
(VREINTERPRET_BUILTIN2, VREINTERPRET_BUILTINS1, VREINTERPRET_BUILTINS)
(VREINTERPRETQ_BUILTIN2, VREINTERPRETQ_BUILTINS1)
(VREINTERPRETQ_BUILTINS, VREINTERPRET_BUILTIN)
(AARCH64_SIMD_VREINTERPRET_BUILTINS): New macros to create definitions
for all vreinterpret intrinsics.
(enum aarch64_builtins): Add vreinterpret function codes.
(aarch64_init_simd_intrinsics): New.
(handle_arm_neon_h): Improved comment.
(aarch64_general_fold_builtin): Fold vreinterpret calls.
* config/aarch64/aarch64-modes.def (VECTOR_MODE): Add V1DI mode.
* config/aarch64/aarch64-simd-builtin-types.def: Use V1DI mode.
* config/aarch64/aarch64-simd.md (vec_extractv2div1di): New.
* config/aarch64/aarch64.cc (aarch64_classify_vector_mode): Add V1DI mode.
* config/aarch64/arm_neon.h
(vreinterpret_p8_f16, vreinterpret_p8_f64, vreinterpret_p8_s8)
(vreinterpret_p8_s16, vreinterpret_p8_s32, vreinterpret_p8_s64)
(vreinterpret_p8_f32, vreinterpret_p8_u8, vreinterpret_p8_u16)
(vreinterpret_p8_u32, vreinterpret_p8_u64, vreinterpret_p8_p16)
(vreinterpret_p8_p64, vreinterpretq_p8_f64, vreinterpretq_p8_s8)
(vreinterpretq_p8_s16, vreinterpretq_p8_s32, vreinterpretq_p8_s64)
(vreinterpretq_p8_f16, vreinterpretq_p8_f32, vreinterpretq_p8_u8)
(vreinterpretq_p8_u16, vreinterpretq_p8_u32, vreinterpretq_p8_u64)
(vreinterpretq_p8_p16, vreinterpretq_p8_p64, vreinterpretq_p8_p128)
(vreinterpret_p16_f16, vreinterpret_p16_f64, vreinterpret_p16_s8)
(vreinterpret_p16_s16, vreinterpret_p16_s32, vreinterpret_p16_s64)
(vreinterpret_p16_f32, vreinterpret_p16_u8, vreinterpret_p16_u16)
(vreinterpret_p16_u32, vreinterpret_p16_u64, vreinterpret_p16_p8)
(vreinterpret_p16_p64, vreinterpretq_p16_f64, vreinterpretq_p16_s8)
(vreinterpretq_p16_s16, vreinterpretq_p16_s32, vreinterpretq_p16_s64)
(vreinterpretq_p16_f16, vreinterpretq_p16_f32, vreinterpretq_p16_u8)
(vreinterpretq_p16_u16, vreinterpretq_p16_u32, vreinterpretq_p16_u64)
(vreinterpretq_p16_p8, vreinterpretq_p16_p64, vreinterpretq_p16_p128)
(vreinterpret_p64_f16, vreinterpret_p64_f64, vreinterpret_p64_s8)
(vreinterpret_p64_s16, vreinterpret_p64_s32, vreinterpret_p64_s64)
(vreinterpret_p64_f32, vreinterpret_p64_u8, vreinterpret_p64_u16)
(vreinterpret_p64_u32, vreinterpret_p64_u64, vreinterpret_p64_p8)
(vreinterpret_p64_p16, vreinterpretq_p64_f64, vreinterpretq_p64_s8)
(vreinterpretq_p64_s16, vreinterpretq_p64_s32, vreinterpretq_p64_s64)
(vreinterpretq_p64_f16, vreinterpretq_p64_f32, vreinterpretq_p64_p128)
(vreinterpretq_p64_u8, vreinterpretq_p64_u16, vreinterpretq_p64_p16)
(vreinterpretq_p64_u32, vreinterpretq_p64_u64, vreinterpretq_p64_p8)
(vreinterpretq_p128_p8, vreinterpretq_p128_p16, vreinterpretq_p128_f16)
(vreinterpretq_p128_f32, vreinterpretq_p128_p64, vreinterpretq_p128_s64)
(vreinterpretq_p128_u64, vreinterpretq_p128_s8, vreinterpretq_p128_s16)
(vreinterpretq_p128_s32, vreinterpretq_p128_u8, vreinterpretq_p128_u16)
(vreinterpretq_p128_u32, vreinterpret_f16_f64, vreinterpret_f16_s8)
(vreinterpret_f16_s16): (vreinterpret_f16_s32): (vreinterpret_f16_s64):
  

[PATCH] aarch64: Lower vcombine to GIMPLE

2022-06-07 Thread Andrew Carlotti via Gcc-patches
Hi all,

This lowers vcombine intrinsics to a GIMPLE vector constructor, which enables 
better optimisation during GIMPLE passes.

Bootstrapped and tested on aarch64-none-linux-gnu, and tested for 
aarch64_be-none-linux-gnu via cross-compilation.
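
As a rough sketch of the effect (GIMPLE dump syntax approximate, builtin
name illustrative), the vcombine_s32 call in the testcase below goes from
an opaque target builtin call:

  c = __builtin_aarch64_combinev2si (a, b);

to a plain vector constructor that later GIMPLE passes can see through,
so vgetq_lane_s32 (c, 3) folds to a single lane read of b:

  c = {a, b};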


gcc/

* config/aarch64/aarch64-builtins.cc
(aarch64_general_gimple_fold_builtin): Add combine.

gcc/testsuite/

* gcc.target/aarch64/advsimd-intrinsics/combine.c:
New test.

---

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 5217dbdb2ac78bba0a669d22af6d769d1fe91a3d..9d52fb8c5a48c9b743defb340a85fb20a1c8f014 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -2827,6 +2827,18 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt,
gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
break;

+ BUILTIN_VDC (BINOP, combine, 0, AUTO_FP)
+ BUILTIN_VD_I (BINOPU, combine, 0, NONE)
+ BUILTIN_VDC_P (BINOPP, combine, 0, NONE)
+   {
+ if (BYTES_BIG_ENDIAN)
+   std::swap(args[0], args[1]);
+ tree ret_type = TREE_TYPE (gimple_call_lhs (stmt));
+ tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, args[0],
+   NULL_TREE, args[1]);
+ new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor);
+   }
+   break;
+
  /*lower store and load neon builtins to gimple.  */
  BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
  BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c
new file mode 100644
index 0000000000000000000000000000000000000000..d08faf7a4a160a1e83428ed9b270731bbf7b8c8a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+
+#include <arm_neon.h>
+
+/*
+** foo:
+**	umov	w0, v1\.s\[1\]
+** ret
+*/
+
+int32_t foo (int32x2_t a, int32x2_t b)
+{
+  int32x4_t c = vcombine_s32(a, b);
+  return vgetq_lane_s32(c, 3);
+}
+