[PATCH] aarch64: Fix _BitInt testcases

2024-04-11 Thread Andre Vieira (lists)

This patch fixes some testisms introduced by:

commit 5aa3fec38cc6f52285168b161bab1a869d864b44
Author: Andre Vieira 
Date:   Wed Apr 10 16:29:46 2024 +0100

aarch64: Add support for _BitInt

The testcases were relying on an unnecessary sign-extend that is no longer
generated.

The tested version was just slightly behind top of trunk when the patch 
was committed, and the codegen had changed, for the better, by then.


OK for trunk? (I am away tomorrow, so if you want this in before the 
weekend feel free to commit it on my behalf, if approved ofc...)



gcc/testsuite/ChangeLog:

* gcc.target/aarch64/bitfield-bitint-abi-align16.c (g1, g8, g16, g1p, g8p,
g16p): Remove unnecessary sbfx.
* gcc.target/aarch64/bitfield-bitint-abi-align8.c (g1, g8, g16, g1p, g8p,
g16p): Likewise.


diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c 
b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
index 
3f292a45f955d35b802a0bd789cd39d5fa7b5860..4a228b0a1ce696dc80e32305162d58f01d44051d
 100644
--- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
+++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
@@ -55,9 +55,8 @@
 ** g1:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f1
 */
@@ -66,9 +65,8 @@
 ** g8:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f8
 */
@@ -76,9 +74,8 @@
 ** g16:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f16
 */
@@ -107,9 +104,8 @@
 /*
 ** g1p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f1p
@@ -117,9 +113,8 @@
 /*
 ** g8p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f8p
@@ -128,9 +123,8 @@
 ** g16p:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f16p
 */
diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c 
b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
index 
da3c23550bae6734f69e2baf0e8db741fb65cfda..e7f773640f04f56646e5e1a5fb91280ea7e4db98
 100644
--- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
+++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
@@ -54,9 +54,8 @@
 /*
 ** g1:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f1
@@ -65,9 +64,8 @@
 /*
 ** g8:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f8
@@ -76,9 +74,8 @@
 ** g16:
 ** mov (x[0-9]+), x0
 ** mov w0, w1
-** sbfx(x[0-9]+), \1, 0, 63
-** and x4, \2, 9223372036854775807
-** and x2, \2, 1
+** and x4, \1, 9223372036854775807
+** and x2, \1, 1
 ** mov x3, 0
 ** b   f16
 */
@@ -107,9 +104,8 @@
 /*
 ** g1p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f1p
@@ -117,9 +113,8 @@
 /*
 ** g8p:
 ** mov (w[0-9]+), w1
-** sbfx(x[0-9]+), x0, 0, 63
-** and x3, \2, 9223372036854775807
-** and x1, \2, 1
+** and x3, x0, 9223372036854775807
+** and x1, x0, 1
 ** mov x2, 0
 ** mov w0, \1
 ** b   f8p
@@ -128,9 +123,8 @@
 ** g16p:
 ** mov (x[0-9]+), x0
 ** mov w0, w1

[PATCH][wwwdocs] gcc-14/changes.html: Update _BitInt to include AArch64 (little-endian)

2024-04-10 Thread Andre Vieira (lists)

Hi,

Patch to add AArch64 to the list of supported _BitInt(N) in 
gcc-14/changes.html.


OK?

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 
a7ba957110183f906938d935bfa17aaed2ba20c8..55ab8c14c6d0b54e05a5f266f25c8ef1a4f959bf
 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -216,7 +216,7 @@ a work-in-progress.
   Bit-precise integer types (_BitInt (N)
   and unsigned _BitInt (N)): integer types with
   a specified number of bits.  These are only supported on
-  IA-32/x86-64 at present.
+  IA-32/x86-64 and AArch64 (little-endian) at present.
   Structure, union and enumeration types may be defined more
   than once in the same scope with the same contents and the same
   tag; if such types are defined with the same contents and the

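As a side note (illustrative usage only, not part of the wwwdocs patch), the
types this entry describes are used like:

  unsigned _BitInt(12) tag = 0x7ffuwb;  /* uwb: unsigned bit-precise literal */
  _BitInt(200) acc = 0;                 /* wider than any standard integer type */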

Re: [PATCHv3 2/2] aarch64: Add support for _BitInt

2024-04-10 Thread Andre Vieira (lists)
Added the target check. I also had to change some of the assembly checking 
due to changes upstream; the assembly is still valid, but we do extend 
where not necessary, which I believe is a general issue though.


The _BitInt(N > 64) codegen for non-powers of 2 did get worse; we see 
similar codegen with __int128 bitfields on aarch64.
I suspect we need to improve the way we 'extend' TImode in the aarch64 
backend to be able to operate only on the affected DImode parts of it 
when relevant.  Though I also think we may need to change how _BitInt is 
currently expanded in such situations; right now it does the extension 
as two shifts.  Anyway, I did not have too much time to look deeper into this.
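
For reference, a small C sketch (an illustrative example written for this
message, not one of the testcases in this series) of the kind of
non-power-of-2 _BitInt bitfield where the extra extension shows up:

struct s
{
  _BitInt(135) x : 135;   /* non-power-of-2 width, wider than 64 bits */
};

_BitInt(135)
get (struct s *p)
{
  /* Reading the bitfield needs its partial top part extended; as discussed
     above, that extension is currently expanded as a pair of shifts rather
     than operating only on the affected DImode part.  */
  return p->x;
}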


Bootstrapped on aarch64-unknown-linux-gnu.

OK for trunk?

On 28/03/2024 15:21, Richard Sandiford wrote:

Jakub Jelinek  writes:

On Thu, Mar 28, 2024 at 03:00:46PM +, Richard Sandiford wrote:

* gcc.target/aarch64/bitint-alignments.c: New test.
* gcc.target/aarch64/bitint-args.c: New test.
* gcc.target/aarch64/bitint-sizes.c: New test.
* gcc.target/aarch64/bitfield-bitint-abi.h: New header.
* gcc.target/aarch64/bitfield-bitint-abi-align16.c: New test.
* gcc.target/aarch64/bitfield-bitint-abi-align8.c: New test.


Since we don't support big-endian yet, I assume the tests should be
conditional on aarch64_little_endian.


Perhaps better on bitint effective target, then they'll become available
automatically as soon as big endian aarch64 _BitInt support is turned on.


Ah, yeah, good point.

Richard

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
81400cc666472ffeff40df14e98ae00ebc774d31..c0af4ef151a8c46f78c0c3a43c2ab1318a3f610a
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6583,6 +6583,7 @@ aarch64_return_in_memory_1 (const_tree type)
   int count;
 
   if (!AGGREGATE_TYPE_P (type)
+  && TREE_CODE (type) != BITINT_TYPE
   && TREE_CODE (type) != COMPLEX_TYPE
   && TREE_CODE (type) != VECTOR_TYPE)
 /* Simple scalar types always returned in registers.  */
@@ -21996,6 +21997,11 @@ aarch64_composite_type_p (const_tree type,
   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
 return true;
 
+  if (type
+  && TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return true;
+
   if (mode == BLKmode
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
@@ -28477,6 +28483,42 @@ aarch64_excess_precision (enum excess_precision_type 
type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+   Return true if _BitInt(N) is supported and fill its details into *INFO.  */
+bool
+aarch64_bitint_type_info (int n, struct bitint_info *info)
+{
+  if (TARGET_BIG_END)
+return false;
+
+  if (n <= 8)
+info->limb_mode = QImode;
+  else if (n <= 16)
+info->limb_mode = HImode;
+  else if (n <= 32)
+info->limb_mode = SImode;
+  else if (n <= 64)
+info->limb_mode = DImode;
+  else if (n <= 128)
+info->limb_mode = TImode;
+  else
+/* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
+   type {signed,unsigned} __int128[M] where M*128 >= N.  However, to be
+   able to use libgcc's implementation to support large _BitInt's we need
+   to use a LIMB_MODE that is no larger than 'long long'.  This is why we
+   use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
+   be TImode to ensure we are ABI compliant.  */
+info->limb_mode = DImode;
+
+  if (n > 128)
+info->abi_limb_mode = TImode;
+  else
+info->abi_limb_mode = info->limb_mode;
+  info->big_endian = TARGET_BIG_END;
+  info->extended = false;
+  return true;
+}
+
 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
scheduled for speculative execution.  Reject the long-running division
and square-root instructions.  */
@@ -30601,6 +30643,9 @@ aarch64_run_selftests (void)
 #undef TARGET_C_EXCESS_PRECISION
 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
 
+#undef TARGET_C_BITINT_TYPE_INFO
+#define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c 
b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
new file mode 100644
index 
..3f292a45f955d35b802a0bd789cd39d5fa7b5860
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
@@ -0,0 +1,384 @@
+/* { dg-do compile { target bitint } } */
+/* { dg-additional-options "-std=c23 -O2 -fno-stack-protector -save-temps 
-fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define ALIGN 16
+#include "bitfield-bitint-abi.h"
+
+// f1-f16 are all the same
+

Re: [PATCHv2 1/2] aarch64: Do not give ABI change diagnostics for _BitInt(N)

2024-04-10 Thread Andre Vieira (lists)

Hey,

Added the warn_pcs_change_le_gcc14 variable and changed the uses of 
warn_pcs_change to use this new variable.
Also fixed an issue with the loop through TREE_FIELDS to avoid an ICE 
during bootstrap.
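
For reference, a minimal sketch (using the variable name suggested in the
review quoted below; the exact committed hunks differ) of how the new gate
combines with the existing warn_pcs_change:

  /* _BitInt(N) types only exist from GCC 14 onwards, so the GCC 9/13/14
     ABI-break notes do not apply to them.  */
  bool warn_pcs_change_le_gcc14
    = (warn_pcs_change && !bitint_or_aggr_of_bitint_p (type));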


OK for trunk?

Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

Kind regards,
Andre

On 28/03/2024 12:54, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:

This patch makes sure we do not give ABI change diagnostics for the ABI
breaks of GCC 9, 13 and 14 for any type involving _BitInt(N), since that
type did not exist before this GCC version.

ChangeLog:

* config/aarch64/aarch64.cc (bitint_or_aggr_of_bitint_p): New function.
(aarch64_layout_arg): Don't emit diagnostics for types involving
_BitInt(N).

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
1ea84c8bd7386e399f6ffa3a5e36408cf8831fc6..b68cf3e7cb9a6fa89b4e5826a39ffa11f64ca20a
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6744,6 +6744,33 @@ aarch64_function_arg_alignment (machine_mode mode, 
const_tree type,
return alignment;
  }
  
+/* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the

+   _BitInt(N) type.  These include ARRAY_TYPE's with an element that is a
+   _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
+   with a field member that is a _BitInt(N) or an aggregate that uses it.
+   Return false otherwise.  */
+
+static bool
+bitint_or_aggr_of_bitint_p (tree type)
+{
+  if (!type)
+return false;
+
+  if (TREE_CODE (type) == BITINT_TYPE)
+return true;
+
+  /* If ARRAY_TYPE, check its element type.  */
+  if (TREE_CODE (type) == ARRAY_TYPE)
+return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
+
+  /* If RECORD_TYPE or UNION_TYPE, check the fields' types.  */
+  if (RECORD_OR_UNION_TYPE_P (type))
+for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
+  if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
+   return true;
+  return false;
+}
+
  /* Layout a function argument according to the AAPCS64 rules.  The rule
 numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
 mode that was originally given to us by the target hook, whereas the
@@ -6767,12 +6794,6 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const 
function_arg_info &arg)
if (pcum->aapcs_arg_processed)
  return;
  
-  bool warn_pcs_change

-= (warn_psabi
-   && !pcum->silent_p
-   && (currently_expanding_function_start
-  || currently_expanding_gimple_stmt));
-
/* HFAs and HVAs can have an alignment greater than 16 bytes.  For example:
  
 typedef struct foo {

@@ -6907,6 +6928,18 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const 
function_arg_info &arg)
  && (!alignment || abi_break_gcc_9 < alignment)
  && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
  
+

+  bool warn_pcs_change
+= (warn_psabi
+   && !pcum->silent_p
+   && (currently_expanding_function_start
+  || currently_expanding_gimple_stmt)
+  /* warn_pcs_change is currently used to gate diagnostics in case of
+abi_break_gcc_{9,13,14}.  These however, do not apply to _BitInt(N)
+types as they were only introduced in GCC 14.  */
+   && (!type || !bitint_or_aggr_of_bitint_p (type)));


How about making this a new variable such as:

   /* _BitInt(N) was only added in GCC 14.  */
   bool warn_pcs_change_le_gcc14
 = (warn_psabi && !bitint_or_aggr_of_bitint_p (type));

(and keeping warn_pcs_change where it is).  In principle, warn_pcs_change
is meaningful for any future ABI breaks, and we might forget that it
excludes bitints.  The name is just a suggestion.

OK with that change, thanks.

Richard


+
+
/* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
   The following code thus handles passing by SIMD/FP registers first.  */
  
@@ -21266,19 +21299,25 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,

rsize = ROUND_UP (size, UNITS_PER_WORD);
nregs = rsize / UNITS_PER_WORD;
  
-  if (align <= 8 && abi_break_gcc_13 && warn_psabi)

+  if (align <= 8
+ && abi_break_gcc_13
+ && warn_psabi
+ && !bitint_or_aggr_of_bitint_p (type))
inform (input_location, "parameter passing for argument of type "
"%qT changed in GCC 13.1", type);
  
if (warn_psabi

  && abi_break_gcc_14
- && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8))
+ && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
+ && !bitint_or_aggr_of_bitint_p (type))
inform (input_location, "parameter passing for argu

[PATCHv2 2/2] aarch64: Add support for _BitInt

2024-03-27 Thread Andre Vieira (lists)
This patch adds support for C23's _BitInt for the AArch64 port when 
compiling for little endianness.  Big endianness requires further 
target-agnostic support and we therefore disable it for now.


The tests expose some suboptimal codegen for which I'll create PR's for 
optimizations after this goes in.


gcc/ChangeLog:

* config/aarch64/aarch64.cc (TARGET_C_BITINT_TYPE_INFO): Declare MACRO.
(aarch64_bitint_type_info): New function.
(aarch64_return_in_memory_1): Return large _BitInt's in memory.
(aarch64_function_arg_alignment): Adapt to correctly return the ABI
mandated alignment of _BitInt(N) where N > 128 as the alignment of
TImode.
(aarch64_composite_type_p): Return true for _BitInt(N), where N > 128.

libgcc/ChangeLog:

* config/aarch64/t-softfp (softfp_extras): Add floatbitinthf,
floatbitintbf, floatbitinttf and fixtfbitint.
* config/aarch64/libgcc-softfp.ver (GCC_14.0.0): Add __floatbitinthf,
__floatbitintbf, __floatbitinttf and __fixtfbitint.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/bitint-alignments.c: New test.
* gcc.target/aarch64/bitint-args.c: New test.
* gcc.target/aarch64/bitint-sizes.c: New test.
* gcc.target/aarch64/bitfield-bitint-abi.h: New header.
* gcc.target/aarch64/bitfield-bitint-abi-align16.c: New test.
* gcc.target/aarch64/bitfield-bitint-abi-align8.c: New test.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
b68cf3e7cb9a6fa89b4e5826a39ffa11f64ca20a..5fe55c6e980bc1ea66df0e4357932123cd049366
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6583,6 +6583,7 @@ aarch64_return_in_memory_1 (const_tree type)
   int count;
 
   if (!AGGREGATE_TYPE_P (type)
+  && TREE_CODE (type) != BITINT_TYPE
   && TREE_CODE (type) != COMPLEX_TYPE
   && TREE_CODE (type) != VECTOR_TYPE)
 /* Simple scalar types always returned in registers.  */
@@ -21991,6 +21992,11 @@ aarch64_composite_type_p (const_tree type,
   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
 return true;
 
+  if (type
+  && TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return true;
+
   if (mode == BLKmode
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
@@ -28472,6 +28478,42 @@ aarch64_excess_precision (enum excess_precision_type 
type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+   Return true if _BitInt(N) is supported and fill its details into *INFO.  */
+bool
+aarch64_bitint_type_info (int n, struct bitint_info *info)
+{
+  if (TARGET_BIG_END)
+return false;
+
+  if (n <= 8)
+info->limb_mode = QImode;
+  else if (n <= 16)
+info->limb_mode = HImode;
+  else if (n <= 32)
+info->limb_mode = SImode;
+  else if (n <= 64)
+info->limb_mode = DImode;
+  else if (n <= 128)
+info->limb_mode = TImode;
+  else
+/* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
+   type {signed,unsigned} __int128[M] where M*128 >= N.  However, to be
+   able to use libgcc's implementation to support large _BitInt's we need
+   to use a LIMB_MODE that is no larger than 'long long'.  This is why we
+   use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
+   be TImode to ensure we are ABI compliant.  */
+info->limb_mode = DImode;
+
+  if (n > 128)
+info->abi_limb_mode = TImode;
+  else
+info->abi_limb_mode = info->limb_mode;
+  info->big_endian = TARGET_BIG_END;
+  info->extended = false;
+  return true;
+}
+
 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
scheduled for speculative execution.  Reject the long-running division
and square-root instructions.  */
@@ -30596,6 +30638,9 @@ aarch64_run_selftests (void)
 #undef TARGET_C_EXCESS_PRECISION
 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
 
+#undef TARGET_C_BITINT_TYPE_INFO
+#define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c 
b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
new file mode 100644
index 
..048d04e4c1bf90215892aa0173f6246a097d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
@@ -0,0 +1,378 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-stack-protector -save-temps -fno-schedule-insns 
-fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#define ALIGN 16
+#include "bitfield-bitint-abi.h"
+
+// f1-f16 are all the same
+
+/*
+** f1:
+** and x0, x2, 1
+** ret
+*/
+/*
+** f8:
+** and x0, x2, 1
+** ret
+*/
+/*
+** f16:
+** and x0, x2, 1

[PATCHv2 1/2] aarch64: Do not give ABI change diagnostics for _BitInt(N)

2024-03-27 Thread Andre Vieira (lists)
This patch makes sure we do not give ABI change diagnostics for the ABI 
breaks of GCC 9, 13 and 14 for any type involving _BitInt(N), since that 
type did not exist before this GCC version.


ChangeLog:

* config/aarch64/aarch64.cc (bitint_or_aggr_of_bitint_p): New function.
(aarch64_layout_arg): Don't emit diagnostics for types involving
_BitInt(N).

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
1ea84c8bd7386e399f6ffa3a5e36408cf8831fc6..b68cf3e7cb9a6fa89b4e5826a39ffa11f64ca20a
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6744,6 +6744,33 @@ aarch64_function_arg_alignment (machine_mode mode, 
const_tree type,
   return alignment;
 }
 
+/* Return true if TYPE describes a _BitInt(N) or an aggregate that uses the
+   _BitInt(N) type.  These include ARRAY_TYPE's with an element that is a
+   _BitInt(N) or an aggregate that uses it, and a RECORD_TYPE or a UNION_TYPE
+   with a field member that is a _BitInt(N) or an aggregate that uses it.
+   Return false otherwise.  */
+
+static bool
+bitint_or_aggr_of_bitint_p (tree type)
+{
+  if (!type)
+return false;
+
+  if (TREE_CODE (type) == BITINT_TYPE)
+return true;
+
+  /* If ARRAY_TYPE, check its element type.  */
+  if (TREE_CODE (type) == ARRAY_TYPE)
+return bitint_or_aggr_of_bitint_p (TREE_TYPE (type));
+
+  /* If RECORD_TYPE or UNION_TYPE, check the fields' types.  */
+  if (RECORD_OR_UNION_TYPE_P (type))
+for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
+  if (bitint_or_aggr_of_bitint_p (TREE_TYPE (field)))
+   return true;
+  return false;
+}
+
 /* Layout a function argument according to the AAPCS64 rules.  The rule
numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
mode that was originally given to us by the target hook, whereas the
@@ -6767,12 +6794,6 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const 
function_arg_info &arg)
   if (pcum->aapcs_arg_processed)
 return;
 
-  bool warn_pcs_change
-= (warn_psabi
-   && !pcum->silent_p
-   && (currently_expanding_function_start
-  || currently_expanding_gimple_stmt));
-
   /* HFAs and HVAs can have an alignment greater than 16 bytes.  For example:
 
typedef struct foo {
@@ -6907,6 +6928,18 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const 
function_arg_info &arg)
  && (!alignment || abi_break_gcc_9 < alignment)
  && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
 
+
+  bool warn_pcs_change
+= (warn_psabi
+   && !pcum->silent_p
+   && (currently_expanding_function_start
+  || currently_expanding_gimple_stmt)
+  /* warn_pcs_change is currently used to gate diagnostics in case of
+abi_break_gcc_{9,13,14}.  These however, do not apply to _BitInt(N)
+types as they were only introduced in GCC 14.  */
+   && (!type || !bitint_or_aggr_of_bitint_p (type)));
+
+
   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
  The following code thus handles passing by SIMD/FP registers first.  */
 
@@ -21266,19 +21299,25 @@ aarch64_gimplify_va_arg_expr (tree valist, tree type, 
gimple_seq *pre_p,
   rsize = ROUND_UP (size, UNITS_PER_WORD);
   nregs = rsize / UNITS_PER_WORD;
 
-  if (align <= 8 && abi_break_gcc_13 && warn_psabi)
+  if (align <= 8
+ && abi_break_gcc_13
+ && warn_psabi
+ && !bitint_or_aggr_of_bitint_p (type))
inform (input_location, "parameter passing for argument of type "
"%qT changed in GCC 13.1", type);
 
   if (warn_psabi
  && abi_break_gcc_14
- && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8))
+ && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8)
+ && !bitint_or_aggr_of_bitint_p (type))
inform (input_location, "parameter passing for argument of type "
"%qT changed in GCC 14.1", type);
 
   if (align > 8)
{
- if (abi_break_gcc_9 && warn_psabi)
+ if (abi_break_gcc_9
+ && warn_psabi
+ && !bitint_or_aggr_of_bitint_p (type))
inform (input_location, "parameter passing for argument of type "
"%qT changed in GCC 9.1", type);
  dw_align = true;


[PATCHv2 0/2] aarch64, bitint: Add support for _BitInt for AArch64 Little Endian

2024-03-27 Thread Andre Vieira (lists)

Hi,

Introduced a new patch to disable diagnostics for ABI breaks involving 
_BitInt(N), given the type didn't exist before GCC 14; let me know what you 
think of that.


Also added further testing to replicate the ABI diagnostic tests to use 
_BitInt(N).


Andre Vieira (2)
aarch64: Do not give ABI change diagnostics for _BitInt(N)
aarch64: Add support for _BitInt



Backport PR91838 and PR110838

2024-03-25 Thread Andre Vieira (lists)

Hi,

After the backport of PR target/112787, a failure was reported against 
x86_64; this would be fixed by backporting:
* tree-optimization/91838 - fix FAIL of g++.dg/opt/pr91838.C 
(d1c072a1c3411a6fe29900750b38210af8451eeb)
* tree-optimization/110838 - less aggressively fold out-of-bound shifts 
(04aa0edcace22a7815cfc57575f1f7b1f166ac10)


Patches apply cleanly, just one minor git context conflict with includes.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu for gcc-12 and gcc-13 branches.


OK to backport?

Kind regards,
Andre


Re: [PATCH] testsuite: Fix fallout of turning warnings into errors on 32-bit Arm

2024-03-01 Thread Andre Vieira (lists)

Hi Thiago,

Thanks for this, LGTM but I can't approve this, CC'ing Richard.

I do have a nitpick: in the gcc/testsuite ChangeLog, remove 'gcc/testsuite' 
from bullet points 2-4.
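
In other words, a sketch of how those entries should read (paths relative to
gcc/testsuite/):

	* gcc.target/arm/pr59858.c: Add -fpermissive.
	* gcc.target/arm/pr65647.c: Likewise.
	* gcc.target/arm/pr65710.c: Likewise.
	* gcc.target/arm/pr97969.c: Likewise.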


Kind regards,
Andre

On 13/01/2024 00:55, Thiago Jung Bauermann wrote:

Since commits 2c3db94d9fd ("c: Turn int-conversion warnings into
permerrors") and 55e94561e97e ("c: Turn -Wimplicit-function-declaration
into a permerror") these tests fail with errors such as:

   FAIL: gcc.target/arm/pr59858.c (test for excess errors)
   FAIL: gcc.target/arm/pr65647.c (test for excess errors)
   FAIL: gcc.target/arm/pr65710.c (test for excess errors)
   FAIL: gcc.target/arm/pr97969.c (test for excess errors)

Here's one example of the excess errors:

   FAIL: gcc.target/arm/pr65647.c (test for excess errors)
   Excess errors:
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:6:17: error: 
initialization of 'int' from 'int *' makes integer from pointer without a cast 
[-Wint-conversion]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:6:51: error: 
initialization of 'int' from 'int *' makes integer from pointer without a cast 
[-Wint-conversion]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:6:62: error: 
initialization of 'int' from 'int *' makes integer from pointer without a cast 
[-Wint-conversion]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:7:48: error: 
initialization of 'int' from 'int *' makes integer from pointer without a cast 
[-Wint-conversion]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:8:9: error: 
initialization of 'int' from 'int *' makes integer from pointer without a cast 
[-Wint-conversion]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:24:5: error: 
initialization of 'int' from 'int *' makes integer from pointer without a cast 
[-Wint-conversion]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:25:5: error: 
initialization of 'int' from 'struct S1 *' makes integer from pointer without a 
cast [-Wint-conversion]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:41:3: error: implicit 
declaration of function 'fn3'; did you mean 'fn2'? 
[-Wimplicit-function-declaration]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:46:3: error: implicit 
declaration of function 'fn5'; did you mean 'fn4'? 
[-Wimplicit-function-declaration]
   /path/gcc.git/gcc/testsuite/gcc.target/arm/pr65647.c:57:16: error: implicit 
declaration of function 'fn6'; did you mean 'fn4'? 
[-Wimplicit-function-declaration]

PR rtl-optimization/59858 and PR target/65710 test the fix of an ICE.
PR target/65647 and PR target/97969 test for a compilation infinite loop.

Therefore, add -fpermissive so that the tests behave as they did previously.
Tested on armv8l-linux-gnueabihf.

gcc/testsuite/ChangeLog:
* gcc.target/arm/pr59858.c: Add -fpermissive.
* gcc/testsuite/gcc.target/arm/pr65647.c: Likewise.
* gcc/testsuite/gcc.target/arm/pr65710.c: Likewise.
* gcc/testsuite/gcc.target/arm/pr97969.c: Likewise.
---
  gcc/testsuite/gcc.target/arm/pr59858.c | 2 +-
  gcc/testsuite/gcc.target/arm/pr65647.c | 2 +-
  gcc/testsuite/gcc.target/arm/pr65710.c | 2 +-
  gcc/testsuite/gcc.target/arm/pr97969.c | 2 +-
  4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/pr59858.c 
b/gcc/testsuite/gcc.target/arm/pr59858.c
index 3360b48e8586..9336edfce277 100644
--- a/gcc/testsuite/gcc.target/arm/pr59858.c
+++ b/gcc/testsuite/gcc.target/arm/pr59858.c
@@ -1,5 +1,5 @@
  /* { dg-do compile } */
-/* { dg-options "-march=armv5te -fno-builtin -mfloat-abi=soft -mthumb 
-fno-stack-protector -Os -fno-tree-loop-optimize -fno-tree-dominator-opts -fPIC -w" 
} */
+/* { dg-options "-march=armv5te -fno-builtin -mfloat-abi=soft -mthumb 
-fno-stack-protector -Os -fno-tree-loop-optimize -fno-tree-dominator-opts -fPIC -w 
-fpermissive" } */
  /* { dg-require-effective-target fpic } */
  /* { dg-skip-if "Incompatible command line options: -mfloat-abi=soft -mfloat-abi=hard" { *-*-* } 
{ "-mfloat-abi=hard" } { "" } } */
  /* { dg-require-effective-target arm_arch_v5te_thumb_ok } */
diff --git a/gcc/testsuite/gcc.target/arm/pr65647.c 
b/gcc/testsuite/gcc.target/arm/pr65647.c
index 26b4e399f6be..3cbf6b804ec0 100644
--- a/gcc/testsuite/gcc.target/arm/pr65647.c
+++ b/gcc/testsuite/gcc.target/arm/pr65647.c
@@ -1,7 +1,7 @@
  /* { dg-do compile } */
  /* { dg-require-effective-target arm_arch_v6m_ok } */
  /* { dg-skip-if "do not override -mfloat-abi" { *-*-* } { "-mfloat-abi=*" } 
{"-mfloat-abi=soft" } } */
-/* { dg-options "-march=armv6-m -mthumb -O3 -w -mfloat-abi=soft" } */
+/* { dg-options "-march=armv6-m -mthumb -O3 -w -mfloat-abi=soft -fpermissive" 
} */
  
  a, b, c, e, g = , h, i = 7, l = 1, m, n, o, q = , r, s = , u, w = 9, x,

y = 6, z, t6 = 7, t8, t9 = 1, t11 = 5, t12 = , t13 = 3, t15,
diff --git a/gcc/testsuite/gcc.target/arm/pr65710.c 
b/gcc/testsuite/gcc.target/arm/pr65710.c
index 103ce1d45f77..4cbf7817af7e 100644
--- 

Re: [PATCH] tree-optimization/110221 - SLP and loop mask/len

2024-03-01 Thread Andre Vieira (lists)

Hi,

Bootstrapped and tested the gcc-13 backport of this on gcc-12 for 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu with no regressions.


OK to push to gcc-12 branch?

Kind regards,
Andre Vieira

On 10/11/2023 13:16, Richard Biener wrote:

The following fixes the issue that when SLP stmts are internal defs
but appear invariant because they end up only using invariant defs
then they get scheduled outside of the loop.  This nice optimization
breaks down when loop masks or lens are applied since those are not
explicitly tracked as dependences.  The following makes sure to never
schedule internal defs outside of the vectorized loop when the
loop uses masks/lens.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/110221
* tree-vect-slp.cc (vect_schedule_slp_node): When loop
masking / len is applied make sure to not schedule
internal defs outside of the loop.

* gfortran.dg/pr110221.f: New testcase.
---
  gcc/testsuite/gfortran.dg/pr110221.f | 17 +
  gcc/tree-vect-slp.cc | 10 ++
  2 files changed, 27 insertions(+)
  create mode 100644 gcc/testsuite/gfortran.dg/pr110221.f

diff --git a/gcc/testsuite/gfortran.dg/pr110221.f 
b/gcc/testsuite/gfortran.dg/pr110221.f
new file mode 100644
index 000..8b57384313a
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr110221.f
@@ -0,0 +1,17 @@
+C PR middle-end/68146
+C { dg-do compile }
+C { dg-options "-O2 -w" }
+C { dg-additional-options "-mavx512f --param vect-partial-vector-usage=2" { 
target avx512f } }
+  SUBROUTINE CJYVB(V,Z,V0,CBJ,CDJ,CBY,CYY)
+  IMPLICIT DOUBLE PRECISION (A,B,G,O-Y)
+  IMPLICIT COMPLEX*16 (C,Z)
+  DIMENSION CBJ(0:*),CDJ(0:*),CBY(0:*)
+  N=INT(V)
+  CALL GAMMA2(VG,GA)
+  DO 65 K=1,N
+CBY(K)=CYY
+65CONTINUE
+  CDJ(0)=V0/Z*CBJ(0)-CBJ(1)
+  DO 70 K=1,N
+70  CDJ(K)=-(K+V0)/Z*CBJ(K)+CBJ(K-1)
+  END
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 3e5814c3a31..80e279d8f50 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -9081,6 +9081,16 @@ vect_schedule_slp_node (vec_info *vinfo,
/* Emit other stmts after the children vectorized defs which is
 earliest possible.  */
gimple *last_stmt = NULL;
+  if (auto loop_vinfo = dyn_cast  (vinfo))
+   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+   || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
+ {
+   /* But avoid scheduling internal defs outside of the loop when
+  we might have only implicitly tracked loop mask/len defs.  */
+   gimple_stmt_iterator si
+ = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
+   last_stmt = *si;
+ }
bool seen_vector_def = false;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)


Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-02-28 Thread Andre Vieira (lists)



On 27/02/2024 08:47, Richard Biener wrote:

On Mon, 26 Feb 2024, Andre Vieira (lists) wrote:




On 05/02/2024 09:56, Richard Biener wrote:

On Thu, 1 Feb 2024, Andre Vieira (lists) wrote:




On 01/02/2024 07:19, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:


The patch didn't come with a testcase so it's really hard to tell
what goes wrong now and how it is fixed ...


My bad! I had a testcase locally but never added it...

However... now I look at it and ran it past Richard S, the codegen isn't
'wrong', but it does have the potential to lead to some pretty slow codegen,
especially for inbranch simdclones where it transforms the SVE predicate
into an Advanced SIMD vector by inserting the elements one at a time...

An example of which can be seen if you do:

gcc -O3 -march=armv8-a+sve -msve-vector-bits=128  -fopenmp-simd t.c -S

with the following t.c:
#pragma omp declare simd simdlen(4) inbranch
int __attribute__ ((const)) fn5(int);

void fn4 (int *a, int *b, int n)
{
  for (int i = 0; i < n; ++i)
  b[i] = fn5(a[i]);
}

Now I do have to say, for our main usecase of libmvec we won't have any
'inbranch' Advanced SIMD clones, so we avoid that issue... But of course
that doesn't mean user-code will.


It seems to use SVE masks with vector(4)  and the
ABI says the mask is vector(4) int.  You say that's because we choose
a Adv SIMD clone for the SVE VLS vector code (it calls _ZGVnM4v_fn5).

The vectorizer creates

_44 = VEC_COND_EXPR ;

and then vector lowering decomposes this.  That means the vectorizer
lacks a check that the target handles this VEC_COND_EXPR.

Of course I would expect that SVE with VLS vectors is able to
code generate this operation, so it's missing patterns in the end.

Richard.



What should we do for GCC 14? Going forward I think the right thing to do is
to add these patterns. But I am not even going to try to do that right now;
even though we can codegen for this, the result doesn't feel like it would
ever be profitable, which means I'd rather not vectorize, or pick a
different vector mode if possible.

This would be achieved with the change to the targethook. If I change the hook
to take modes, using STMT_VINFO_VECTYPE (stmt_vinfo), is that OK for now?


Passing in a mode is OK.  I'm still not fully understanding why the
clone isn't fully specifying 'mode' and if it does not why the
vectorizer itself can not disregard it.



We could check that the modes of the parameters & return type are the 
same as the vector operands & result in the vectorizer. But then we'd 
also want to make sure we don't reject cases where we have simdclones 
with compatible modes, aka same element type, but a multiple element 
count.  Which is where we'd get in trouble again I think, because we'd 
want to accept V8SI -> 2x V4SI, but not V8SI -> 2x VNx4SI (with VLS and 
aarch64_sve_vg = 2), not because it's invalid, but because right now the 
codegen is bad. And it's easier to do this in the targethook, which we 
can technically also use to 'rank' simdclones by setting a 
target_badness value, so in the future we could decide to assign some 
'badness' to influence the rank of an SVE simdclone for Advanced SIMD 
loops vs an Advanced SIMD clone for Advanced SIMD loops.
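
As a rough sketch (assuming the hook simply gains the vector mode as an extra
argument; names and the exact committed form may differ), the aarch64
implementation of such a check could look like:

static int
aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
{
  switch (node->simdclone->vecsize_mangle)
    {
    case 'n':
      /* Advanced SIMD clone: not usable without TARGET_SIMD, and not usable
	 when the loop is being vectorized with SVE (including VLS SVE)
	 modes.  */
      if (!TARGET_SIMD)
	return -1;
      if (aarch64_sve_mode_p (vector_mode))
	return -1;
      return 0;
    default:
      gcc_unreachable ();
    }
}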


This does touch another issue of simdclone costing, which is a larger 
issue in general and one we (arm) might want to approach in the future. 
It's a complex issue, because the vectorizer doesn't know the 
performance impact of a simdclone, we assume (as we should) that it's 
faster than the original scalar, though we currently don't record costs 
for either, but we don't know by how much or how much impact it has, so 
the vectorizer can't reason whether it's beneficial to use a simdclone 
if it has to do a lot of operand preparation, we can merely tell it to 
use it, or not and all the other operations in the loop will determine 
costing.




 From the past discussion I understood the existing situation isn't
as bad as initially thought and no bad things happen right now?
Nope, I thought the compiler would fall apart, but it seems to be able 
to transform the operands from one mode into the other, so without the 
targethook it just generates slower loops in certain cases, which we'd 
rather avoid given the usecase for simdclones is to speed things up ;)



Attached reworked patch.


This patch adds a machine_mode argument to TARGET_SIMD_CLONE_USABLE to 
make sure the target can reject a simd_clone based on the vector mode it 
is using.  This is needed because, for VLS SVE vectorization, the 
vectorizer accepts Advanced SIMD simd clones when vectorizing using SVE 
types since the simdlens might match; this currently leads to 
suboptimal codegen.


Other targets do not currently need to use this argument.

gcc/ChangeLog:

* target.def (TARGET_SIMD_CLONE_USABLE): Add argument.
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Pass vector_mod

Re: [PATCH 2/2] aarch64: Add support for _BitInt

2024-02-27 Thread Andre Vieira (lists)

Hey,

Dropped the first patch and dealt with the comments above, hopefully I 
didn't miss any this time.


--

This patch adds support for C23's _BitInt for the AArch64 port when 
compiling for little endianness.  Big endianness requires further 
target-agnostic support and we therefore disable it for now.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (TARGET_C_BITINT_TYPE_INFO): Declare MACRO.
(aarch64_bitint_type_info): New function.
(aarch64_return_in_memory_1): Return large _BitInt's in memory.
(aarch64_function_arg_alignment): Adapt to correctly return the ABI
mandated alignment of _BitInt(N) where N > 128 as the alignment of
TImode.
(aarch64_composite_type_p): Return true for _BitInt(N), where N > 128.

libgcc/ChangeLog:

* config/aarch64/t-softfp (softfp_extras): Add floatbitinthf,
floatbitintbf, floatbitinttf and fixtfbitint.
* config/aarch64/libgcc-softfp.ver (GCC_14.0.0): Add __floatbitinthf,
__floatbitintbf, __floatbitinttf and __fixtfbitint.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/bitint-alignments.c: New test.
* gcc.target/aarch64/bitint-args.c: New test.
* gcc.target/aarch64/bitint-sizes.c: New test.


On 02/02/2024 14:46, Jakub Jelinek wrote:

On Thu, Jan 25, 2024 at 05:45:01PM +, Andre Vieira wrote:

This patch adds support for C23's _BitInt for the AArch64 port when compiling
for little endianness.  Big endianness requires further target-agnostic
support and we therefore disable it for now.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (TARGET_C_BITINT_TYPE_INFO): Declare MACRO.
(aarch64_bitint_type_info): New function.
(aarch64_return_in_memory_1): Return large _BitInt's in memory.
(aarch64_function_arg_alignment): Adapt to correctly return the ABI
mandated alignment of _BitInt(N) where N > 128 as the alignment of
TImode.
(aarch64_composite_type_p): Return true for _BitInt(N), where N > 128.

libgcc/ChangeLog:

* config/aarch64/t-softfp: Add fixtfbitint, floatbitinttf and
floatbitinthf to the softfp_extras variable to ensure the
runtime support is available for _BitInt.


I think this lacks some config/aarch64/t-whatever.ver
additions.
See PR113700 for some more details.
We want the support routines for binary floating point <-> _BitInt
conversions in both libgcc.a and libgcc_s.so.1 and exported from the latter
too at GCC_14.0.0 symver, while decimal floating point <-> _BitInt solely in
libgcc.a (as with all the huge dfp/bid stuff).

Jakub
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
16318bf925883ecedf9345e53fc0824a553b2747..9bd8d22f6edd9f6c77907ec383f9e8bf055cfb8b
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6583,6 +6583,7 @@ aarch64_return_in_memory_1 (const_tree type)
   int count;
 
   if (!AGGREGATE_TYPE_P (type)
+  && TREE_CODE (type) != BITINT_TYPE
   && TREE_CODE (type) != COMPLEX_TYPE
   && TREE_CODE (type) != VECTOR_TYPE)
 /* Simple scalar types always returned in registers.  */
@@ -21895,6 +21896,11 @@ aarch64_composite_type_p (const_tree type,
   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
 return true;
 
+  if (type
+  && TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return true;
+
   if (mode == BLKmode
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
@@ -28400,6 +28406,42 @@ aarch64_excess_precision (enum excess_precision_type 
type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+   Return true if _BitInt(N) is supported and fill its details into *INFO.  */
+bool
+aarch64_bitint_type_info (int n, struct bitint_info *info)
+{
+  if (TARGET_BIG_END)
+return false;
+
+  if (n <= 8)
+info->limb_mode = QImode;
+  else if (n <= 16)
+info->limb_mode = HImode;
+  else if (n <= 32)
+info->limb_mode = SImode;
+  else if (n <= 64)
+info->limb_mode = DImode;
+  else if (n <= 128)
+info->limb_mode = TImode;
+  else
+/* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
+   type {signed,unsigned} __int128[M] where M*128 >= N.  However, to be
+   able to use libgcc's implementation to support large _BitInt's we need
+   to use a LIMB_MODE that is no larger than 'long long'.  This is why we
+   use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
+   be TImode to ensure we are ABI compliant.  */
+info->limb_mode = DImode;
+
+  if (n > 128)
+info->abi_limb_mode = TImode;
+  else
+info->abi_limb_mode = info->limb_mode;
+  info->big_endian = TARGET_BIG_END;
+  info->extended = false;
+  return true;
+}
+
 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
scheduled for speculative 

Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-02-26 Thread Andre Vieira (lists)




On 05/02/2024 09:56, Richard Biener wrote:

On Thu, 1 Feb 2024, Andre Vieira (lists) wrote:




On 01/02/2024 07:19, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:


The patch didn't come with a testcase so it's really hard to tell
what goes wrong now and how it is fixed ...


My bad! I had a testcase locally but never added it...

However... now I look at it and ran it past Richard S, the codegen isn't
'wrong', but it does have the potential to lead to some pretty slow codegen,
especially for inbranch simdclones where it transforms the SVE predicate into
an Advanced SIMD vector by inserting the elements one at a time...

An example of which can be seen if you do:

gcc -O3 -march=armv8-a+sve -msve-vector-bits=128  -fopenmp-simd t.c -S

with the following t.c:
#pragma omp declare simd simdlen(4) inbranch
int __attribute__ ((const)) fn5(int);

void fn4 (int *a, int *b, int n)
{
 for (int i = 0; i < n; ++i)
 b[i] = fn5(a[i]);
}

Now I do have to say, for our main usecase of libmvec we won't have any
'inbranch' Advanced SIMD clones, so we avoid that issue... But of course that
doesn't mean user-code will.


It seems to use SVE masks with vector(4)  and the
ABI says the mask is vector(4) int.  You say that's because we choose
a Adv SIMD clone for the SVE VLS vector code (it calls _ZGVnM4v_fn5).

The vectorizer creates

   _44 = VEC_COND_EXPR ;

and then vector lowering decomposes this.  That means the vectorizer
lacks a check that the target handles this VEC_COND_EXPR.

Of course I would expect that SVE with VLS vectors is able to
code generate this operation, so it's missing patterns in the end.

Richard.



What should we do for GCC 14? Going forward I think the right thing to 
do is to add these patterns. But I am not even going to try to do that 
right now; even though we can codegen for this, the result doesn't 
feel like it would ever be profitable, which means I'd rather not 
vectorize, or pick a different vector mode if possible.


This would be achieved with the change to the targethook. If I change 
the hook to take modes, using STMT_VINFO_VECTYPE (stmt_vinfo), is that 
OK for now?


Kind regards,
Andre


Re: [committed] bitint: Fix testism where __seg_gs was being used for all targets

2024-02-19 Thread Andre Vieira (lists)




On 19/02/2024 16:17, Jakub Jelinek wrote:

On Mon, Feb 19, 2024 at 04:13:29PM +, Andre Vieira (lists) wrote:

Replaced uses of __seg_gs with the MACRO SEG defined in the testcase to pick
(if any) the right __seg_{gs,fs} keyword based on target.

gcc/testsuite/ChangeLog:

* gcc.dg/bitint-86.c (__seg_gs): Replace with SEG MACRO.


ChangeLog should be
* gcc.dg/bitint-86.c (foo, bar, baz): Replace __seg_gs with SEG.
Otherwise, LGTM.
Sorry for forgetting to do that myself.


Jakub



That makes sense ... but I already pushed it upstream; I thought it was 
obvious. Apologies for the ChangeLog mistake :(


[committed] bitint: Fix testism where __seg_gs was being used for all targets

2024-02-19 Thread Andre Vieira (lists)
Replaced uses of __seg_gs with the SEG macro defined in the testcase to 
pick (if any) the right __seg_{gs,fs} keyword based on the target.


gcc/testsuite/ChangeLog:

* gcc.dg/bitint-86.c (__seg_gs): Replace with SEG MACRO.

diff --git a/gcc/testsuite/gcc.dg/bitint-86.c b/gcc/testsuite/gcc.dg/bitint-86.c
index 
4e5761a203bc39150540326df9c0d88544bb02ef..10a2392b6f530ae165252bdac750061e92d53131
 100644
--- a/gcc/testsuite/gcc.dg/bitint-86.c
+++ b/gcc/testsuite/gcc.dg/bitint-86.c
@@ -15,14 +15,14 @@ struct T { struct S b[4]; };
 #endif
 
 void
-foo (__seg_gs struct T *p)
+foo (SEG struct T *p)
 {
   struct S s;
   p->b[0] = s;
 }
 
 void
-bar (__seg_gs struct T *p, _BitInt(710) x, int y, double z)
+bar (SEG struct T *p, _BitInt(710) x, int y, double z)
 {
   p->b[0].a = x + 42;
   p->b[1].a = x << y;
@@ -31,7 +31,7 @@ bar (__seg_gs struct T *p, _BitInt(710) x, int y, double z)
 }
 
 int
-baz (__seg_gs struct T *p, _BitInt(710) x, _BitInt(710) y)
+baz (SEG struct T *p, _BitInt(710) x, _BitInt(710) y)
 {
   return __builtin_add_overflow (x, y, &p->b[1].a);
 }
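
For context, a sketch of one way such a SEG macro can be defined (the actual
definition in bitint-86.c may differ; __SEG_GS/__SEG_FS are the x86 predefined
macros that advertise the corresponding keywords):

#ifdef __SEG_GS
#define SEG __seg_gs
#elif defined __SEG_FS
#define SEG __seg_fs
#else
#define SEG
#endif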


Re: veclower: improve selection of vector mode when lowering [PR 112787]

2024-02-19 Thread Andre Vieira (lists)

Hi all,

OK to backport this to gcc-12 and gcc-13? Patch applies cleanly, 
bootstrapped and regression tested on aarch64-unknown-linux-gnu. Only 
change is in the testcase as I had to use -march=armv9-a because 
-march=armv8-a+sve conflicts with -mcpu=neoverse-n2 in previous gcc 
versions.


Kind Regards,
Andre

On 20/12/2023 14:30, Richard Biener wrote:

On Wed, 20 Dec 2023, Andre Vieira (lists) wrote:


Thanks, fully agree with all comments.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic (type_for_widest_vector_mode): Change function
 to use original vector type and check widest vector mode has at most
 the same number of elements.
(get_compute_type): Pass original vector type rather than the element
 type to type_for_widest_vector_mode and remove now obsolete check
 for the number of elements.


OK.

Richard.


On 07/12/2023 07:45, Richard Biener wrote:

On Wed, 6 Dec 2023, Andre Vieira (lists) wrote:


Hi,

This patch addresses the issue reported in PR target/112787 by improving
the
compute type selection.  We do this by not considering types with more
elements
than the type we are lowering since we'd reject such types anyway.

gcc/ChangeLog:

  PR target/112787
  * tree-vect-generic (type_for_widest_vector_mode): Add a parameter to
  control maximum amount of elements in resulting vector mode.
  (get_compute_type): Restrict vector_compute_type to a mode no wider
  than the original compute type.

gcc/testsuite/ChangeLog:

  * gcc.target/aarch64/pr112787.c: New test.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
x86_64-pc-linux-gnu.

Is this OK for trunk?


@@ -1347,7 +1347,7 @@ optimize_vector_constructor (gimple_stmt_iterator
*gsi)
  TYPE, or NULL_TREE if none is found.  */

Can you improve the function comment?  It also doesn't mention OP ...

   static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree type, optab op, poly_int64 max_nunits =
0)
   {
 machine_mode inner_mode = TYPE_MODE (type);
 machine_mode best_mode = VOIDmode, mode;
@@ -1371,7 +1371,9 @@ type_for_widest_vector_mode (tree type, optab op)
 FOR_EACH_MODE_FROM (mode, mode)
   if (GET_MODE_INNER (mode) == inner_mode
  && maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op, mode) != CODE_FOR_nothing)
+   && optab_handler (op, mode) != CODE_FOR_nothing
+   && (known_eq (max_nunits, 0)
+   || known_lt (GET_MODE_NUNITS (mode), max_nunits)))

max_nunits suggests that known_le would be appropriate instead.

I see the only other caller with similar "problems":

  }
/* Can't use get_compute_type here, as supportable_convert_operation
   doesn't necessarily use an optab and needs two arguments.  */
tree vec_compute_type
  = type_for_widest_vector_mode (TREE_TYPE (arg_type), mov_optab);
if (vec_compute_type
&& VECTOR_MODE_P (TYPE_MODE (vec_compute_type))
&& subparts_gt (arg_type, vec_compute_type))

so please do not default to 0 but adjust this one as well.  It also
seems you then can remove the subparts_gt guards on both
vec_compute_type uses.

I think the API would be cleaner if we'd pass the original vector type
we can then extract TYPE_VECTOR_SUBPARTS from, avoiding the extra arg.

No?

Thanks,
Richard.






Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-02-01 Thread Andre Vieira (lists)




On 01/02/2024 07:19, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:


The patch didn't come with a testcase so it's really hard to tell
what goes wrong now and how it is fixed ...


My bad! I had a testcase locally but never added it...

However... now I look at it and ran it past Richard S, the codegen isn't 
'wrong', but it does have the potential to lead to some pretty slow 
codegen, especially for inbranch simdclones where it transforms the SVE 
predicate into an Advanced SIMD vector by inserting the elements one at 
a time...


An example of which can be seen if you do:

gcc -O3 -march=armv8-a+sve -msve-vector-bits=128  -fopenmp-simd t.c -S

with the following t.c:
#pragma omp declare simd simdlen(4) inbranch
int __attribute__ ((const)) fn5(int);

void fn4 (int *a, int *b, int n)
{
for (int i = 0; i < n; ++i)
b[i] = fn5(a[i]);
}

Now I do have to say, for our main usecase of libmvec we won't have any 
'inbranch' Advanced SIMD clones, so we avoid that issue... But of course 
that doesn't mean user-code will.


I'm gonna remove this patch and run another regression test to see if it 
catches anything weird, but if not then I guess we do have the option to 
not use this patch and aim to solve the costing or codegen issue in 
GCC 15. We don't currently do any simdclone costing, and I don't have a 
clear suggestion for how, given OpenMP has no mechanism that I know of 
to expose the speedup of a simdclone over its scalar variant, so how 
would we 'compare' a simdclone call with extra overhead of argument 
preparation vs scalar?  Though at least we could prefer a call to a 
different simdclone with less argument preparation. Anyways, I digress.


Other tests, these require aarch64-autovec-preference=2 so that also has 
me worried less...


gcc -O3 -march=armv8-a+sve -msve-vector-bits=128 --param 
aarch64-autovec-preference=2 -fopenmp-simd t.c -S


t.c:
#pragma omp declare simd simdlen(2) notinbranch
float __attribute__ ((const)) fn1(double);

void fn0 (float *a, float *b, int n)
{
for (int i = 0; i < n; ++i)
b[i] = fn1((double) a[i]);
}

#pragma omp declare simd simdlen(2) notinbranch
float __attribute__ ((const)) fn3(float);

void fn2 (float *a, double *b, int n)
{
for (int i = 0; i < n; ++i)
b[i] = (double) fn3(a[i]);
}


Richard.



That said, I wonder how we end up mixing things up in the first place.

Richard.






Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 14:35, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 13:58, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the
simdlens might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review, I don't think cgraph_simd_clone_info is the right
place to pass down this information, since this is information about the
caller rather than the simdclone itself. What we are trying to achieve here
is making the vectorizer able to accept or reject simdclones based on the
ISA we are vectorizing for. To distinguish between SVE and Advanced SIMD
ISAs we use modes, I am also not sure that's ideal but it is what we
currently use. So to answer your earlier question, yes I can also pass down
mode if that's preferable.


Note cgraph_simd_clone_info has simdlen and we seem to check elsewhere
whether that's POLY or constant.  I wonder how aarch64_sve_mode_p
comes into play here which in the end classifies VLS SVE modes as
non-SVE?



Using -msve-vector-bits=128
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$4 = E_VNx4SImode
(gdb) p  TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo))
$5 = (tree) 0xf741c1b0
(gdb) p debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
128
(gdb) p aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo)))
$5 = true

and for reference without vls codegen:
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$1 = E_VNx4SImode
(gdb) p  debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
POLY_INT_CST [128, 128]

Having said that I believe that the USABLE targethook implementation for
aarch64 should also block other uses, like an Advanced SIMD mode being used as
input for a SVE VLS SIMDCLONE. The reason being that for instance 'half'
registers like VNx2SI are packed differently from V2SI.

We could teach the vectorizer to support these of course, but that requires
more work and is not extremely useful just yet. I'll add the extra check that
to the patch once we agree on how to pass down the information we need. Happy
to use either mode, or stmt_vec_info and extract the mode from it like it does
now.


As said, please pass down 'mode'.  But I wonder how to document it,
which mode is that supposed to be?  Any of result or any argument
mode that happens to be a vector?  I think that we might be able
to mix Advanced SIMD modes and SVE modes with -msve-vector-bits=128
in the same loop?

Are the simd clones you don't want to use with -msve-vector-bits=128
having constant simdlen?  If so why do you generate them in the first
place?


So this is where things get a bit confusing and I will write up some 
text for these cases to put in our ABI document (currently in Beta and 
in need of some tlc).


Our intended behaviour is for a 'declare simd' without a simdlen to 
generate simdclones for:
* Advanced SIMD 128 and 64-bit vectors, where possible (we don't allow 
for simdlen 1, Tamar fixed that in gcc recently),

* SVE VLA vectors.

Let me illustrate this with an example:

__attribute__ ((simd (notinbranch), const)) float cosf(float);

Should tell the compiler the following simd clones are available:
__ZGVnN4v_cosf 128-bit 4x4 float Advanced SIMD clone
__ZGVnN2v_cosf 64-bit  4x2 float Advanced SIMD clone
__ZGVsMxv_cosf [128, 128]-bit 4x4xN SVE SIMD clone

[To save you looking into the abi let me break this down, _ZGV is 
prefix, then 'n' or 's' picks between Advanced SIMD and SVE, 'N' or 'M' 
picks between Not Masked and Masked (SVE is always masked even if we ask 
for notinbranch), then a digit or 'x' picks between Vector Length or 
VLA, and after that you get a letter per argument, where v = vector mapped]


Regardless of -msve-vector-bits, however, the vectorizer (and any other
part of the compiler) may assume that the VL of the VLA SVE clone is
that specified by -msve-vector-bits, which will still work as long as the
clone is written in a VLA way.
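For example (a hypothetical caller, not taken from the ABI document), when the
loop below is compiled with SVE enabled and -msve-vector-bits=128, the
vectorizer may use the VLA clone __ZGVsMxv_cosf from the earlier example and
assume VL is 128 bits while doing so:

/* Hypothetical caller; flags along the lines of -O2 -ftree-vectorize
   -march=armv8-a+sve -msve-vector-bits=128 are assumed.  */
__attribute__ ((simd (notinbranch), const)) float cosf (float);

void
apply_cos (float *__restrict out, const float *__restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = cosf (in[i]);
}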


If the attribute is used with a function definition rather than a
declaration, like so:


__attribute__ ((simd (notinbranch), const)) float fn0(float a)
{
  return a + 1.0f;
}

the compiler should again generate the three simd clones:
__ZGVnN4v_fn0 128-bit 4x4 float Advanced SIMD clone
__ZGVnN2v_fn0 64-bit  4x2 float Advanced SIMD clone
__ZGVsMxv_fn0 [128, 128]-bit 4x4xN SVE SIMD clone
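For illustration only, a hand-written approximation of the VLA SVE clone using
ACLE intrinsics might look like the sketch below; the exact signature (the
predicate passed as a trailing svbool_t) is an assumption based on the beta
ABI rather than the compiler's actual output:

#include <arm_sve.h>

/* Sketch of what __ZGVsMxv_fn0 is expected to compute: inactive lanes pass
   'a' through, active lanes perform the a + 1.0f from the scalar fn0.  */
svfloat32_t
sketch_ZGVsMxv_fn0 (svfloat32_t a, svbool_t pg)
{
  return svadd_n_f32_m (pg, a, 1.0f);
}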

Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 14:03, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the
simdlens
might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review. I don't think cgraph_simd_clone_info is the right
place to pass down this information, since it describes the caller rather
than the simdclone itself. What we are trying to achieve here is making the
vectorizer able to accept or reject simdclones based on the ISA we are
vectorizing for. To distinguish between the SVE and Advanced SIMD ISAs we
use modes; I am not sure that's ideal either, but it is what we currently
use. So to answer your earlier question, yes, I can also pass down the mode
if that's preferable.


Note cgraph_simd_clone_info has simdlen and we seem to check elsewhere
whether that's POLY or constant.  I wonder how aarch64_sve_mode_p
comes into play here which in the end classifies VLS SVE modes as
non-SVE?


Maybe it's just a bit non-obvious as you key on mangling:

  static int
-aarch64_simd_clone_usable (struct cgraph_node *node)
+aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo)
  {
switch (node->simdclone->vecsize_mangle)
  {
  case 'n':
if (!TARGET_SIMD)
 return -1;
+  if (STMT_VINFO_VECTYPE (stmt_vinfo)
+      && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))))
+    return -1;

?  What does 'n' mean?  It's documented as

   /* The mangling character for a given vector size.  This is used
  to determine the ISA mangling bit as specified in the Intel
  Vector ABI.  */
   unsigned char vecsize_mangle;


I'll update the comment, but yeah, 'n' is for Advanced SIMD and 's' is for SVE.


which is slightly misleading.


Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 13:58, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the
simdlens
might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review. I don't think cgraph_simd_clone_info is the right
place to pass down this information, since it describes the caller rather
than the simdclone itself. What we are trying to achieve here is making the
vectorizer able to accept or reject simdclones based on the ISA we are
vectorizing for. To distinguish between the SVE and Advanced SIMD ISAs we
use modes; I am not sure that's ideal either, but it is what we currently
use. So to answer your earlier question, yes, I can also pass down the mode
if that's preferable.


Note cgraph_simd_clone_info has simdlen and we seem to check elsewhere
whether that's POLY or constant.  I wonder how aarch64_sve_mode_p
comes into play here which in the end classifies VLS SVE modes as
non-SVE?



Using -msve-vector-bits=128
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$4 = E_VNx4SImode
(gdb) p  TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo))
$5 = (tree) 0xf741c1b0
(gdb) p debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
128
(gdb) p aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo)))
$5 = true

and for reference without vls codegen:
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$1 = E_VNx4SImode
(gdb) p  debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
POLY_INT_CST [128, 128]

Having said that, I believe the USABLE target hook implementation for
aarch64 should also block other uses, like an Advanced SIMD mode being
used as input for an SVE VLS simdclone. The reason is that, for
instance, 'half' registers like VNx2SI are packed differently from V2SI.

We could of course teach the vectorizer to support these, but that
requires more work and is not extremely useful just yet. I'll add that
extra check to the patch once we agree on how to pass down the
information we need. I'm happy to use either the mode, or stmt_vec_info
and extract the mode from it like it does now.



Regards,
Andre





Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the simdlens
might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review. I don't think cgraph_simd_clone_info is the right
place to pass down this information, since it describes the caller rather
than the simdclone itself. What we are trying to achieve here is making
the vectorizer able to accept or reject simdclones based on the ISA we
are vectorizing for. To distinguish between the SVE and Advanced SIMD
ISAs we use modes; I am not sure that's ideal either, but it is what we
currently use. So to answer your earlier question, yes, I can also pass
down the mode if that's preferable.


Regards,
Andre


[RFC] aarch64: Add support for __BitInt

2024-01-10 Thread Andre Vieira (lists)

Hi,

This patch is still work in progress, but I'm posting it to show a failure
with the bitint-7 test, where handle_stmt (called from lower_mergeable_stmt)
ICEs because the idx (3) is out of range for __BitInt(135) with a
limb_prec of 64.
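As a quick sanity check on those numbers (a standalone illustration, not part
of the patch), the limb count follows directly from the width and limb_prec:

#include <stdio.h>

/* With limb_prec == 64, _BitInt(135) needs ceil(135/64) == 3 limbs, so the
   valid limb indices are 0..2 and idx == 3 is one past the end.  */
int
main (void)
{
  unsigned width = 135, limb_prec = 64;
  unsigned nlimbs = (width + limb_prec - 1) / limb_prec;
  printf ("_BitInt(%u) with %u-bit limbs -> %u limbs (indices 0..%u)\n",
	  width, limb_prec, nlimbs, nlimbs - 1);
  return 0;
}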


I hacked GCC locally to work around this issue and still have one
outstanding failure, so I will look to resolve that failure before posting
a new version.


Kind Regards,
Andre

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
a5a6b52730d6c5013346d128e89915883f1707ae..15fb0ece5256f25c2ca8bb5cb82fc61488d0393e
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6534,7 +6534,7 @@ aarch64_return_in_memory_1 (const_tree type)
   machine_mode ag_mode;
   int count;
 
-  if (!AGGREGATE_TYPE_P (type)
+  if (!(AGGREGATE_TYPE_P (type) || TREE_CODE (type) == BITINT_TYPE)
   && TREE_CODE (type) != COMPLEX_TYPE
   && TREE_CODE (type) != VECTOR_TYPE)
 /* Simple scalar types always returned in registers.  */
@@ -6618,6 +6618,10 @@ aarch64_function_arg_alignment (machine_mode mode, 
const_tree type,
 
   gcc_assert (TYPE_MODE (type) == mode);
 
+  if (TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return GET_MODE_ALIGNMENT (TImode);
+
   if (!AGGREGATE_TYPE_P (type))
 {
   /* The ABI alignment is the natural alignment of the type, without
@@ -21773,6 +21777,11 @@ aarch64_composite_type_p (const_tree type,
   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
 return true;
 
+  if (type
+  && TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return true;
+
   if (mode == BLKmode
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
@@ -28265,6 +28274,29 @@ aarch64_excess_precision (enum excess_precision_type 
type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+   Return true if _BitInt(N) is supported and fill its details into *INFO.  */
+bool
+aarch64_bitint_type_info (int n, struct bitint_info *info)
+{
+  if (n <= 8)
+info->limb_mode = QImode;
+  else if (n <= 16)
+info->limb_mode = HImode;
+  else if (n <= 32)
+info->limb_mode = SImode;
+  else
+info->limb_mode = DImode;
+
+  if (n > 128)
+info->abi_limb_mode = TImode;
+  else
+info->abi_limb_mode = info->limb_mode;
+  info->big_endian = TARGET_BIG_END;
+  info->extended = false;
+  return true;
+}
+
 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
scheduled for speculative execution.  Reject the long-running division
and square-root instructions.  */
@@ -30374,6 +30406,9 @@ aarch64_run_selftests (void)
 #undef TARGET_C_EXCESS_PRECISION
 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
 
+#undef TARGET_C_BITINT_TYPE_INFO
+#define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/libgcc/config/aarch64/t-softfp b/libgcc/config/aarch64/t-softfp
index 
2e32366f891361e2056c680b2e36edb1871c7670..4302ad52eb881825d0fb65b9ebd21031781781f5
 100644
--- a/libgcc/config/aarch64/t-softfp
+++ b/libgcc/config/aarch64/t-softfp
@@ -4,7 +4,8 @@ softfp_extensions := sftf dftf hftf bfsf
 softfp_truncations := tfsf tfdf tfhf tfbf dfbf sfbf hfbf
 softfp_exclude_libgcc2 := n
 softfp_extras += fixhfti fixunshfti floattihf floatuntihf \
-floatdibf floatundibf floattibf floatuntibf
+floatdibf floatundibf floattibf floatuntibf \
+fixtfbitint floatbitinttf
 
 TARGET_LIBGCC2_CFLAGS += -Wno-missing-prototypes
 


Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-20 Thread Andre Vieira (lists)
Squashed the definition and changes to predicated_doloop_end_internal
and dlstp*_insn into this patch to make sure the first patch builds
independently.


On 18/12/2023 11:53, Andre Vieira wrote:


Reworked Stam's patch after comments in:
https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640362.html

The original gcc ChangeLog remains unchanged, but I did split up some tests so
here is the testsuite ChangeLog.


gcc/testsuite/ChangeLog:

* gcc.target/arm/lob.h: Update framework.
* gcc.target/arm/lob1.c: Likewise.
* gcc.target/arm/lob6.c: Likewise.
* gcc.target/arm/mve/dlstp-compile-asm.c: New test.
* gcc.target/arm/mve/dlstp-int16x8.c: New test.
* gcc.target/arm/mve/dlstp-int16x8-run.c: New test.
* gcc.target/arm/mve/dlstp-int32x4.c: New test.
* gcc.target/arm/mve/dlstp-int32x4-run.c: New test.
* gcc.target/arm/mve/dlstp-int64x2.c: New test.
* gcc.target/arm/mve/dlstp-int64x2-run.c: New test.
* gcc.target/arm/mve/dlstp-int8x16.c: New test.
* gcc.target/arm/mve/dlstp-int8x16-run.c: New test.
* gcc.target/arm/mve/dlstp-invalid-asm.c: New test.
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 
2f5ca79ed8ddd647b212782a0454ee4fefc07257..4f164c547406c43219900c111401540c7ef9d7d1
 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern rtx arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 
0c0cb14a8a4f043357b8acd7042a9f9386af1eb1..1ee72bcb7ec4bea5feea8453ceef7702b0088a73
 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const 
arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34483,19 +34489,1147 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
+arm_target_bb_ok_for_lob (basic_block bb)
 {
-  basic_block bb = BLOCK_FOR_INSN (insn);
   /* Make sure the basic block of the target insn is a simple latch
  having as single predecessor and successor the body of the loop
  itself.  Only simple loops with a single basic block as body are
  supported for 'low over head loop' making sure that LE target is
  above LE itself in the generated code.  */
-
   return single_succ_p (bb)
-&& single_pred_p (bb)
-&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-&& contains_no_active_insn_p (bb);
+&& single_pred_p (bb)
+&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+/* Utility function: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.  */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+  && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+  && (XINT (SET_SRC (insn_set), 1) == VCTP
+ || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+{
+  machine_mode mode = GET_MODE (SET_SRC (insn_set));
+  return (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+? GET_MODE_NUNITS (mode) : 0;
+}
+  return 0;
+}
+
+/* Check if INSN requires the use of the VPR reg, if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == 0, check all operands, including the OUT operands,
+ and return the first occurrence of the VPR reg.
+   * For TYPE == 1, only check the input operands.
+   * For TYPE == 2, only check the output operands.
+   (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+ this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+{
+  requires_vpr = true;
+  if (type == 1 && 

Re: [PATCH 1/2] arm: Add define_attr to create a mapping between MVE predicated and unpredicated insns

2023-12-20 Thread Andre Vieira (lists)
Reworked patch after Richard's comments and moved 
predicated_doloop_end_internal and dlstp*_insn to the next patch in the 
series to make sure this one builds on its own.


On 18/12/2023 11:53, Andre Vieira wrote:


Re-sending Stam's first patch, same as:
https://gcc.gnu.org/pipermail/gcc-patches/2023-November/635301.html

Hopefully patchworks can pick this up :)
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 
a9c2752c0ea5ecd4597ded254e9426753ac0a098..f0b01b7461f883994a0be137cb6cbf079d54618b
 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -2375,6 +2375,21 @@ extern int making_const_table;
   else if (TARGET_THUMB1)  \
 thumb1_final_prescan_insn (INSN)
 
+/* These defines are useful to refer to the value of the mve_unpredicated_insn
+   insn attribute.  Note that, because these use the get_attr_* function, these
+   will change recog_data if (INSN) isn't current_insn.  */
+#define MVE_VPT_PREDICABLE_INSN_P(INSN)
\
+  (recog_memoized (INSN) >= 0  \
+   && get_attr_mve_unpredicated_insn (INSN) != CODE_FOR_nothing)
+
+#define MVE_VPT_PREDICATED_INSN_P(INSN)
\
+  (MVE_VPT_PREDICABLE_INSN_P (INSN)\
+   && recog_memoized (INSN) != get_attr_mve_unpredicated_insn (INSN))
+
+#define MVE_VPT_UNPREDICATED_INSN_P(INSN)  \
+  (MVE_VPT_PREDICABLE_INSN_P (INSN)\
+   && recog_memoized (INSN) == get_attr_mve_unpredicated_insn (INSN))
+
 #define ARM_SIGN_EXTEND(x)  ((HOST_WIDE_INT)   \
   (HOST_BITS_PER_WIDE_INT <= 32 ? (unsigned HOST_WIDE_INT) (x) \
: unsigned HOST_WIDE_INT)(x)) & (unsigned HOST_WIDE_INT) 0x) |\
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 
07eaf06cdeace750fe1c7d399deb833ef5fc2b66..296212be33ffe6397b05491d8854d2a59f7c54df
 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -124,6 +124,12 @@ (define_attr "fpu" "none,vfp"
 ; and not all ARM insns do.
 (define_attr "predicated" "yes,no" (const_string "no"))
 
+; An attribute that encodes the CODE_FOR_ of the MVE VPT unpredicated
+; version of a VPT-predicated instruction.  For unpredicated instructions
+; that are predicable, encode the same pattern's CODE_FOR_ as a way to
+; encode that it is a predicable instruction.
+(define_attr "mve_unpredicated_insn" "" (symbol_ref "CODE_FOR_nothing"))
+
 ; LENGTH of an instruction (in bytes)
 (define_attr "length" ""
   (const_int 4))
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 
a980353810166312d5bdfc8ad58b2825c910d0a0..5ea2d9e866891bdb3dc73fcf6cbd6cdd2f989951
 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2305,6 +2305,7 @@ (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") 
(UNSPEC_QSUB8 "qsub8")
 
 (define_int_attr mmla_sfx [(UNSPEC_MATMUL_S "s8") (UNSPEC_MATMUL_U "u8")
   (UNSPEC_MATMUL_US "s8")])
+
 ;;MVE int attribute.
 (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
   (VREV16Q_U "u") (VMVNQ_N_S "s") (VMVNQ_N_U "u")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 
b0d3443da9cee991193d390200738290806a1e69..b1862d7977e91605cd971e634105bed3fa6e75cb
 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -17,7 +17,7 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; .
 
-(define_insn "*mve_mov"
+(define_insn "mve_mov"
   [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w   , w,   
r,Ux,w")
(match_operand:MVE_types 1 "general_operand"  " 
w,r,w,DnDm,UxUi,r,w, Ul"))]
   "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT"
@@ -81,18 +81,27 @@ (define_insn "*mve_mov"
   return "";
 }
 }
-  [(set_attr "type" 
"mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_store,mve_load")
+   [(set_attr_alternative "mve_unpredicated_insn" [(symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_nothing")
+  (symbol_ref 
"CODE_FOR_nothing")
+  (symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_nothing")
+  (symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_nothing")])
+   (set_attr "type" 
"mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_store,mve_load")
(set_attr "length" "4,8,8,4,4,8,4,8")
(set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*")
(set_attr "neg_pool_range" "*,*,*,*,996,*,*,*")])
 
-(define_insn "*mve_vdup"
+(define_insn "mve_vdup"
   

omp: Fix simdclone arguments with veclen lower than simdlen [PR113040]

2023-12-20 Thread Andre Vieira (lists)

This patch fixes an issue introduced by:
commit ea4a3d08f11a59319df7b750a955ac613a3f438a
Author: Andre Vieira 
Date:   Wed Nov 1 17:02:41 2023 +

omp: Reorder call for TARGET_SIMD_CLONE_ADJUST

The problem was that after this patch we no longer added multiple 
arguments for vector arguments where the veclen was lower than the simdlen.
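A minimal illustration of the case being fixed (a hypothetical example, not
the missing testcase): on a target whose float vector registers hold 4 lanes
(veclen 4), the simdlen(8) clone below must receive its vector argument as two
4-lane vector parameters, which is what simd_clone_adjust_argument_types had
stopped doing:

/* Assumes -fopenmp-simd (or -fopenmp); with veclen 4 the generated clone
   should take two vector parameters for 'x', covering lanes 0-3 and 4-7.  */
#pragma omp declare simd simdlen(8) notinbranch
float
scale (float x)
{
  return x * 2.0f;
}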


gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_adjust_argument_types): Add multiple
vector arguments where simdlen is larger than veclen.

Bootstrapped and regression tested on x86_64-pc-linux-gnu and 
aarch64-unknown-linux-gnu.


OK for trunk?

PS: I'm struggling to add a testcase for this; the dumps don't show the
simdclone prototype and I can't easily create a run-test for this as it
requires glibc.  The only option is a very flaky assembly scan test to see
whether it's writing to ymm4 (i.e. it is passing enough parameters), but I
haven't done that because I don't think it's a good idea.
PPS: maybe we ought to print the simdclone prototype when passing
-fdump-ipa-simdclone?

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
3fbe428125243bc02bd58f6e50ac773e8df8..5151fef3bcdaa76802184df43ba13b8709645fd4
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -781,6 +781,7 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
   struct cgraph_simd_clone *sc = node->simdclone;
   unsigned i, k;
   poly_uint64 veclen;
+  auto_vec<tree> new_params;
 
   for (i = 0; i < sc->nargs; ++i)
 {
@@ -798,9 +799,11 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
   switch (sc->args[i].arg_type)
{
default:
+ new_params.safe_push (parm_type);
  break;
case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
+ new_params.safe_push (parm_type);
  if (node->definition)
sc->args[i].simd_array
  = create_tmp_simd_array (IDENTIFIER_POINTER (DECL_NAME (parm)),
@@ -828,6 +831,9 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
  else
vtype = build_vector_type (parm_type, veclen);
  sc->args[i].vector_type = vtype;
+ k = vector_unroll_factor (sc->simdlen, veclen);
+ for (unsigned j = 0; j < k; j++)
+   new_params.safe_push (vtype);
 
  if (node->definition)
sc->args[i].simd_array
@@ -893,22 +899,8 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
last_parm_void = true;
 
   gcc_assert (TYPE_ARG_TYPES (TREE_TYPE (node->decl)));
-  for (i = 0; i < sc->nargs; i++)
-   {
- tree ptype;
- switch (sc->args[i].arg_type)
-   {
-   default:
- ptype = sc->args[i].orig_type;
- break;
-   case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
-   case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
-   case SIMD_CLONE_ARG_TYPE_VECTOR:
- ptype = sc->args[i].vector_type;
- break;
-   }
- new_arg_types = tree_cons (NULL_TREE, ptype, new_arg_types);
-   }
+  for (i = 0; i < new_params.length (); i++)
+   new_arg_types = tree_cons (NULL_TREE, new_params[i], new_arg_types);
   new_reversed = nreverse (new_arg_types);
   if (last_parm_void)
{


Re: veclower: improve selection of vector mode when lowering [PR 112787]

2023-12-20 Thread Andre Vieira (lists)

Thanks, fully agree with all comments.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic.cc (type_for_widest_vector_mode): Change function
to use original vector type and check widest vector mode has at most
the same number of elements.
(get_compute_type): Pass original vector type rather than the element
type to type_for_widest_vector_mode and remove now obsolete check
for the number of elements.

On 07/12/2023 07:45, Richard Biener wrote:

On Wed, 6 Dec 2023, Andre Vieira (lists) wrote:


Hi,

This patch addresses the issue reported in PR target/112787 by improving the
compute type selection.  We do this by not considering types with more
elements
than the type we are lowering since we'd reject such types anyway.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic (type_for_widest_vector_mode): Add a parameter to
control maximum amount of elements in resulting vector mode.
(get_compute_type): Restrict vector_compute_type to a mode no wider
than the original compute type.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr112787.c: New test.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
x86_64-pc-linux-gnu.

Is this OK for trunk?


@@ -1347,7 +1347,7 @@ optimize_vector_constructor (gimple_stmt_iterator
*gsi)
 TYPE, or NULL_TREE if none is found.  */

Can you improve the function comment?  It also doesn't mention OP ...

  static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree type, optab op, poly_int64 max_nunits =
0)
  {
machine_mode inner_mode = TYPE_MODE (type);
machine_mode best_mode = VOIDmode, mode;
@@ -1371,7 +1371,9 @@ type_for_widest_vector_mode (tree type, optab op)
FOR_EACH_MODE_FROM (mode, mode)
  if (GET_MODE_INNER (mode) == inner_mode
 && maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op, mode) != CODE_FOR_nothing)
+   && optab_handler (op, mode) != CODE_FOR_nothing
+   && (known_eq (max_nunits, 0)
+   || known_lt (GET_MODE_NUNITS (mode), max_nunits)))

max_nunits suggests that known_le would be appropriate instead.

I see the only other caller with similar "problems":

 }
   /* Can't use get_compute_type here, as supportable_convert_operation
  doesn't necessarily use an optab and needs two arguments.  */
   tree vec_compute_type
 = type_for_widest_vector_mode (TREE_TYPE (arg_type), mov_optab);
   if (vec_compute_type
   && VECTOR_MODE_P (TYPE_MODE (vec_compute_type))
   && subparts_gt (arg_type, vec_compute_type))

so please do not default to 0 but adjust this one as well.  It also
seems you then can remove the subparts_gt guards on both
vec_compute_type uses.

I think the API would be cleaner if we'd pass the original vector type
we can then extract TYPE_VECTOR_SUBPARTS from, avoiding the extra arg.

No?

Thanks,
Richard.diff --git a/gcc/testsuite/gcc.target/aarch64/pr112787.c 
b/gcc/testsuite/gcc.target/aarch64/pr112787.c
new file mode 100644
index 
..caca1bf7ef447e4489b2c134d7200a4afd16763f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr112787.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=armv8-a+sve -mcpu=neoverse-n2" } */
+
+typedef int __attribute__((__vector_size__ (64))) vec;
+
+vec fn (vec a, vec b)
+{
+  return a + b;
+}
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+} 4 } } */
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 
a7e6cb87a5e31e3dd2a893ea5652eeebf8d5d214..c906eb3521ea01fd2bdfc89c3476d02c555cf8cc
 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1343,12 +1343,16 @@ optimize_vector_constructor (gimple_stmt_iterator *gsi)
   gsi_replace (gsi, g, false);
 }
 
-/* Return a type for the widest vector mode whose components are of type
-   TYPE, or NULL_TREE if none is found.  */
+/* Return a type for the widest vector mode with the same element type as
+   type ORIGINAL_VECTOR_TYPE, with at most the same number of elements as type
+   ORIGINAL_VECTOR_TYPE and that is supported by the target for an operation
+   with optab OP, or return NULL_TREE if none is found.  */
 
 static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree original_vector_type, optab op)
 {
+  gcc_assert (VECTOR_TYPE_P (original_vector_type));
+  tree type = TREE_TYPE (original_vector_type);
   machine_mode inner_mode = TYPE_MODE (type);
   machine_mode best_mode = VOIDmode, mode;
   poly_int64 best_nunits = 0;
@@ -1371,7 +1375,9 @@ type_for_widest_vector_mode (tree type, optab op)
   FOR_EACH_MODE_FROM (mode, mode)
 if (GET_MODE_INNER (mode) == inner_mode
&& maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op

Re: [PATCH] Fix tests for gomp

2023-12-13 Thread Andre Vieira (lists)




On 13/12/2023 10:55, Jakub Jelinek wrote:

On Wed, Dec 13, 2023 at 10:43:16AM +, Andre Vieira (lists) wrote:

Hi,

Apologies for the delay and this mixup. I need to do something different

This is to fix testisms initially introduced by:
commit f5fc001a84a7dbb942a6252b3162dd38b4aae311
Author: Andre Vieira 
Date:   Mon Dec 11 14:24:41 2023 +

 aarch64: enable mixed-types for aarch64 simdclones

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr87887-1.c: Fixed test.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.

libgomp/ChangeLog:

* testsuite/libgomp.c/declare-variant-1.c: Fixed test.
* testsuite/libgomp.fortran/declare-simd-1.f90: Likewise.

OK for trunk? I was intending to commit this as obvious, but Jakub had made a
comment about declare-simd-1.f90, so I thought it might be worth just sending
it up to the mailing list first.



--- a/libgomp/testsuite/libgomp.c/declare-variant-1.c
+++ b/libgomp/testsuite/libgomp.c/declare-variant-1.c
@@ -40,16 +40,17 @@ f04 (int a)
  int
  test1 (int x)
  {
-  /* At gimplification time, we can't decide yet which function to call.  */
-  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
+  /* At gimplification time, we can't decide yet which function to call for
+ x86_64 targets, given the f01 variant.  */
+  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target 
x86_64-*-* } } } */
/* After simd clones are created, the original non-clone test1 shall
   call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
   shall call f01 with score 8.  */
/* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } */


The changes in this test look all wrong.  The differences are
i?86-*-* x86_64-*-* (which can support avx512f isa) vs. other targets (which
can't).
So, there is nothing aarch64 specific in there and { target x86_64-*-* }
is also incorrect.  It should be simply
{ target i?86-*-* x86_64-*-* }
vs.
{ target { ! { i?86-*-* x86_64-*-* } } }
(never sure about the ! syntaxes).



Hmm, I think I understand what you are saying, but I'm not sure I
agree. Before I enabled simdclone testing for aarch64, this test had
no target selectors, so it checked the same thing for all simdclone test
targets, which seem to be x86 and amdgcn:


@@ -4321,7 +4321,8 @@ proc check_effective_target_vect_simd_clones { } {
 return [check_cached_effective_target_indexed vect_simd_clones {
   expr { (([istarget i?86-*-*] || [istarget x86_64-*-*])
  && [check_effective_target_avx512f])
|| [istarget amdgcn-*-*]
|| [istarget aarch64*-*-*] }}]
 }

I haven't checked what amdgcn does with this test, but I'd have to 
assume they were passing before? Though I'm not sure how amdgcn would 
pass the original:
 -  /* At gimplification time, we can't decide yet which function to 
call.  */

 -  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */

I've added Andrew to the mail to see if he can comment on that. Either
way, I'd suggest we either add scans per target with the expected value,
or stick with my original change of aarch64 vs non-aarch64, as I think
that would better reflect the change of enabling this for aarch64 where
it wasn't run before.


[PATCH] Fix tests for gomp

2023-12-13 Thread Andre Vieira (lists)

Hi,

Apologies for the delay and this mixup. I need to do something different

This is to fix testisms initially introduced by:
commit f5fc001a84a7dbb942a6252b3162dd38b4aae311
Author: Andre Vieira 
Date:   Mon Dec 11 14:24:41 2023 +

aarch64: enable mixed-types for aarch64 simdclones

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr87887-1.c: Fixed test.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.

libgomp/ChangeLog:

* testsuite/libgomp.c/declare-variant-1.c: Fixed test.
* testsuite/libgomp.fortran/declare-simd-1.f90: Likewise.

OK for trunk? I was intending to commit this as obvious, but Jakub had made a
comment about declare-simd-1.f90, so I thought it might be worth just
sending it up to the mailing list first.


Kind regards,
Andre

diff --git a/gcc/testsuite/gcc.dg/gomp/pr87887-1.c 
b/gcc/testsuite/gcc.dg/gomp/pr87887-1.c
index 
281898300c7794d862e62c70a83a33d5aaa8f89e..8b04ffd0809be4e6f5ab97c2e32e800edffbee4f
 100644
--- a/gcc/testsuite/gcc.dg/gomp/pr87887-1.c
+++ b/gcc/testsuite/gcc.dg/gomp/pr87887-1.c
@@ -10,7 +10,6 @@ foo (int x)
 {
   return (struct S) { x };
 }
-/* { dg-warning "unsupported return type ‘struct S’ for ‘simd’ functions" "" { 
target aarch64*-*-* } .-4 } */
 
 #pragma omp declare simd
 int
@@ -18,7 +17,6 @@ bar (struct S x)
 {
   return x.n;
 }
-/* { dg-warning "unsupported argument type ‘struct S’ for ‘simd’ functions" "" 
{ target aarch64*-*-* } .-4 } */
 
 #pragma omp declare simd uniform (x)
 int
diff --git a/gcc/testsuite/gcc.dg/gomp/pr89246-1.c 
b/gcc/testsuite/gcc.dg/gomp/pr89246-1.c
index 
4a0fd74f0639b2832dcb9101e006d127568fbcbd..dfe629c1c6a51624cd94878c638606220cfe94eb
 100644
--- a/gcc/testsuite/gcc.dg/gomp/pr89246-1.c
+++ b/gcc/testsuite/gcc.dg/gomp/pr89246-1.c
@@ -8,7 +8,6 @@ int foo (__int128 x)
 {
   return x;
 }
-/* { dg-warning "unsupported argument type ‘__int128’ for ‘simd’ functions" "" 
{ target aarch64*-*-* } .-4 } */
 
 #pragma omp declare simd
 extern int bar (int x);
diff --git a/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c 
b/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
index 
f12244054bd46fa10e51cc3a688c4cf683689994..354078acd9f3073b8400621a0e7149aee571594b
 100644
--- a/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
+++ b/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
@@ -19,7 +19,6 @@ float setArray(float *a, float x, int k)
 /* { dg-final { scan-tree-dump "_ZGVnN2ua32vl_setArray" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnN4ua32vl_setArray" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnN2vvva32_addit" "optimized" { target 
aarch64*-*-* } } } */
-/* { dg-final { scan-tree-dump "_ZGVnN4vvva32_addit" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnM2vl66u_addit" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnM4vl66u_addit" "optimized" { target 
aarch64*-*-* } } } */
 
diff --git a/libgomp/testsuite/libgomp.c/declare-variant-1.c 
b/libgomp/testsuite/libgomp.c/declare-variant-1.c
index 
6129f23a0f80585246957022d63608dc3a68f1ff..790e9374054fe3e0ae609796640ff295b61e8389
 100644
--- a/libgomp/testsuite/libgomp.c/declare-variant-1.c
+++ b/libgomp/testsuite/libgomp.c/declare-variant-1.c
@@ -40,16 +40,17 @@ f04 (int a)
 int
 test1 (int x)
 {
-  /* At gimplification time, we can't decide yet which function to call.  */
-  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
+  /* At gimplification time, we can't decide yet which function to call for
+ x86_64 targets, given the f01 variant.  */
+  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target 
x86_64-*-* } } } */
   /* After simd clones are created, the original non-clone test1 shall
  call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
  shall call f01 with score 8.  */
   /* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } */
   int a = f04 (x);
   int b = f04 (x);
   return a + b;
diff --git a/libgomp/testsuite/libgomp.fortran/declare-simd-1.f90 

Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-12-12 Thread Andre Vieira (lists)




On 11/12/2023 21:42, Thomas Schwinge wrote:

Hi Andre!

On 2023-10-16T16:03:26+0100, "Andre Vieira (lists)" 
 wrote:

Just a minor update to the patch, I had missed the libgomp testsuite, so
had to make some adjustments there too.


Unfortunately, there appear to be a number of DejaGnu directive errors in
your test case changes -- do you not see those in your testing?


I hadn't seen those... I wonder whether they don't show up if you do
dg-cmp-results with just one -v. I have binned the build, but I'll rerun
it and double-check; I may need to use '-v -v' instead.


Thanks for letting me know.
..., and the following change also doesn't look quite right:



--- a/libgomp/testsuite/libgomp.fortran/declare-simd-1.f90
+++ b/libgomp/testsuite/libgomp.fortran/declare-simd-1.f90
@@ -1,5 +1,5 @@
  ! { dg-do run { target vect_simd_clones } }
-! { dg-options "-fno-inline" }
+! { dg-options "-fno-inline -cpp -D__aarch64__" }




Yeah, that needs a target selector. Thanks!


Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-07 Thread Andre Vieira (lists)
Thanks for addressing my comments. I have reviewed this and the other
patch before, and they LGTM. However, I do not have approval rights, so
you will need an OK from a maintainer.


Thanks for doing this :)

Andre

On 30/11/2023 12:55, Stamatis Markianos-Wright wrote:

Hi Andre,

Thanks for the comments, see latest revision attached.

On 27/11/2023 12:47, Andre Vieira (lists) wrote:

Hi Stam,

Just some comments.

+/* Recursively scan through the DF chain backwards within the basic 
block and
+   determine if any of the USEs of the original insn (or the USEs of 
the insns
s/Recursively scan/Scan/ as you no longer recurse, thanks for that by 
the way :) +   where thy were DEF-ed, etc., recursively) were affected 
by implicit VPT

remove recursively for the same reasons.

+  if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P 
(cond_temp_iv.step))

+    return NULL;
+  /* Look at the steps and swap around the rtx's if needed. Error 
out if

+ one of them cannot be identified as constant.  */
+  if (INTVAL (cond_counter_iv.step) != 0 && INTVAL 
(cond_temp_iv.step) != 0)

+    return NULL;

Move the comment above the if before, as the erroring out it talks 
about is there.

Done


+  emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
 space after 'insn_note)'

@@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 -  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
   || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !CONST_INT_P (XEXP (inc_src, 1)))

Do we ever check that inc_src is negative? We used to check if it was
-1; now we only check that it's a constant, but not a negative one, so I
suspect this needs a:

|| INTVAL (XEXP (inc_src, 1)) >= 0

Good point. Done


@@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc 
*desc,

 case GE:
   /* Currently only GE tests against zero are supported.  */
   gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+  /* FALLTHRU */
+    case GTU:
   noloop = constm1_rtx;

I spent a very long time staring at this trying to understand why 
noloop = constm1_rtx for GTU, where I thought it should've been (count 
& (n-1)). For the current use of doloop it doesn't matter because ARM 
is the only target using it and you set desc->noloop_assumptions to 
null_rtx in 'arm_attempt_dlstp_transform' so noloop is never used. 
However, if a different target accepts this GTU pattern then this 
target agnostic code will do the wrong thing.  I suggest we either:
 - set noloop to what we think might be the correct value, which if 
you ask me should be 'count & (XEXP (condition, 1))',
 - or add a gcc_assert (GET_CODE (condition) != GTU); under the if 
(desc->noloop_assumption); part and document why.  I have a slight 
preference for the assert given otherwise we are adding code that we 
can't test.


Yeah, that's true tbh. I've done the latter, but also separated out the
"case GTU:" and added a comment, so that it's clearer that the noloop
things aren't used in the only implemented GTU case (Arm).


Thank you :)



LGTM otherwise (but I don't have the power to approve this ;)).

Kind regards,
Andre

From: Stamatis Markianos-Wright 
Sent: Thursday, November 16, 2023 11:36 AM
To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; 
Richard Sandiford; Kyrylo Tkachov
Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated 
Low Overhead Loops


Pinging back to the top of reviewers' inboxes due to worry about Stage 1
End in a few days :)


See the last email for the latest version of the 2/2 patch. The 1/2
patch is A-Ok from Kyrill's earlier target-backend review.


On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:
One of the main reasons for reading the arm bits was to try to 
answer

the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:

These kinds of loops fail because they are "desc->infinite", so no
loop-doloop conversion is attempted at all (even for standard dls/le loops).

veclower: improve selection of vector mode when lowering [PR 112787]

2023-12-06 Thread Andre Vieira (lists)

Hi,

This patch addresses the issue reported in PR target/112787 by improving the
compute type selection.  We do this by not considering types with more 
elements

than the type we are lowering since we'd reject such types anyway.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic (type_for_widest_vector_mode): Add a parameter to
control maximum amount of elements in resulting vector mode.
(get_compute_type): Restrict vector_compute_type to a mode no wider
than the original compute type.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr112787.c: New test.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


Is this OK for trunk?

Kind regards,
Andre Vieira

diff --git a/gcc/testsuite/gcc.target/aarch64/pr112787.c 
b/gcc/testsuite/gcc.target/aarch64/pr112787.c
new file mode 100644
index 
..caca1bf7ef447e4489b2c134d7200a4afd16763f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr112787.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=armv8-a+sve -mcpu=neoverse-n2" } */
+
+typedef int __attribute__((__vector_size__ (64))) vec;
+
+vec fn (vec a, vec b)
+{
+  return a + b;
+}
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+} 4 } } */
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 
a7e6cb87a5e31e3dd2a893ea5652eeebf8d5d214..2dbf3c8f5f64f2623944110dbc371fe0944198f0
 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1347,7 +1347,7 @@ optimize_vector_constructor (gimple_stmt_iterator *gsi)
TYPE, or NULL_TREE if none is found.  */
 
 static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree type, optab op, poly_int64 max_nunits = 0)
 {
   machine_mode inner_mode = TYPE_MODE (type);
   machine_mode best_mode = VOIDmode, mode;
@@ -1371,7 +1371,9 @@ type_for_widest_vector_mode (tree type, optab op)
   FOR_EACH_MODE_FROM (mode, mode)
 if (GET_MODE_INNER (mode) == inner_mode
&& maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op, mode) != CODE_FOR_nothing)
+   && optab_handler (op, mode) != CODE_FOR_nothing
+   && (known_eq (max_nunits, 0)
+   || known_lt (GET_MODE_NUNITS (mode), max_nunits)))
   best_mode = mode, best_nunits = GET_MODE_NUNITS (mode);
 
   if (best_mode == VOIDmode)
@@ -1702,7 +1704,8 @@ get_compute_type (enum tree_code code, optab op, tree 
type)
  || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing))
 {
   tree vector_compute_type
-   = type_for_widest_vector_mode (TREE_TYPE (type), op);
+   = type_for_widest_vector_mode (TREE_TYPE (type), op,
+  TYPE_VECTOR_SUBPARTS (compute_type));
   if (vector_compute_type != NULL_TREE
  && subparts_gt (compute_type, vector_compute_type)
  && maybe_ne (TYPE_VECTOR_SUBPARTS (vector_compute_type), 1U)


Re: [PATCH 8/8] aarch64: Add SVE support for simd clones [PR 96342]

2023-12-01 Thread Andre Vieira (lists)




On 29/11/2023 17:01, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:

Rebased, no major changes, still needs review.

On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:

This patch finalizes adding support for the generation of SVE simd
clones when no simdlen is provided, following the ABI rules where the
widest data type determines the minimum amount of elements in a length
agnostic vector.

gcc/ChangeLog:

      * config/aarch64/aarch64-protos.h (add_sve_type_attribute):
Declare.
  * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute):
Make
  visibility global.
  * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
  chosen over SIMD ABI if a SVE type is used in return or arguments.
  (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd
clone
  when no simdlen is provided, according to ABI rules.
  (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
  (aarch64_simd_clone_adjust_ret_or_param): New.
  (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
  * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
  (simd_clone_adjust): Adapt safelen check to be compatible with VLA
  simdlen.

gcc/testsuite/ChangeLog:

  * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
  * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
  * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
  longer necessary.
  * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.


diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1005,6 +1005,8 @@ namespace aarch64_sve {
  #ifdef GCC_TARGET_H
bool verify_type_context (location_t, type_context_kind, const_tree, bool);
  #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+ const char *, const char *);
  }
  
  extern void aarch64_split_combinev16qi (rtx operands[3]);

diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 
161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c
 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
  /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE 
vectors
 and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
 mangling of the type.  ACLE_NAME is the  name of the type.  */
-static void
+void
  add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
const char *mangled_name, const char *acle_name)
  {
tree mangled_name_tree
  = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+= (acle_name ? get_identifier (acle_name) : NULL_TREE);
  
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);

+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
value = tree_cons (NULL_TREE, mangled_name_tree, value);
value = tree_cons (NULL_TREE, size_int (num_pr), value);
value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree 
fntype)
  static const predefined_function_abi &
  aarch64_fntype_abi (const_tree fntype)
  {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-return aarch64_simd_abi ();
-
if (aarch64_returns_value_in_sve_regs_p (fntype)
|| aarch64_takes_arguments_in_sve_regs_p (fntype))
  return aarch64_sve_abi ();
  
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))

+return aarch64_simd_abi ();
+
return default_function_abi;
  }
  


I think we discussed this off-list later, but the change above shouldn't
be necessary.  aarch64_vector_pcs must not be attached to SVE PCS functions,
so the two cases should be mutually exclusive.


Yeah I had made the changes locally, but not updated the patch yet.



@@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
int num, bool explicit_p)
  {
tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
int count;
unsigned HOST_WIDE_INT const_simdlen;
  
@@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and

Re: [RFC] vect: disable multiple calls of poly simdclones

2023-11-27 Thread Andre Vieira (lists)




On 06/11/2023 07:52, Richard Biener wrote:

On Fri, 3 Nov 2023, Andre Vieira (lists) wrote:


Hi,

The current codegen code to support VFs that are multiples of a simdclone
simdlen relies on BIT_FIELD_REF to create multiple input vectors.  This does
not work for non-constant simdclones, so we should disable using such clones
when the VF is a multiple of the non-constant simdlen until we change the
codegen to support those.

Enabling SVE simdclone support will cause ICEs if the vectorizer decides to
use an SVE simdclone with a VF that is larger than the simdlen. I'll be away
for the next two weeks, so I can't really discuss this further.
I initially tried to solve the problem, but the way
vectorizable_simd_clone_call is structured doesn't make it easy to replace
BIT_FIELD_REF right now with the poly-suitable solution of using
unpack_{hi,lo}.


I think it should be straightforward to use unpack_{even,odd} (it's
even/odd for VLA, right?  If lo/hi were possible then doing
BIT_FIELD_REF would be, too?  Also you need to have multiple stages
of unpack/pack when the factor is more than 2).

There's plenty of time even during stage3 to address this.

At least your patch should have come with a testcase (or two).


Yeah I didn't add one as it didn't trigger on AArch64 without my two 
outstanding aarch64 simdclone patches.


Is there a bugreport tracking this issue?  It should affect GCN as well
I guess.


No, since I can't trigger them yet on trunk until the reviews on my 
target specific patches are done and they are committed.


I don't have a GCN backend lying around but I suspect GCN doesn't use 
poly simdlen simdclones yet either... I haven't checked. The issue 
triggers for aarch64 when trying to generate SVE simdclones for 
functions with mixed types.  I'll give the unpack thing a go locally.


Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-11-27 Thread Andre Vieira (lists)

Hi Stam,

Just some comments.

+/* Recursively scan through the DF chain backwards within the basic 
block and
+   determine if any of the USEs of the original insn (or the USEs of 
the insns
s/Recursively scan/Scan/ as you no longer recurse, thanks for that by 
the way :) +   where thy were DEF-ed, etc., recursively) were affected 
by implicit VPT

remove recursively for the same reasons.

+  if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P 
(cond_temp_iv.step))

+   return NULL;
+  /* Look at the steps and swap around the rtx's if needed.  Error 
out if

+one of them cannot be identified as constant.  */
+  if (INTVAL (cond_counter_iv.step) != 0 && INTVAL 
(cond_temp_iv.step) != 0)

+   return NULL;

Move the comment above the if before, as the erroring out it talks about 
is there.


+  emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
 space after 'insn_note)'

@@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 -  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
   || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !CONST_INT_P (XEXP (inc_src, 1)))

Do we ever check that inc_src is negative? We used to check if it was
-1; now we only check that it's a constant, but not a negative one, so I
suspect this needs a:

|| INTVAL (XEXP (inc_src, 1)) >= 0

@@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 case GE:
   /* Currently only GE tests against zero are supported.  */
   gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+  /* FALLTHRU */
+case GTU:
   noloop = constm1_rtx;

I spent a very long time staring at this trying to understand why noloop 
= constm1_rtx for GTU, where I thought it should've been (count & 
(n-1)). For the current use of doloop it doesn't matter because ARM is 
the only target using it and you set desc->noloop_assumptions to 
null_rtx in 'arm_attempt_dlstp_transform' so noloop is never used. 
However, if a different target accepts this GTU pattern then this target 
agnostic code will do the wrong thing.  I suggest we either:
 - set noloop to what we think might be the correct value, which if you 
ask me should be 'count & (XEXP (condition, 1))',
 - or add a gcc_assert (GET_CODE (condition) != GTU); under the if 
(desc->noloop_assumption); part and document why.  I have a slight 
preference for the assert given otherwise we are adding code that we 
can't test.


LGTM otherwise (but I don't have the power to approve this ;)).

Kind regards,
Andre

From: Stamatis Markianos-Wright 
Sent: Thursday, November 16, 2023 11:36 AM
To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; Richard 
Sandiford; Kyrylo Tkachov
Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low 
Overhead Loops


Pinging back to the top of reviewers' inboxes due to worry about Stage 1
End in a few days :)


See the last email for the latest version of the 2/2 patch. The 1/2
patch is A-Ok from Kyrill's earlier target-backend review.


On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:

One of the main reasons for reading the arm bits was to try to answer
the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:

These kinds of loops fail because they are "desc->infinite", so no
loop-doloop conversion is attempted at all (even for standard
dls/le loops).

Thanks to that check I haven't been able to trigger anything like the
behaviour you describe, do you think the doloop_valid_p checks are
robust enough?

The loops I was thinking of are provably not infinite though. E.g.:

   for (unsigned int i = 0; i < UINT_MAX - 100; ++i)
 ...

is known to terminate.  And doloop conversion is safe with the normal
count-down-by-1 approach, so I don't think current code would need
to reject it.  I.e. a conversion to:

   unsigned int i = UINT_MAX - 101;
   do
 ...
   while (--i != ~0U);

would be safe, but a conversion to:

   int i = UINT_MAX - 101;
   do
 ...
   while ((i -= step, i > 0));

wouldn't, 
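
A small self-contained illustration of the hazard being discussed (example 
added for clarity, not from the thread; step taken as 1):

  #include <limits.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* The start value the signed down-counting form would use; the
       out-of-range conversion wraps to a large negative int on the
       usual two's-complement targets.  */
    int i = UINT_MAX - 101;
    unsigned long long iters = 0;

    do
      iters++;
    while ((i -= 1, i > 0));   /* signed > 0 is false immediately */

    /* The original count-up loop iterates UINT_MAX - 100 times; this
       converted form runs the body just once.  */
    printf ("converted loop ran %llu time(s)\n", iters);
    return 0;
  }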

[RFC] vect: disable multiple calls of poly simdclones

2023-11-03 Thread Andre Vieira (lists)

Hi,

The current codegen code to support VFs that are multiples of a 
simdclone's simdlen relies on BIT_FIELD_REF to create multiple input 
vectors.  This does not work for non-constant simdclones, so we should 
disable using such clones when the VF is a multiple of the non-constant 
simdlen until we change the codegen to support those.
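
As a hypothetical illustration of the case being disabled (example mine, 
names invented):

  /* With SVE simd clone support and no simdlen clause, foo gets a clone
     with a poly simdlen such as 2 x VL.  If the vectorizer then chooses a
     VF of, say, 4 x VL for the loop below, it would need two calls per
     vector iteration and would have to split the input vector with
     BIT_FIELD_REF, which is not possible for poly-sized vectors; with the
     check added here such a clone is simply not selected.  */
  #pragma omp declare simd
  double foo (double x);

  void
  bar (double *restrict a, double *restrict b, int n)
  {
    #pragma omp simd
    for (int i = 0; i < n; i++)
      b[i] = foo (a[i]);
  }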


Enabling SVE simdclone support will cause ICEs if the vectorizer decides 
to use a SVE simdclone with a VF that is larger than the simdlen. I'll 
be away for the next two weeks, so can't really discuss this further.
I initially tried to solve the problem, but the way 
vectorizable_simd_clone_call is structured doesn't make it easy to 
replace BIT_FIELD_REF with the poly-suitable solution right now of using 
unpack_{hi,lo}. Unfortunately I only found this now as I was adding 
further tests for SVE :(


gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_simd_clone_call): Reject simdclones
with non-constant simdlen when VF is not exactly the same.

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
5f262cae2aae784e3ef4fd07455b7aa742797b51..dc3e0716161838aef66cf37342499006673336d6
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4165,7 +4165,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
if (!constant_multiple_p (vf * group_size, n->simdclone->simdlen,
  &num_calls)
|| (!n->simdclone->inbranch && (masked_call_offset > 0))
-   || (nargs != simd_nargs))
+   || (nargs != simd_nargs)
+   /* Currently we do not support multiple calls of non-constant
+  simdlen as poly vectors can not be accessed by BIT_FIELD_REF.  */
+   || (!n->simdclone->simdlen.is_constant () && num_calls != 1))
  continue;
if (num_calls != 1)
  this_badness += exact_log2 (num_calls) * 4096;


Re: [PATCH] vect: allow using inbranch simdclones for masked loops

2023-11-03 Thread Andre Vieira (lists)




On 03/11/2023 07:31, Richard Biener wrote:



OK.

I do wonder about the gfortran testsuite adjustments though.

!GCC$ builtin (sin) attributes simd (inbranch)

   ! this should not be using simd clone
   y4 = sin(x8)

previously we wouldn't vectorize this as no notinbranch simd function
is available but now we do since we use the inbranch function for the
notinbranch call.  If that's desired then a better modification of
the test would be to expect vectorization, no?



I was in two minds about this. I interpreted the test to be about the 
fact that sin is overloaded in fortran, given the name of the program 
'program test_overloaded_intrinsic', and thus I thought it was testing 
that it calls sinf when a real(4) is passed and sin for a real(8) and 
that simdclones aren't used for the wrong overload. That doesn't quite 
explain why the pragma for sin(double) was added in the first place, 
since it wouldn't have been necessary, but then again neither are the cos 
and cosf ones.


Happy to put it back in and test that the 'masked' simdclone is used 
using some regexp too.


[PATCH] vect: allow using inbranch simdclones for masked loops

2023-11-02 Thread Andre Vieira (lists)

Hi,

In a previous patch I did most of the work for this, but forgot to 
change the check for number of arguments matching between call and 
simdclone.  This check should accept calls without a mask to be matched 
against simdclones with mask arguments.  I also added tests to verify 
this feature actually works.



For the simd-builtins tests I decided to remove the sin (double) 
simdclone, which would now be used because it was inbranch and we now 
enable the use of inbranch clones for notinbranch calls.  Given the nature 
of the test, removing it made more sense, but that's not a strong opinion; 
happy to change.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


OK for trunk?

PS: I'll be away for two weeks from tomorrow, it would be really nice if 
this can go in for gcc-14, otherwise the previous work I did for this 
won't have any actual visible effect :(



gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_simd_clone_call): Allow unmasked
calls to use masked simdclones.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-20.c: New file.
* gfortran.dg/simd-builtins-1.h: Adapt.
* gfortran.dg/simd-builtins-6.f90: Adapt.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c
new file mode 100644
index 
..9f51a68f3a0c8851af2cd26bd8235c771b851d7d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c
@@ -0,0 +1,87 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd --param vect-epilogues-nomask=0" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+/* Test that simd inbranch clones work correctly.  */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+/* A simple function that will be cloned.  */
+#pragma omp declare simd inbranch
+TYPE __attribute__((noinline))
+foo (TYPE a)
+{
+  return a + 1;
+}
+
+/* Check that "inbranch" clones are called correctly.  */
+
+void __attribute__((noipa))
+masked (TYPE * __restrict a, TYPE * __restrict b, int size)
+{
+  #pragma omp simd
+  for (int i = 0; i < size; i++)
+b[i] = foo(a[i]);
+}
+
+/* Check that "inbranch" works when there might be unrolling.  */
+
+void __attribute__((noipa))
+masked_fixed (TYPE * __restrict a, TYPE * __restrict b)
+{
+  #pragma omp simd
+  for (int i = 0; i < 128; i++)
+b[i] = foo(a[i]);
+}
+
+/* Validate the outputs.  */
+
+void
+check_masked (TYPE *b, int size)
+{
+  for (int i = 0; i < size; i++)
+if (b[i] != (TYPE)(i + 1))
+  {
+   __builtin_printf ("error at %d\n", i);
+   __builtin_exit (1);
+  }
+}
+
+int
+main ()
+{
+  TYPE a[1024];
+  TYPE b[1024];
+
+  for (int i = 0; i < 1024; i++)
+a[i] = i;
+
+  masked_fixed (a, b);
+  check_masked (b, 128);
+
+  /* Test various sizes to cover machines with different vectorization
+ factors.  */
+  for (int size = 8; size <= 1024; size *= 2)
+{
+  masked (a, b, size);
+  check_masked (b, size);
+}
+
+  /* Test sizes that might exercise the partial vector code-path.  */
+  for (int size = 8; size <= 1024; size *= 2)
+{
+  masked (a, b, size-4);
+  check_masked (b, size-4);
+}
+
+  return 0;
+}
+
+/* Ensure the the in-branch simd clones are used on targets that support them. 
 */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { aarch64*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { x86_64*-*-* } } } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gfortran.dg/simd-builtins-1.h 
b/gcc/testsuite/gfortran.dg/simd-builtins-1.h
index 
88d555cf41ad065ea525a63d7c05d15d3e5b54ed..08b73514a67d5791d35203530d039741946e9dcc
 100644
--- a/gcc/testsuite/gfortran.dg/simd-builtins-1.h
+++ b/gcc/testsuite/gfortran.dg/simd-builtins-1.h
@@ -1,4 +1,3 @@
-!GCC$ builtin (sin) attributes simd (inbranch)
 !GCC$ builtin (sinf) attributes simd (notinbranch)
 !GCC$ builtin (cosf) attributes simd
 !GCC$ builtin (cosf) attributes simd (notinbranch)
diff --git a/gcc/testsuite/gfortran.dg/simd-builtins-6.f90 
b/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
index 
60bcac78f3e0cc492930f3eb73cf97065312dc1c..2c68f9f1818a35674a0aef15793aa312a48199a8
 100644
--- a/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
+++ b/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
@@ -2,7 +2,6 @@
 ! { dg-additional-options "-nostdinc -Ofast -fdump-tree-optimized" }
 ! { dg-additional-options "-msse2 -mno-avx" { target i?86-*-linux* 
x86_64-*-linux* } }
 
-!GCC$ builtin (sin) attributes simd (inbranch)
 !GCC$ builtin (sinf) attributes simd (notinbranch)
 !GCC$ builtin (cosf) attributes simd
 !GCC$ builtin (cosf) attributes simd (notinbranch)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 

Re: [PATCH6/8] omp: Reorder call for TARGET_SIMD_CLONE_ADJUST (was Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM)

2023-10-30 Thread Andre Vieira (lists)

Hi Richi,

Friendly ping on this. I'm going away for two weeks end of this week, so 
I won't be here for end of stage-1, but I'd still very much like to get 
this done for GCC 14.


I don't know if you had a chance to look at this yet when you reviewed 
the other patches or if you maybe just missed it? A quick tl;dr: this 
moves the TARGET_SIMD_CLONE_ADJUST call to after we've vectorized 
the types in simdclones, to avoid having to add the extra target hooks to 
change the types.  This required some moving around of the code that 
constructed the adjustments and the code that constructed the array for 
the return value.


Kind regards,
Andre

On 18/10/2023 15:41, Andre Vieira (lists) wrote:
This patch moves the call to TARGET_SIMD_CLONE_ADJUST until after the 
arguments and return types have been transformed into vector types.  It 
also constructs the adjustments and retval modifications after this call, 
allowing targets to alter the types of the arguments and return of the 
clone prior to the modifications to the function definition.


Is this OK?

gcc/ChangeLog:

     * omp-simd-clone.cc (simd_clone_adjust_return_type): Hoist out
     code to create return array and don't return new type.
     (simd_clone_adjust_argument_types): Hoist out code that creates
     ipa_param_body_adjustments and don't return them.
     (simd_clone_adjust): Call TARGET_SIMD_CLONE_ADJUST after return
     and argument types have been vectorized, create adjustments and
     return array after the hook.
     (expand_simd_clones): Call TARGET_SIMD_CLONE_ADJUST after return
     and argument types have been vectorized.

On 04/10/2023 13:40, Andre Vieira (lists) wrote:



On 04/10/2023 11:41, Richard Biener wrote:

On Wed, 4 Oct 2023, Andre Vieira (lists) wrote:




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:

This patch adds a new target hook to enable us to adapt the types 
of return
and parameters of simd clones.  We use this in two ways, the first 
one is

to
make sure we can create valid SVE types, including the SVE type 
attribute,
when creating a SVE simd clone, even when the target options do 
not support
SVE.  We are following the same behaviour seen with x86 that 
creates simd
clones according to the ABI rules when no simdlen is provided, 
even if that
simdlen is not supported by the current target options.  Note that 
this

doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by 
the time we
call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not 
vector.  The

same is true for the return type one.

Also the changes to the types need to be taken into consideration in
'adjustments' I think.


Nothing in the three existing implementations of 
TARGET_SIMD_CLONE_ADJUST

relies on this ordering I think, how about moving the hook invocation
after simd_clone_adjust_argument_types?



But that wouldn't change the 'ipa_param_body_adjustments' for when we 
have a function definition and we need to redo the body.

Richard.

PS: I hope the subject line survived, my email client is having a 
bit of a

wobble this morning... it's what you get for updating software :(


Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-10-23 Thread Andre Vieira (lists)
Ping for Jeff or another global maintainer to review the target agnostic 
bits of this, that's:

loop-doloop.cc
df-core.{c,h}

I do have a nitpick myself that I missed last time around:
  /* We expect the condition to be of the form (reg != 0)  */
  cond = XEXP (SET_SRC (cmp), 0);
- if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+ if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE)
+ || XEXP (cond, 1) != const0_rtx)
return 0;
}
Could do with updating the comment to reflect allowing >= now. But happy 
for you to change this once approved by a maintainer.


Kind regards,
Andre

On 11/10/2023 12:34, Stamatis Markianos-Wright wrote:

Hi all,

On 28/09/2023 13:51, Andre Vieira (lists) wrote:

Hi,

On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote:

Hi Stam,





The arm parts look sensible but we'd need review for the df-core.h 
and df-core.cc changes.

Maybe Jeff can help or can recommend someone to take a look?


Just thought I'd do a follow-up "ping" on this :)



Thanks,
Kyrill



FWIW the changes LGTM, if we don't want these in df-core we can always 
implement the extra utility locally. It's really just a helper 
function to check if df_bb_regno_first_def_find and 
df_bb_regno_last_def_find yield the same result, meaning we only have 
a single definition.
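
For concreteness, a minimal sketch of that helper (name and exact shape 
assumed; the posted patch may differ):

  /* Return the single definition of REGNO in BB, or NULL if REGNO is
     defined more than once (or not at all) in BB.  Built purely on the
     two existing df-core queries mentioned above.  */
  static df_ref
  df_bb_regno_single_def_find (basic_block bb, unsigned int regno)
  {
    df_ref first = df_bb_regno_first_def_find (bb, regno);
    df_ref last = df_bb_regno_last_def_find (bb, regno);
    if (!first || first != last)
      return NULL;
    return first;
  }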


Kind regards,
Andre


Thanks,

Stam



Re: [PATCH] ifcvt: Don't lower bitfields with non-constant offsets [PR 111882]

2023-10-20 Thread Andre Vieira (lists)




On 20/10/2023 14:41, Richard Biener wrote:

On Fri, 20 Oct 2023, Andre Vieira (lists) wrote:


Hi,

This patch stops lowering of bitfields by ifcvt when they have non-constant
offsets as we are not likely to be able to do anything useful with those
during
vectorization.  That also fixes the issue reported in PR 111882, which was
being caused by an offset with a side-effect being lowered, but constants have
no side-effects so we will no longer run into that problem.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

OK for trunk?


+  if (!TREE_CONSTANT (DECL_FIELD_OFFSET (rep_decl))
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (rep_decl))
+  || !TREE_CONSTANT (ref_offset)
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (field_decl)))
+return NULL_TREE;

DECL_FIELD_BIT_OFFSET is always constant.  Please test
TREE_CODE (..) == INTEGER_CST instead of TREE_CONSTANT.


OK with those changes.
After I sent it I realized it would've been nicer to add a diagnostic, 
you OK with:

+  if (dump_file && (dump_flags & TDF_DETAILS))
+   fprintf (dump_file, "\t Bitfield NOT OK to lower,"
+   " offset is non-constant.\n");


Richard.



gcc/ChangeLog:

PR tree-optimization/111882
* tree-if-conv.cc (get_bitfield_rep): Return NULL_TREE for bitfields
with
non-constant offsets.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr111882.c: New test.





[PATCH] ifcvt: Don't lower bitfields with non-constant offsets [PR 111882]

2023-10-20 Thread Andre Vieira (lists)

Hi,

This patch stops lowering of bitfields by ifcvt when they have non-constant
offsets as we are not likely to be able to do anything useful with those 
during

vectorization.  That also fixes the issue reported in PR 111882, which was
being caused by an offset with a side-effect being lowered, but 
constants have

no side-effects so we will no longer run into that problem.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

OK for trunk?

gcc/ChangeLog:

PR tree-optimization/111882
* tree-if-conv.cc (get_bitfield_rep): Return NULL_TREE for bitfields 
with
non-constant offsets.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr111882.c: New test.

diff --git a/gcc/testsuite/gcc.dg/vect/pr111882.c 
b/gcc/testsuite/gcc.dg/vect/pr111882.c
new file mode 100644
index 
..024ad57b6930dd1f516e9c8127e2b440016360c6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr111882.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options { -fdump-tree-ifcvt-all } } */
+
+static void __attribute__((noipa)) f(int n) {
+  int i, j;
+  struct S { char d[n]; int a; int b : 17; int c : 12; };
+  struct S A[100][];
+  for (i = 0; i < 100; i++) {
asm volatile("" : : "g"(&A[0][0]) : "memory");
+for (j = 0; j < ; j++) A[i][j].b = 2;
+  }
+}
+void g(void) { f(1); }
+
+/* { dg-final { scan-tree-dump-not "Bitfield OK to lower" "ifcvt" } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 
dab7eeb7707ae8f1f342a571f8e5c99e0ef39309..66f3c882cb688049224b344b81637e7b1ae7e36c
 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -3495,6 +3495,7 @@ get_bitfield_rep (gassign *stmt, bool write, tree *bitpos,
: gimple_assign_rhs1 (stmt);
 
   tree field_decl = TREE_OPERAND (comp_ref, 1);
+  tree ref_offset = component_ref_field_offset (comp_ref);
   tree rep_decl = DECL_BIT_FIELD_REPRESENTATIVE (field_decl);
 
   /* Bail out if the representative is not a suitable type for a scalar
@@ -3509,6 +3510,12 @@ get_bitfield_rep (gassign *stmt, bool write, tree 
*bitpos,
   if (compare_tree_int (DECL_SIZE (field_decl), bf_prec) != 0)
 return NULL_TREE;
 
+  if (!TREE_CONSTANT (DECL_FIELD_OFFSET (rep_decl))
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (rep_decl))
+  || !TREE_CONSTANT (ref_offset)
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (field_decl)))
+return NULL_TREE;
+
   if (struct_expr)
 *struct_expr = TREE_OPERAND (comp_ref, 0);
 
@@ -3529,7 +3536,7 @@ get_bitfield_rep (gassign *stmt, bool write, tree *bitpos,
 the structure and the container from the number of bits from the start
 of the structure and the actual bitfield member. */
   tree bf_pos = fold_build2 (MULT_EXPR, bitsizetype,
-DECL_FIELD_OFFSET (field_decl),
+ref_offset,
 build_int_cst (bitsizetype, BITS_PER_UNIT));
   bf_pos = fold_build2 (PLUS_EXPR, bitsizetype, bf_pos,
DECL_FIELD_BIT_OFFSET (field_decl));


[PATCH6/8] omp: Reorder call for TARGET_SIMD_CLONE_ADJUST (was Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM)

2023-10-18 Thread Andre Vieira (lists)
This patch moves the call to TARGET_SIMD_CLONE_ADJUST until after the 
arguments and return types have been transformed into vector types.  It 
also constructs the adjustments and retval modifications after this call, 
allowing targets to alter the types of the arguments and return of the 
clone prior to the modifications to the function definition.


Is this OK?

gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_adjust_return_type): Hoist out
code to create return array and don't return new type.
(simd_clone_adjust_argument_types): Hoist out code that creates
ipa_param_body_adjustments and don't return them.
(simd_clone_adjust): Call TARGET_SIMD_CLONE_ADJUST after return
and argument types have been vectorized, create adjustments and
return array after the hook.
(expand_simd_clones): Call TARGET_SIMD_CLONE_ADJUST after return
and argument types have been vectorized.

On 04/10/2023 13:40, Andre Vieira (lists) wrote:



On 04/10/2023 11:41, Richard Biener wrote:

On Wed, 4 Oct 2023, Andre Vieira (lists) wrote:




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:

This patch adds a new target hook to enable us to adapt the types 
of return
and parameters of simd clones.  We use this in two ways, the first 
one is

to
make sure we can create valid SVE types, including the SVE type 
attribute,
when creating a SVE simd clone, even when the target options do not 
support
SVE.  We are following the same behaviour seen with x86 that 
creates simd
clones according to the ABI rules when no simdlen is provided, even 
if that
simdlen is not supported by the current target options.  Note that 
this

doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the 
time we
call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not 
vector.  The

same is true for the return type one.

Also the changes to the types need to be taken into consideration in
'adjustments' I think.


Nothing in the three existing implementations of TARGET_SIMD_CLONE_ADJUST
relies on this ordering I think, how about moving the hook invocation
after simd_clone_adjust_argument_types?



But that wouldn't change the 'ipa_param_body_adjustments' for when we 
have a function definition and we need to redo the body.

Richard.

PS: I hope the subject line survived, my email client is having a bit 
of a

wobble this morning... it's what you get for updating software :(

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
ef0b9b48c7212900023bc0eaebca5e1f9389db77..fb80888190c88e29895ecfbbe1b17d390c9a9dfe
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -701,10 +701,9 @@ simd_clone_create (struct cgraph_node *old_node, bool 
force_local)
 }
 
 /* Adjust the return type of the given function to its appropriate
-   vector counterpart.  Returns a simd array to be used throughout the
-   function as a return value.  */
+   vector counterpart.  */
 
-static tree
+static void
 simd_clone_adjust_return_type (struct cgraph_node *node)
 {
   tree fndecl = node->decl;
@@ -714,7 +713,7 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
 
   /* Adjust the function return type.  */
   if (orig_rettype == void_type_node)
-return NULL_TREE;
+return;
   t = TREE_TYPE (TREE_TYPE (fndecl));
   if (INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
 veclen = node->simdclone->vecsize_int;
@@ -737,24 +736,6 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
veclen));
 }
   TREE_TYPE (TREE_TYPE (fndecl)) = t;
-  if (!node->definition)
-return NULL_TREE;
-
-  t = DECL_RESULT (fndecl);
-  /* Adjust the DECL_RESULT.  */
-  gcc_assert (TREE_TYPE (t) != void_type_node);
-  TREE_TYPE (t) = TREE_TYPE (TREE_TYPE (fndecl));
-  relayout_decl (t);
-
-  tree atype = build_array_type_nelts (orig_rettype,
-  node->simdclone->simdlen);
-  if (maybe_ne (veclen, node->simdclone->simdlen))
-return build1 (VIEW_CONVERT_EXPR, atype, t);
-
-  /* Set up a SIMD array to use as the return value.  */
-  tree retval = create_tmp_var_raw (atype, "retval");
-  gimple_add_tmp_var (retval);
-  return retval;
 }
 
 /* Each vector argument has a corresponding array to be used locally
@@ -788,7 +769,7 @@ create_tmp_simd_array (const char *prefix, tree type, 
poly_uint64 simdlen)
declarations will be remapped.  New arguments which are not to be remapped
are marked with USER_FLAG.  */
 
-static ipa_param_body_adjustments *
+static void
 simd_clone_adjust_argument_types (struct cgraph_node *node)
 {
   auto_vec args;
@@ -798,15 +779,11 @@ simd_clone_adjust_argument_types (

Re: [PATCH 8/8] aarch64: Add SVE support for simd clones [PR 96342]

2023-10-18 Thread Andre Vieira (lists)

Rebased, no major changes, still needs review.

On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:
This patch finalizes adding support for the generation of SVE simd 
clones when no simdlen is provided, following the ABI rules where the 
widest data type determines the minimum number of elements in a 
length-agnostic vector.
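
As a hypothetical example of that rule (illustration only, numbers assumed 
from the 128-bit SVE granule):

  /* The widest type below is double (64 bits), so a length-agnostic SVE
     vector holds at least 128/64 = 2 of them and the clone gets a poly
     simdlen of 2 x VL (mangled with an 'x' rather than a number, per the
     simd_clone_mangle change); narrower arguments like the int use the
     same element count.  */
  #pragma omp declare simd   /* no simdlen clause */
  double f (double x, int i);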


gcc/ChangeLog:

     * config/aarch64/aarch64-protos.h (add_sve_type_attribute): 
Declare.
 * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): 
Make

 visibility global.
 * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
 chosen over SIMD ABI if a SVE type is used in return or arguments.
 (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd 
clone

 when no simdlen is provided, according to ABI rules.
 (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
 (aarch64_simd_clone_adjust_ret_or_param): New.
 (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
 * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
 (simd_clone_adjust): Adapt safelen check to be compatible with VLA
 simdlen.

gcc/testsuite/ChangeLog:

 * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
 * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
 * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
 longer necessary.
 * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1005,6 +1005,8 @@ namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+ const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 
161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c
 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
mangling of the type.  ACLE_NAME is the  name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
 = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+= (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree 
fntype)
 static const predefined_function_abi &
 aarch64_fntype_abi (const_tree fntype)
 {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-return aarch64_simd_abi ();
-
   if (aarch64_returns_value_in_sve_regs_p (fntype)
   || aarch64_takes_arguments_in_sve_regs_p (fntype))
 return aarch64_sve_abi ();
 
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
+return aarch64_simd_abi ();
+
   return default_function_abi;
 }
 
@@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   int count;
   unsigned HOST_WIDE_INT const_simdlen;
 
@@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
 {
   nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+  wds_elt_bits = nds_elt_bits;
   vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
 }
   else
-nds_elt_bits = POINTER_SIZE;
+{
+  nds_elt_bits = POINTER_SIZE;
+  wds_elt_bits = 0;
+}
 
   int i;
   tree type_arg_types = T

Re: [PATCH 4/8] vect: don't allow fully masked loops with non-masked simd clones [PR 110485]

2023-10-18 Thread Andre Vieira (lists)
Rebased on top of trunk, minor change to check if loop_vinfo since we 
now do some slp vectorization for simd_clones.


I assume the previous OK still holds.

On 30/08/2023 13:54, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


When analyzing a loop and choosing a simdclone to use it is possible to choose
a simdclone that cannot be used 'inbranch' for a loop that can use partial
vectors.  This may lead to the vectorizer deciding to use partial vectors
which are not supported for notinbranch simd clones. This patch fixes that by
disabling the use of partial vectors once a notinbranch simd clone has been
selected.


OK.


gcc/ChangeLog:

PR tree-optimization/110485
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Disable partial
vectors usage if a notinbranch simdclone has been selected.

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr110485.c: New test.

diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c 
b/gcc/testsuite/gcc.dg/gomp/pr110485.c
new file mode 100644
index 
..ba6817a127f40246071e32ccebf692cc4d121d15
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
@@ -0,0 +1,19 @@
+/* PR 110485 */
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -fdump-tree-vect-details" } */
+/* { dg-additional-options "-march=znver4 --param=vect-partial-vector-usage=1" 
{ target x86_64-*-* } } */
+#pragma omp declare simd notinbranch uniform(p)
+extern double __attribute__ ((const)) bar (double a, double p);
+
+double a[1024];
+double b[1024];
+
+void foo (int n)
+{
+  #pragma omp simd
+  for (int i = 0; i < n; ++i)
+a[i] = bar (b[i], 71.2);
+}
+
+/* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a 
non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
a9156975d64c7a335ffd27614e87f9d11b23d1ba..731acc76350cae39c899a866584068cff247183a
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4539,6 +4539,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   ? boolean_true_node : boolean_false_node;
simd_clone_info.safe_push (sll);
  }
+
+  if (!bestn->simdclone->inbranch && loop_vinfo)
+   {
+ if (dump_enabled_p ()
+ && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+   dump_printf_loc (MSG_NOTE, vect_location,
+"can't use a fully-masked loop because a"
+" non-masked simd clone was selected.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+   }
+
   STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
 /*  vect_model_simple_cost (vinfo, stmt_info, ncopies,


[PATCH 0/8] omp: Replace simd_clone_subparts with TYPE_VECTOR_SUBPARTS

2023-10-18 Thread Andre Vieira (lists)


Refactor simd clone handling code ahead of support for poly simdlen.

gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_subparts): Remove.
(simd_clone_init_simd_arrays): Replace simd_clone_supbarts with
TYPE_VECTOR_SUBPARTS.
(ipa_simd_modify_function_body): Likewise.
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Likewise.
(simd_clone_subparts): Remove.
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
c1cb7cc8a5c770940bc2032f824e084b37e96dbe..a42643400ddcf10961633448b49d4caafb999f12
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -255,16 +255,6 @@ ok_for_auto_simd_clone (struct cgraph_node *node)
   return true;
 }
 
-
-/* Return the number of elements in vector type VECTYPE, which is associated
-   with a SIMD clone.  At present these always have a constant length.  */
-
-static unsigned HOST_WIDE_INT
-simd_clone_subparts (tree vectype)
-{
-  return TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
-}
-
 /* Allocate a fresh `simd_clone' and return it.  NARGS is the number
of arguments to reserve space for.  */
 
@@ -1028,7 +1018,7 @@ simd_clone_init_simd_arrays (struct cgraph_node *node,
}
  continue;
}
-  if (known_eq (simd_clone_subparts (TREE_TYPE (arg)),
+  if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg)),
node->simdclone->simdlen))
{
  tree ptype = build_pointer_type (TREE_TYPE (TREE_TYPE (array)));
@@ -1040,7 +1030,7 @@ simd_clone_init_simd_arrays (struct cgraph_node *node,
}
   else
{
- unsigned int simdlen = simd_clone_subparts (TREE_TYPE (arg));
+ poly_uint64 simdlen = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg));
  unsigned int times = vector_unroll_factor (node->simdclone->simdlen,
 simdlen);
  tree ptype = build_pointer_type (TREE_TYPE (TREE_TYPE (array)));
@@ -1226,9 +1216,9 @@ ipa_simd_modify_function_body (struct cgraph_node *node,
  iter, NULL_TREE, NULL_TREE);
   adjustments->register_replacement (&(*adjustments->m_adj_params)[j], r);
 
-  if (multiple_p (node->simdclone->simdlen, simd_clone_subparts (vectype)))
+  if (multiple_p (node->simdclone->simdlen, TYPE_VECTOR_SUBPARTS 
(vectype)))
j += vector_unroll_factor (node->simdclone->simdlen,
-  simd_clone_subparts (vectype)) - 1;
+  TYPE_VECTOR_SUBPARTS (vectype)) - 1;
 }
   adjustments->sort_replacements ();
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
9bb43e98f56d18929c9c02227954fdf38eafefd8..a9156975d64c7a335ffd27614e87f9d11b23d1ba
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4126,16 +4126,6 @@ vect_simd_lane_linear (tree op, class loop *loop,
 }
 }
 
-/* Return the number of elements in vector type VECTYPE, which is associated
-   with a SIMD clone.  At present these vectors always have a constant
-   length.  */
-
-static unsigned HOST_WIDE_INT
-simd_clone_subparts (tree vectype)
-{
-  return TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
-}
-
 /* Function vectorizable_simd_clone_call.
 
Check if STMT_INFO performs a function call that can be vectorized
@@ -4429,7 +4419,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
slp_node);
  if (arginfo[i].vectype == NULL
  || !constant_multiple_p (bestn->simdclone->simdlen,
-  simd_clone_subparts 
(arginfo[i].vectype)))
+  TYPE_VECTOR_SUBPARTS 
(arginfo[i].vectype)))
return false;
}
 
@@ -,10 +4434,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 
   if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
{
+ tree clone_arg_vectype = bestn->simdclone->args[i].vector_type;
  if (bestn->simdclone->mask_mode == VOIDmode)
{
- if (simd_clone_subparts (bestn->simdclone->args[i].vector_type)
- != simd_clone_subparts (arginfo[i].vectype))
+ if (maybe_ne (TYPE_VECTOR_SUBPARTS (clone_arg_vectype),
+   TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
{
  /* FORNOW we only have partial support for vector-type masks
 that can't hold all of simdlen. */
@@ -4464,7 +4455,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
  if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
  || maybe_ne (exact_div (bestn->simdclone->simdlen,
  num_mask_args),
-  simd_clone_subparts (arginfo[i].vectype)))
+  TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))

Re: [PATCH 5/8] vect: Use inbranch simdclones in masked loops

2023-10-18 Thread Andre Vieira (lists)

Rebased, needs review.

On 30/08/2023 10:13, Andre Vieira (lists) via Gcc-patches wrote:
This patch enables the compiler to use inbranch simdclones when 
generating masked loops in autovectorization.


gcc/ChangeLog:

 * omp-simd-clone.cc (simd_clone_adjust_argument_types): Make function
 compatible with mask parameters in clone.
 * tree-vect-stmts.cc (vect_convert): New helper function.
 (vect_build_all_ones_mask): Allow vector boolean typed masks.
 (vectorizable_simd_clone_call): Enable the use of masked clones in
 fully masked loops.

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
a42643400ddcf10961633448b49d4caafb999f12..ef0b9b48c7212900023bc0eaebca5e1f9389db77
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -807,8 +807,14 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
 {
   ipa_adjusted_param adj;
  memset (&adj, 0, sizeof (adj));
-  tree parm = args[i];
-  tree parm_type = node->definition ? TREE_TYPE (parm) : parm;
+  tree parm = NULL_TREE;
+  tree parm_type = NULL_TREE;
+  if(i < args.length())
+   {
+ parm = args[i];
+ parm_type = node->definition ? TREE_TYPE (parm) : parm;
+   }
+
   adj.base_index = i;
   adj.prev_clone_index = i;
 
@@ -1547,7 +1553,7 @@ simd_clone_adjust (struct cgraph_node *node)
  mask = gimple_assign_lhs (g);
  g = gimple_build_assign (make_ssa_name (TREE_TYPE (mask)),
   BIT_AND_EXPR, mask,
-  build_int_cst (TREE_TYPE (mask), 1));
+  build_one_cst (TREE_TYPE (mask)));
  gsi_insert_after (&gsi, g, GSI_CONTINUE_LINKING);
  mask = gimple_assign_lhs (g);
}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
731acc76350cae39c899a866584068cff247183a..6e2c70c1d3970af652c1e50e41b144162884bf24
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1594,6 +1594,20 @@ check_load_store_for_partial_vectors (loop_vec_info 
loop_vinfo, tree vectype,
 }
 }
 
+/* Return SSA name of the result of the conversion of OPERAND into type TYPE.
+   The conversion statement is inserted at GSI.  */
+
+static tree
+vect_convert (vec_info *vinfo, stmt_vec_info stmt_info, tree type, tree 
operand,
+ gimple_stmt_iterator *gsi)
+{
+  operand = build1 (VIEW_CONVERT_EXPR, type, operand);
+  gassign *new_stmt = gimple_build_assign (make_ssa_name (type),
+  operand);
+  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+  return gimple_get_lhs (new_stmt);
+}
+
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
that needs to be applied to all loads and stores in a vectorized loop.
@@ -2547,7 +2561,8 @@ vect_build_all_ones_mask (vec_info *vinfo,
 {
   if (TREE_CODE (masktype) == INTEGER_TYPE)
 return build_int_cst (masktype, -1);
-  else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+  else if (VECTOR_BOOLEAN_TYPE_P (masktype)
+  || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
 {
   tree mask = build_int_cst (TREE_TYPE (masktype), -1);
   mask = build_vector_from_val (masktype, mask);
@@ -4156,7 +4171,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   size_t i, nargs;
   tree lhs, rtype, ratype;
   vec *ret_ctor_elts = NULL;
-  int arg_offset = 0;
+  int masked_call_offset = 0;
 
   /* Is STMT a vectorizable call?   */
   gcall *stmt = dyn_cast  (stmt_info->stmt);
@@ -4171,7 +4186,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
   fndecl = TREE_OPERAND (fndecl, 0);
   gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
-  arg_offset = 1;
+  masked_call_offset = 1;
 }
   if (fndecl == NULL_TREE)
 return false;
@@ -4199,7 +4214,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 return false;
 
   /* Process function arguments.  */
-  nargs = gimple_call_num_args (stmt) - arg_offset;
+  nargs = gimple_call_num_args (stmt) - masked_call_offset;
 
   /* Bail out if the function has zero arguments.  */
   if (nargs == 0)
@@ -4221,7 +4236,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   thisarginfo.op = NULL_TREE;
   thisarginfo.simd_lane_linear = false;
 
-  int op_no = i + arg_offset;
+  int op_no = i + masked_call_offset;
   if (slp_node)
op_no = vect_slp_child_index_for_operand (stmt, op_no);
   if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
@@ -4303,16 +4318,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   arginfo.quick_push (thisarginfo);
 }
 
-  if (loop_vinfo
-  && !LOOP_VINFO_VEC

Re: [Patch 3/8] vect: Fix vect_get_smallest_scalar_type for simd clones

2023-10-18 Thread Andre Vieira (lists)

Made it a local function and changed prototype according to comments.

Is this OK?

 gcc/ChangeLog:
* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Special
case
simd clone calls and only use types that are mapped to vectors.
(simd_clone_call_p): New helper function.
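
A minimal sketch of what such a helper might look like after the prototype 
change discussed below (shape assumed; the MASK_CALL handling mentioned in 
the thread is omitted):

  /* If STMT is a call to a function that has simd clones, return its
     cgraph node, otherwise return NULL.  */
  static cgraph_node *
  simd_clone_call_p (gimple *stmt)
  {
    gcall *call = dyn_cast <gcall *> (stmt);
    if (!call)
      return NULL;
    tree fndecl = gimple_call_fndecl (call);
    if (!fndecl)
      return NULL;
    cgraph_node *node = cgraph_node::get (fndecl);
    if (node && node->simd_clones != NULL)
      return node;
    return NULL;
  }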

On 30/08/2023 13:54, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


The vect_get_smallest_scalar_type helper function was using any argument to a
simd clone call when trying to determine the smallest scalar type that would
be vectorized.  This included the function pointer type in a MASK_CALL for
instance, and would result in the wrong type being selected.  Instead this
patch special cases simd_clone_call's and uses only scalar types of the
original function that get transformed into vector types.


Looks sensible.

+bool
+simd_clone_call_p (gimple *stmt, cgraph_node **out_node)

you could return the cgraph_node * or NULL here.  Are you going to
use the function elsewhere?  Otherwise put it in the same TU as
the only use please and avoid exporting it.

Richard.


gcc/ChangeLog:

* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Special
case
simd clone calls and only use types that are mapped to vectors.
* tree-vect-stmts.cc (simd_clone_call_p): New helper function.
* tree-vectorizer.h (simd_clone_call_p): Declare new function.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-16f.c: Remove unnecessary differentiation
between targets with different pointer sizes.
* gcc.dg/vect/vect-simd-clone-17f.c: Likewise.
* gcc.dg/vect/vect-simd-clone-18f.c: Likewise.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
index 
574698d3e133ecb8700e698fa42a6b05dd6b8a18..7cd29e894d0502a59fadfe67db2db383133022d3
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-16.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
index 
8bb6d19301a67a3eebce522daaf7d54d88f708d7..177521dc44531479fca1f1a1a0f2010f30fa3fb5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-17.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
index 
d34f23f4db8e9c237558cc22fe66b7e02b9e6c20..4dd51381d73c0c7c8ec812f24e5054df038059c5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-18.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } 

Re: [Patch 2/8] parloops: Allow poly nit and bound

2023-10-18 Thread Andre Vieira (lists)

Posting the changed patch for completion, already reviewed.

On 30/08/2023 13:32, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


Teach parloops how to handle a poly nit and bound ahead of the changes to
enable non-constant simdlen.


Can you use poly_int_tree_p to combine INTEGER_CST || POLY_INT_CST please?

OK with that change.


gcc/ChangeLog:

* tree-parloops.cc (try_to_transform_to_exit_first_loop_alt): Accept
poly NIT and ALT_BOUND.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
a35f3d5023b06e5ef96eb4222488fcb34dd7bd45..80f3dd6dce281e1eb1d76d38bd09e6638a875142
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2531,14 +2531,15 @@ try_transform_to_exit_first_loop_alt (class loop *loop,
   tree nit_type = TREE_TYPE (nit);
 
   /* Figure out whether nit + 1 overflows.  */
-  if (TREE_CODE (nit) == INTEGER_CST)
+  if (poly_int_tree_p (nit))
 {
   if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
{
  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
   nit, build_one_cst (nit_type));
 
- gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+ gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST
+ || TREE_CODE (alt_bound) == POLY_INT_CST);
  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
  return true;
}


Re: [PATCH 1/8] parloops: Copy target and optimizations when creating a function clone

2023-10-18 Thread Andre Vieira (lists)

Just posting a rebase for completion.

On 30/08/2023 13:31, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:



SVE simd clones require to be compiled with a SVE target enabled or the
argument types will not be created properly. To achieve this we need to copy
DECL_FUNCTION_SPECIFIC_TARGET from the original function declaration to the
clones.  I decided it was probably also a good idea to copy
DECL_FUNCTION_SPECIFIC_OPTIMIZATION in case the original function is meant to
be compiled with specific optimization options.


OK.


gcc/ChangeLog:

* tree-parloops.cc (create_loop_fn): Copy specific target and
optimization options to clone.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
e495bbd65270bdf90bae2c4a2b52777522352a77..a35f3d5023b06e5ef96eb4222488fcb34dd7bd45
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2203,6 +2203,11 @@ create_loop_fn (location_t loc)
   DECL_CONTEXT (t) = decl;
   TREE_USED (t) = 1;
   DECL_ARGUMENTS (decl) = t;
+  DECL_FUNCTION_SPECIFIC_TARGET (decl)
+= DECL_FUNCTION_SPECIFIC_TARGET (act_cfun->decl);
+  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (decl)
+= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (act_cfun->decl);
+
 
   allocate_struct_function (decl, false);
 


Re: aarch64, vect, omp: Add SVE support for simd clones [PR 96342]

2023-10-18 Thread Andre Vieira (lists)

Hi,

I noticed I had missed one of the preparatory patches at the start of 
this series (the first one); it is added now. I also removed 'vect: Add 
vector_mode parameter to simd_clone_usable' since after review we no 
longer deemed it necessary, and replaced the old 'vect: Add 
TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM' with 'omp: Reorder call for 
TARGET_SIMD_CLONE_ADJUST' after comments.


Bootstrapped and regression tested the series on 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu.



Andre Vieira (8):

omp: Replace simd_clone_supbarts with TYPE_VECTOR_SUBPARTS [NEW]
parloops: Copy target and optimizations when creating a function clone 
[Reviewed]
parloops: Allow poly nit and bound [Cond Reviewed, made the requested 
changes]
vect: Fix vect_get_smallest_scalar_type for simd clones [First review, 
made the requested changes, OK?]
vect: don't allow fully masked loops with non-masked simd clones [PR 
110485] [Reviewed]

vect: Use inbranch simdclones in masked loops [Needs review]
vect: omp: Reorder call for TARGET_SIMD_CLONE_ADJUST [NEW]
aarch64: Add SVE support for simd clones [PR 96342] [Needs review]

PS: apologies for the inconsistent numbering of the emails, things got a 
bit confusing with removing and adding patches to the series.


On 30/08/2023 09:49, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

This patch series aims to implement support for SVE simd clones when not 
specifying a 'simdlen' clause for AArch64. This patch depends on my 
earlier patch: '[PATCH] aarch64: enable mixed-types for aarch64 
simdclones'.


Bootstrapped and regression tested the series on 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu. I also tried building 
the patches separately, but that was before some further clean-up 
restructuring, so will do that again prior to pushing.


Andre Vieira (8):

parloops: Copy target and optimizations when creating a function clone
parloops: Allow poly nit and bound
vect: Fix vect_get_smallest_scalar_type for simd clones
vect: don't allow fully masked loops with non-masked simd clones [PR 
110485]

vect: Use inbranch simdclones in masked loops
vect: Add vector_mode parameter to simd_clone_usable
vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
aarch64: Add SVE support for simd clones [PR 96342]


Re: Check that passes do not forget to define profile

2023-10-17 Thread Andre Vieira (lists)

So OK to commit this?

This patch makes sure the profile_count information is initialized for 
the new

bb created in move_sese_region_to_fn.

gcc/ChangeLog:

* tree-cfg.cc (move_sese_region_to_fn): Initialize profile_count for
new basic block.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


On 04/10/2023 12:02, Jan Hubicka wrote:

Hi Honza,

My current patch set for AArch64 VLA omp codegen started failing on
gcc.dg/gomp/pr87898.c after this. I traced it back to
'move_sese_region_to_fn' in tree-cfg.cc not setting count for the bb
created.

I was able to 'fix' it locally by setting the count of the new bb to the
accumulation of e->count () of all the entry edges (if initialized). I'm
however not even close to certain that's the right approach, attached patch
for illustration.

Kind regards,
Andre
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc



index 
ffab7518b1568b58e610e26feb9e3cab18ddb3c2..32fc47ae683164bf8fac477fbe6e2c998382e754
 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -8160,11 +8160,15 @@ move_sese_region_to_fn (struct function *dest_cfun, 
basic_block entry_bb,
bb = create_empty_bb (entry_pred[0]);
if (current_loops)
  add_bb_to_loop (bb, loop);
+  profile_count count = profile_count::zero ();
for (i = 0; i < num_entry_edges; i++)
  {
e = make_edge (entry_pred[i], bb, entry_flag[i]);
e->probability = entry_prob[i];
+  if (e->count ().initialized_p ())
+   count += e->count ();
  }
+  bb->count = count;


This looks generally right - if you create a BB you need to set its
count and unless it has self-loop that is the sum of counts of
incoming edges.

However the initialized_p check should be unnecessary: if one of entry
edges to BB is uninitialized, the + operation will make bb count
uninitialized too, which is OK.

Honza
  
for (i = 0; i < num_exit_edges; i++)

  {
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 
ffab7518b1568b58e610e26feb9e3cab18ddb3c2..ffeb20b717aead756844c5f48c2cc23f5e9f14a6
 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -8160,11 +8160,14 @@ move_sese_region_to_fn (struct function *dest_cfun, 
basic_block entry_bb,
   bb = create_empty_bb (entry_pred[0]);
   if (current_loops)
 add_bb_to_loop (bb, loop);
+  profile_count count = profile_count::zero ();
   for (i = 0; i < num_entry_edges; i++)
 {
   e = make_edge (entry_pred[i], bb, entry_flag[i]);
   e->probability = entry_prob[i];
+  count += e->count ();
 }
+  bb->count = count;
 
   for (i = 0; i < num_exit_edges; i++)
 {


Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-10-16 Thread Andre Vieira (lists)

Hey,

Just a minor update to the patch, I had missed the libgomp testsuite, so 
had to make some adjustments there too.


gcc/ChangeLog:

* config/aarch64/aarch64.cc (lane_size): New function.
(aarch64_simd_clone_compute_vecsize_and_simdlen): Determine 
simdlen according to NDS rule
and reject combination of simdlen and types that lead to 
vectors larger than 128bits.


gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add aarch64 targets to vect_simd_clones.
* c-c++-common/gomp/declare-variant-14.c: Adapt test for aarch64.
* c-c++-common/gomp/pr60823-1.c: Likewise.
* c-c++-common/gomp/pr60823-2.c: Likewise.
* c-c++-common/gomp/pr60823-3.c: Likewise.
* g++.dg/gomp/attrs-10.C: Likewise.
* g++.dg/gomp/declare-simd-1.C: Likewise.
* g++.dg/gomp/declare-simd-3.C: Likewise.
* g++.dg/gomp/declare-simd-4.C: Likewise.
* g++.dg/gomp/declare-simd-7.C: Likewise.
* g++.dg/gomp/declare-simd-8.C: Likewise.
* g++.dg/gomp/pr88182.C: Likewise.
* gcc.dg/declare-simd.c: Likewise.
* gcc.dg/gomp/declare-simd-1.c: Likewise.
* gcc.dg/gomp/declare-simd-3.c: Likewise.
* gcc.dg/gomp/pr87887-1.c: Likewise.
* gcc.dg/gomp/pr87895-1.c: Likewise.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/pr99542.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.
* gcc.dg/vect/vect-simd-clone-1.c: Likewise.
* gcc.dg/vect/vect-simd-clone-2.c: Likewise.
* gcc.dg/vect/vect-simd-clone-4.c: Likewise.
* gcc.dg/vect/vect-simd-clone-5.c: Likewise.
* gcc.dg/vect/vect-simd-clone-8.c: Likewise.
* gfortran.dg/gomp/declare-simd-2.f90: Likewise.
* gfortran.dg/gomp/declare-simd-coarray-lib.f90: Likewise.
* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
* gfortran.dg/gomp/pr79154-1.f90: Likewise.
* gfortran.dg/gomp/pr83977.f90: Likewise.

libgomp/testsuite/ChangeLog:

* libgomp.c/declare-variant-1.c: Adapt test for aarch64.
* libgomp.fortran/declare-simd-1.f90: Likewise.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
9fbfc548a891f5d11940c6fd3c49a14bfbdec886..37507f091c2a6154fa944c3a9fad6a655ab5d5a1
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27414,33 +27414,62 @@ supported_simd_type (tree t)
   return false;
 }
 
-/* Return true for types that currently are supported as SIMD return
-   or argument types.  */
+/* Determine the lane size for the clone argument/return type.  This follows
+   the LS(P) rule in the VFABIA64.  */
 
-static bool
-currently_supported_simd_type (tree t, tree b)
+static unsigned
+lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
 {
-  if (COMPLEX_FLOAT_TYPE_P (t))
-return false;
+  gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
 
-  if (TYPE_SIZE (t) != TYPE_SIZE (b))
-return false;
+  /* For non map-to-vector types that are pointers we use the element type it
+ points to.  */
+  if (POINTER_TYPE_P (type))
+switch (clone_arg_type)
+  {
+  default:
+   break;
+  case SIMD_CLONE_ARG_TYPE_UNIFORM:
+  case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
+  case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
+   type = TREE_TYPE (type);
+   break;
+  }
 
-  return supported_simd_type (t);
+  /* For types (or types pointers of non map-to-vector types point to) that are
+ integers or floating point, we use their size if they are 1, 2, 4 or 8.
+   */
+  if (INTEGRAL_TYPE_P (type)
+  || SCALAR_FLOAT_TYPE_P (type))
+  switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
+   {
+   default:
+ break;
+   case 1:
+   case 2:
+   case 4:
+   case 8:
+ return TYPE_PRECISION (type);
+   }
+  /* For any other we use the size of uintptr_t.  For map-to-vector types that
+ are pointers, using the size of uintptr_t is the same as using the size of
+ their type, seeing all pointers are the same size as uintptr_t.  */
+  return POINTER_SIZE;
 }
 
+
 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
 
 static int
 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
struct cgraph_simd_clone *clonei,
-   tree base_type, int num,
-   bool explicit_p)
+   tree base_type ATTRIBUTE_UNUSED,
+   int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int elt_bits, count;
+  unsigned int nds_elt_bits;
+  int count;
   unsigned HOST_WIDE_INT const_simdlen;
-  poly_uint64 vec_bits;
 
   if (!TARGET_SIMD)
 return 0;
@@ -27460,80 +27489,135 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
 }
 
   

Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM

2023-10-04 Thread Andre Vieira (lists)




On 04/10/2023 11:41, Richard Biener wrote:

On Wed, 4 Oct 2023, Andre Vieira (lists) wrote:




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


This patch adds a new target hook to enable us to adapt the types of return
and parameters of simd clones.  We use this in two ways, the first one is
to
make sure we can create valid SVE types, including the SVE type attribute,
when creating a SVE simd clone, even when the target options do not support
SVE.  We are following the same behaviour seen with x86 that creates simd
clones according to the ABI rules when no simdlen is provided, even if that
simdlen is not supported by the current target options.  Note that this
doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the time we
call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not vector.  The
same is true for the return type one.

Also the changes to the types need to be taken into consideration in
'adjustments' I think.


Nothing in the three existing implementations of TARGET_SIMD_CLONE_ADJUST
relies on this ordering I think, how about moving the hook invocation
after simd_clone_adjust_argument_types?



But that wouldn't change the 'ipa_param_body_adjustments' for when we 
have a function definition and we need to redo the body.

Richard.


PS: I hope the subject line survived, my email client is having a bit of a
wobble this morning... it's what you get for updating software :(


Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM

2023-10-04 Thread Andre Vieira (lists)




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


This patch adds a new target hook to enable us to adapt the types of return
and parameters of simd clones.  We use this in two ways, the first one is to
make sure we can create valid SVE types, including the SVE type attribute,
when creating a SVE simd clone, even when the target options do not support
SVE.  We are following the same behaviour seen with x86 that creates simd
clones according to the ABI rules when no simdlen is provided, even if that
simdlen is not supported by the current target options.  Note that this
doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the 
time we call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not 
vector.  The same is true for the return type one.


Also the changes to the types need to be taken into consideration in 
'adjustments' I think.


PS: I hope the subject line survived, my email client is having a bit of 
a wobble this morning... it's what you get for updating software :(


Re: Check that passes do not forget to define profile

2023-10-03 Thread Andre Vieira (lists)

Hi Honza,

My current patch set for AArch64 VLA omp codegen started failing on 
gcc.dg/gomp/pr87898.c after this. I traced it back to 
'move_sese_region_to_fn' in tree/cfg.cc not setting count for the bb 
created.


I was able to 'fix' it locally by setting the count of the new bb to the
accumulation of e->count () of all the entry_edges (if initialized).
I'm however not even close to certain that's the right approach, 
attached patch for illustration.
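
For illustration, this is roughly the shape of that local 'fix' (a sketch of
the idea only, not the attached patch; the exact spot in
move_sese_region_to_fn and iterating the entry edges via bb->preds are
assumptions):

  /* Give the newly created bb a count accumulated from its incoming
     edges, when all of them have an initialized count.  */
  profile_count cnt = profile_count::zero ();
  bool known = true;
  edge e;
  edge_iterator ei;
  FOR_EACH_EDGE (e, ei, bb->preds)
    {
      if (!e->count ().initialized_p ())
        known = false;
      else
        cnt += e->count ();
    }
  if (known)
    bb->count = cnt;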


Kind regards,
Andre

On 24/08/2023 14:14, Jan Hubicka via Gcc-patches wrote:

Hi,
this patch extends verifier to check that all probabilities and counts are
initialized if profile is supposed to be present.  This is a bit complicated
by the possibility that we inline !flag_guess_branch_probability function
into function with profile defined and in this case we need to stop
verification.  For this reason I added flag to cfg structure tracking this.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* cfg.h (struct control_flow_graph): New field full_profile.
* auto-profile.cc (afdo_annotate_cfg): Set full_profile to true.
* cfg.cc (init_flow): Set full_profile to false.
* graphite.cc (graphite_transform_loops): Set full_profile to false.
* lto-streamer-in.cc (input_cfg): Initialize full_profile flag.
* predict.cc (pass_profile::execute): Set full_profile to true.
* symtab-thunks.cc (expand_thunk): Set full_profile to true.
* tree-cfg.cc (gimple_verify_flow_info): Verify that profile is full
if full_profile is set.
* tree-inline.cc (initialize_cfun): Initialize full_profile.
(expand_call_inline): Combine full_profile.


diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index e3af3555e75..ff3b763945c 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -1578,6 +1578,7 @@ afdo_annotate_cfg (const stmt_set _stmts)
  }
update_max_bb_count ();
profile_status_for_fn (cfun) = PROFILE_READ;
+  cfun->cfg->full_profile = true;
if (flag_value_profile_transformations)
  {
gimple_value_profile_transformations ();
diff --git a/gcc/cfg.cc b/gcc/cfg.cc
index 9eb9916f61a..b7865f14e7f 100644
--- a/gcc/cfg.cc
+++ b/gcc/cfg.cc
@@ -81,6 +81,7 @@ init_flow (struct function *the_fun)
  = ENTRY_BLOCK_PTR_FOR_FN (the_fun);
the_fun->cfg->edge_flags_allocated = EDGE_ALL_FLAGS;
the_fun->cfg->bb_flags_allocated = BB_ALL_FLAGS;
+  the_fun->cfg->full_profile = false;
  }
  
  /* Helper function for remove_edge and free_cffg.  Frees edge structure
diff --git a/gcc/cfg.h b/gcc/cfg.h
index a0e944979c8..53e2553012c 100644
--- a/gcc/cfg.h
+++ b/gcc/cfg.h
@@ -78,6 +78,9 @@ struct GTY(()) control_flow_graph {
/* Dynamically allocated edge/bb flags.  */
int edge_flags_allocated;
int bb_flags_allocated;
+
+  /* Set if the profile is computed on every edge and basic block.  */
+  bool full_profile;
  };
  
  
diff --git a/gcc/graphite.cc b/gcc/graphite.cc

index 19f8975ffa2..2b387d5b016 100644
--- a/gcc/graphite.cc
+++ b/gcc/graphite.cc
@@ -512,6 +512,8 @@ graphite_transform_loops (void)
  
if (changed)

  {
+  /* FIXME: Graphite does not update profile meaningfully currently.  */
+  cfun->cfg->full_profile = false;
cleanup_tree_cfg ();
profile_status_for_fn (cfun) = PROFILE_ABSENT;
release_recorded_exits (cfun);
diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc
index 0cce14414ca..d3128fcebe4 100644
--- a/gcc/lto-streamer-in.cc
+++ b/gcc/lto-streamer-in.cc
@@ -1030,6 +1030,7 @@ input_cfg (class lto_input_block *ib, class data_in 
*data_in,
basic_block p_bb;
unsigned int i;
int index;
+  bool full_profile = false;
  
init_empty_tree_cfg_for_function (fn);
  
@@ -1071,6 +1072,8 @@ input_cfg (class lto_input_block *ib, class data_in *data_in,

  data_in->location_cache.input_location_and_block (>goto_locus,
, ib, data_in);
  e->probability = profile_probability::stream_in (ib);
+ if (!e->probability.initialized_p ())
+   full_profile = false;
  
  	}
  
@@ -1145,6 +1148,7 @@ input_cfg (class lto_input_block *ib, class data_in *data_in,
  
/* Rebuild the loop tree.  */

flow_loops_find (loops);
+  cfun->cfg->full_profile = full_profile;
  }
  
  
diff --git a/gcc/predict.cc b/gcc/predict.cc

index 5a1a561cc24..396746cbfd1 100644
--- a/gcc/predict.cc
+++ b/gcc/predict.cc
@@ -4131,6 +4131,7 @@ pass_profile::execute (function *fun)
  scev_initialize ();
  
tree_estimate_probability (false);

+  cfun->cfg->full_profile = true;
  
if (nb_loops > 1)

  scev_finalize ();
diff --git a/gcc/symtab-thunks.cc b/gcc/symtab-thunks.cc
index 4c04235c41b..23ead0d2138 100644
--- a/gcc/symtab-thunks.cc
+++ b/gcc/symtab-thunks.cc
@@ -648,6 +648,7 @@ expand_thunk (cgraph_node *node, bool output_asm_thunks,
  ? PROFILE_READ : PROFILE_GUESSED;
/* FIXME: C++ FE 

Re: [PATCH 6/8] vect: Add vector_mode parameter to simd_clone_usable

2023-09-28 Thread Andre Vieira (lists)




On 31/08/2023 07:39, Richard Biener wrote:

On Wed, Aug 30, 2023 at 5:02 PM Andre Vieira (lists)
 wrote:




On 30/08/2023 14:01, Richard Biener wrote:

On Wed, Aug 30, 2023 at 11:15 AM Andre Vieira (lists) via Gcc-patches
 wrote:


This patch adds a machine_mode parameter to the TARGET_SIMD_CLONE_USABLE
hook to enable rejecting SVE modes when the target architecture does not
support SVE.


How does the graph node of the SIMD clone lack this information?  That is, it
should have information on the types (and thus modes) for all formal arguments
and return values already, no?  At least the target would know how to
instantiate
it if it's not readily available at the point of use.



Yes it does, but that's the modes the simd clone itself uses, it does
not know what vector_mode we are currently vectorizing for. Which is
exactly why we need the vinfo's vector_mode to make sure the simd clone
and its types are compatible with the vector mode.

In practice, to make sure that SVE simd clones are only used in loops
being vectorized for SVE modes. Having said that... I just realized that
the simdlen check already takes care of that currently...

by simdlen check I mean the one that writes off simdclones that match:
  if (!constant_multiple_p (vf, n->simdclone->simdlen, _calls)

However, when using -msve-vector-bits this will become an issue, as the
VF will be constant and we will match NEON simdclones.  This requires
some further attention though given that we now also reject the use of
SVE simdclones when using -msve-vector-bits, and I'm not entirely sure
we should...


Hmm, but vectorizable_simdclone should check for compatible types here
and if they are compatible why should we reject them?  Are -msve-vector-bits
"SVE" modes different from "NEON" modes?  I suppose not, because otherwise
the type compatibility check would say incompatible.

Prior to transformation we do all checks on the original scalar values, 
not the vector types. But I do believe you are right in that we don't 
need to pass the vector_mode. The simdlen check should be enough and if 
the length is the same or a multiple, the rest of the code should be
able to deal with that and any conversions when dealing with things like 
SVE types that require the attribute.


I'll update the patch series soon and after that I'll look at how this 
reacts to -msve-vector-bits in more detail.


Thanks,
Andre


Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-09-28 Thread Andre Vieira (lists)

Hi,

On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote:

Hi Stam,





The arm parts look sensible but we'd need review for the df-core.h and 
df-core.cc changes.
Maybe Jeff can help or can recommend someone to take a look?
Thanks,
Kyrill



FWIW the changes LGTM, if we don't want these in df-core we can always 
implement the extra utility locally. It's really just a helper function 
to check if df_bb_regno_first_def_find and df_bb_regno_last_def_find 
yield the same result, meaning we only have a single definition.
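
For illustration, such a local utility could look something like this (a
sketch of the helper as I described it; the name and placement are my own,
the actual patch may differ):

  /* Return the single definition of REGNO in BB, or NULL if REGNO is not
     defined in BB or is defined more than once.  */
  static df_ref
  df_bb_regno_single_def_find (basic_block bb, unsigned int regno)
  {
    df_ref first = df_bb_regno_first_def_find (bb, regno);
    df_ref last = df_bb_regno_last_def_find (bb, regno);
    return (first && first == last) ? first : NULL;
  }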


Kind regards,
Andre


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-27 Thread Andre Vieira (lists)



On 26/09/2023 17:37, Andrew Stubbs wrote:

I don't have authority to approve anything, but here's a review anyway.

Thanks for working on this.


Thank you for reviewing and apologies for the mess of a patch, may have 
rushed it ;)


diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c

new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273

--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+


Do you need -fopenmp-simd for this?

Nope, I keep forgetting that you only need it for pragmas.

Dealt with the other comments too.

Any thoughts on changing gimple_call_internal_fn  instead? My main 
argument against is that IFN_MASK_CALL should not appear outside of 
ifconvert and vectorizer. On the other hand, we may inspect the flags 
elsewhere in the vectorizer (now or in the future) and changing 
gimple_call_internal_fn would prevent the need to handle the IFN 
separately elsewhere.


Kind Regards,
Andre

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
new file mode 100644
index 
..e7ed56ca75470464307d0d266dacfa0d8d6e43c1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,22 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+
+int __attribute__ ((__simd__, const)) fn (int);
+
+void test (int * __restrict__ a, int * __restrict__ b, int n)
+{
+  for (int i = 0; i < n; ++i)
+{
+  int a_;
+  if (b[i] > 0)
+a_ = fn (b[i]);
+  else
+a_ = b[i] + 5;
+  a[i] = a_;
+}
+}
+
+/* { dg-final { scan-tree-dump-not {loop contains function calls or data 
references} "vect" } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 
6d3b7c2290e4db9c1168a4c763facb481157c97c..689aaeed72282bb0da2a17e19fb923a06e8d62fa
 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "vr-values.h"
 #include "range-op.h"
 #include "tree-ssa-loop-ivopts.h"
+#include "calls.h"
 
 static struct datadep_stats
 {
@@ -5816,6 +5817,15 @@ get_references_in_stmt (gimple *stmt, vec *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ break;
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl
+ || (flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }
break;
  default:
clobbers_memory = true;
@@ -5852,7 +5862,7 @@ get_references_in_stmt (gimple *stmt, vec *references)
 }
   else if (stmt_code == GIMPLE_CALL)
 {
-  unsigned i, n;
+  unsigned i = 0, n;
   tree ptr, type;
   unsigned int align;
 
@@ -5879,13 +5889,16 @@ get_references_in_stmt (gimple *stmt, vec *references)
   ptr);
references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
+   gcc_fallthrough ();
  default:
break;
  }
 
   op0 = gimple_call_lhs (stmt);
   n = gimple_call_num_args (stmt);
-  for (i = 0; i < n; i++)
+  for (; i < n; i++)
{
  op1 = gimple_call_arg (stmt, i);
 


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)




On 26/09/2023 21:26, Bernhard Reutner-Fischer wrote:

On 26 September 2023 18:46:11 CEST, Tobias Burnus  
wrote:

On 26.09.23 18:37, Andrew Stubbs wrote:

If the fall-through is deliberate please add a /* FALLTHROUGH */
comment (or whatever spelling disables the warning).


It's: gcc_fallthrough ();

Which gets converted to "__attribute__((fallthrough))"; it could also
expand to "[[fallthrough]]" but that's C++17 (and, also, an C23 feature
- albeit so far unimplemented in gcc).


OT
IIRC we do parse comments for a number of spellings of the hint by the user 
that the fallthrough is deliberate:

https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html

See the numerous levels of -Wimplicit-fallthrough=n, the default being 3.

---8<---
-Wimplicit-fallthrough=3 case sensitively matches one of the following regular 
expressions:
-fallthrough
@fallthrough@
lint -fallthrough[ \t]*
[ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?
FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*(Else,? |Intentional(ly)? )?
Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?
fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
---8<---

Just FWIW.
thanks,


I was surprised my bootstrap didn't catch this; I thought we generated
warnings in such cases, and bootstrap builds with -Werror, doesn't it?


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)




On 26/09/2023 17:48, Jakub Jelinek wrote:

On Tue, Sep 26, 2023 at 05:24:26PM +0100, Andre Vieira (lists) wrote:

@@ -5816,6 +5817,18 @@ get_references_in_stmt (gimple *stmt, vec *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl)
+   {
+ clobbers_memory = true;
+ break;
+   }
+ if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }


Should IFN_MASK_LOAD/STORE really go through this?  I thought those have
first argument address of the memory being conditionally loaded/stored, not
function address.


No it shouldn't, my bad...
Surprisingly, testing didn't catch it though; I'm guessing
gimple_call_addr_fndecl just returned null every time for those. I'll
clean it up.


[PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)
The const attribute is ignored when simdclones are used inbranch. This 
is due to the fact that when analyzing a MASK_CALL we were not looking 
at the targeted function for flags, but instead only at the internal 
function call itself.
This patch adds code to make sure we look at the target function to 
check for the const attribute and enables the autovectorization of 
inbranch const simdclones without needing the loop to be adorned with
the 'omp simd' pragma.


Not sure about how to add new includes to the ChangeLog. Which brings me 
to another point, I contemplated changing gimple_call_flags to do the 
checking of flags of the first argument of IFN_MASK_CALL itself rather 
than only calling internal_fn_flags on gimple_call_internal_fn (stmt), 
but that might be a bit too intrusive, opinions welcome :)
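
For the record, the more intrusive alternative I had in mind would look
something like the following (a sketch only; mask_call_flags is a
hypothetical helper, not something in this patch):

  /* Flags for an IFN_MASK_CALL, taken from the function the masked call
     targets rather than from the internal function itself.  */
  static int
  mask_call_flags (const gimple *stmt)
  {
    gcc_checking_assert (gimple_call_internal_p (stmt, IFN_MASK_CALL));
    tree fndecl = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
    if (fndecl)
      return flags_from_decl_or_type (fndecl);
    return internal_fn_flags (IFN_MASK_CALL);
  }

gimple_call_flags could then use this instead of internal_fn_flags for
IFN_MASK_CALL, at the cost of touching a fairly central gimple query.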


Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


Is this OK for trunk?

gcc/ChangeLog:

* tree-data-ref.cc: Include calls.h.
(get_references_in_stmt): Correctly handle IFN_MASK_CALL.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-19.c: New test.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+
+int __attribute__ ((__simd__, const)) fn (int);
+
+void test (int * __restrict__ a, int * __restrict__ b, int n)
+{
+  for (int i = 0; i < n; ++i)
+{
+  int a_;
+  if (b[i] > 0)
+a_ = fn (b[i]);
+  else
+a_ = b[i] + 5;
+  a[i] = a_;
+}
+}
+
+/* { dg-final { scan-tree-dump-not {loop contains function calls or data 
references} "vect" } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 
6d3b7c2290e4db9c1168a4c763facb481157c97c..2926c3925ee7897fef53c16cfd1d19d23dbf05f3
 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "vr-values.h"
 #include "range-op.h"
 #include "tree-ssa-loop-ivopts.h"
+#include "calls.h"
 
 static struct datadep_stats
 {
@@ -5816,6 +5817,18 @@ get_references_in_stmt (gimple *stmt, vec *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl)
+   {
+ clobbers_memory = true;
+ break;
+   }
+ if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }
break;
  default:
clobbers_memory = true;
@@ -5852,7 +5865,7 @@ get_references_in_stmt (gimple *stmt, vec *references)
 }
   else if (stmt_code == GIMPLE_CALL)
 {
-  unsigned i, n;
+  unsigned i  = 0, n;
   tree ptr, type;
   unsigned int align;
 
@@ -5879,13 +5892,15 @@ get_references_in_stmt (gimple *stmt, vec *references)
   ptr);
references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
  default:
break;
  }
 
   op0 = gimple_call_lhs (stmt);
   n = gimple_call_num_args (stmt);
-  for (i = 0; i < n; i++)
+  for (; i < n; i++)
{
  op1 = gimple_call_arg (stmt, i);
 


Re: [PATCH 6/8] vect: Add vector_mode parameter to simd_clone_usable

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches




On 30/08/2023 14:01, Richard Biener wrote:

On Wed, Aug 30, 2023 at 11:15 AM Andre Vieira (lists) via Gcc-patches
 wrote:


This patch adds a machine_mode parameter to the TARGET_SIMD_CLONE_USABLE
hook to enable rejecting SVE modes when the target architecture does not
support SVE.


How does the graph node of the SIMD clone lack this information?  That is, it
should have information on the types (and thus modes) for all formal arguments
and return values already, no?  At least the target would know how to
instantiate
it if it's not readily available at the point of use.



Yes it does, but that's the modes the simd clone itself uses, it does 
not know what vector_mode we are currently vectorizing for. Which is 
exactly why we need the vinfo's vector_mode to make sure the simd clone 
and its types are compatible with the vector mode.


In practice, to make sure that SVE simd clones are only used in loops 
being vectorized for SVE modes. Having said that... I just realized that 
the simdlen check already takes care of that currently...


by simdlen check I mean the one that writes off simdclones that match:
if (!constant_multiple_p (vf, n->simdclone->simdlen, _calls)

However, when using -msve-vector-bits this will become an issue, as the 
VF will be constant and we will match NEON simdclones.  This requires 
some further attention though given that we now also reject the use of 
SVE simdclones when using -msve-vector-bits, and I'm not entirely sure 
we should...


I'm going on holidays for 2 weeks now though, so I'll have a look at 
that scenario when I get back. Same with other feedback, didn't expect 
feedback this quickly ;) Thank you!!


Kind regards,
Andre



[PATCH 8/8] aarch64: Add SVE support for simd clones [PR 96342]

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches
This patch finalizes adding support for the generation of SVE simd 
clones when no simdlen is provided, following the ABI rules where the 
widest data type determines the minimum number of elements in a
length-agnostic vector.
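
As an illustration of that rule (my own example, and my reading of the
VFABIA64 mangling, not taken from the patch): for

  #pragma omp declare simd
  double foo (double x, float y);

the widest data type is 64 bits, so the length-agnostic clone operates on a
multiple of 128/64 = 2 elements per 128-bit granule and would be mangled
along the lines of _ZGVsMxvv_foo, with 'x' marking the scalable simdlen.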


gcc/ChangeLog:

* config/aarch64/aarch64-protos.h (add_sve_type_attribute): 
Declare.

* config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): Make
visibility global.
* config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
chosen over SIMD ABI if a SVE type is used in return or arguments.
(aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd clone
when no simdlen is provided, according to ABI rules.
(aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
(aarch64_simd_clone_adjust_ret_or_param): New.
(TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
* omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
(simd_clone_adjust): Adapt safelen check to be compatible with VLA
simdlen.

gcc/testsuite/ChangeLog:

* c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
* gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
longer necessary.
* gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
70303d6fd953e0c397b9138ede8858c2db2e53db..d7888c95a4999fad1a4c55d5cd2287c2040302c8
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1001,6 +1001,8 @@ namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+ const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 
161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c
 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
mangling of the type.  ACLE_NAME is the  name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
 = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+= (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
a13d3fba05f9f9d2989b36c681bc77d71e943e0d..492acb9ce081866162faa8dfca777e4cb943797f
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4034,13 +4034,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree 
fntype)
 static const predefined_function_abi &
 aarch64_fntype_abi (const_tree fntype)
 {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-return aarch64_simd_abi ();
-
   if (aarch64_returns_value_in_sve_regs_p (fntype)
   || aarch64_takes_arguments_in_sve_regs_p (fntype))
 return aarch64_sve_abi ();
 
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
+return aarch64_simd_abi ();
+
   return default_function_abi;
 }
 
@@ -27327,7 +27327,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   int count;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
@@ -27374,10 +27374,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
 {
   nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+  wds_elt_bits = nds_elt_bits;
   vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
 }
   else
-nds_elt_bits = POINTER_SIZE;
+{
+  nds_elt_bits = POINTER_SIZE;
+  wds_elt_bits = 0;
+}
 
   int i;
   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
@@ -27385,30 +27389,36 @@ 

[PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches
This patch adds a new target hook to enable us to adapt the types of 
return and parameters of simd clones.  We use this in two ways, the 
first one is to make sure we can create valid SVE types, including the 
SVE type attribute, when creating a SVE simd clone, even when the target 
options do not support SVE.  We are following the same behaviour seen 
with x86 that creates simd clones according to the ABI rules when no 
simdlen is provided, even if that simdlen is not supported by the 
current target options.  Note that this doesn't mean the simd clone will 
be used in auto-vectorization.


gcc/ChangeLog:

(TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
* doc/tm.texi (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Document.
* doc/tm.texi.in (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): New.
* omp-simd-clone.cc (simd_adjust_return_type): Call new hook.
(simd_clone_adjust_argument_types): Likewise.
* target.def (adjust_ret_or_param): New hook.
* targhooks.cc (default_simd_clone_adjust_ret_or_param): New.
* targhooks.h (default_simd_clone_adjust_ret_or_param): New.

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 
bde22e562ebb9069122eb3b142ab8f4a4ae56a3a..b80c09ec36d51f1bb55b14229f46207fb4457223
 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6343,6 +6343,9 @@ non-negative number if it is usable.  In that case, the 
smaller the number is,
 the more desirable it is to use it.
 @end deftypefn
 
+@deftypefn {Target Hook} tree TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM (struct 
cgraph_node *@var{}, @var{tree}, @var{bool})
+If defined, this hook should adjust the type of the return or parameter
+@var{type} to be used by the simd clone @var{node}.
 @end deftypefn
 
 @deftypefn {Target Hook} int TARGET_SIMT_VF (void)
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 
4ac96dc357d35e0e57bb43a41d1b1a4f66d05946..7496a32d84f7c422fe7ea88215ee72f3c354a3f4
 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4211,6 +4211,8 @@ address;  but often a machine-dependent strategy can 
generate better code.
 
 @hook TARGET_SIMD_CLONE_USABLE
 
+@hook TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
+
 @hook TARGET_SIMT_VF
 
 @hook TARGET_OMP_DEVICE_KIND_ARCH_ISA
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
ef0b9b48c7212900023bc0eaebca5e1f9389db77..c2fd4d3be878e56b6394e34097d2de826a0ba1ff
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -736,6 +736,7 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
   t = build_array_type_nelts (t, exact_div (node->simdclone->simdlen,
veclen));
 }
+  t = targetm.simd_clone.adjust_ret_or_param (node, t, false);
   TREE_TYPE (TREE_TYPE (fndecl)) = t;
   if (!node->definition)
 return NULL_TREE;
@@ -748,6 +749,7 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
 
   tree atype = build_array_type_nelts (orig_rettype,
   node->simdclone->simdlen);
+  atype = targetm.simd_clone.adjust_ret_or_param (node, atype, false);
   if (maybe_ne (veclen, node->simdclone->simdlen))
 return build1 (VIEW_CONVERT_EXPR, atype, t);
 
@@ -880,6 +882,8 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
   ? IDENTIFIER_POINTER (DECL_NAME (parm))
   : NULL, parm_type, sc->simdlen);
}
+  adj.type = targetm.simd_clone.adjust_ret_or_param (node, adj.type,
+false);
   vec_safe_push (new_params, adj);
 }
 
@@ -912,6 +916,8 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
adj.type = build_vector_type (pointer_sized_int_node, veclen);
   else
adj.type = build_vector_type (base_type, veclen);
+  adj.type = targetm.simd_clone.adjust_ret_or_param (node, adj.type,
+true);
   vec_safe_push (new_params, adj);
 
   k = vector_unroll_factor (sc->simdlen, veclen);
@@ -937,6 +943,7 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
sc->args[i].simd_array = NULL_TREE;
}
   sc->args[i].orig_type = base_type;
+  sc->args[i].vector_type = adj.type;
   sc->args[i].arg_type = SIMD_CLONE_ARG_TYPE_MASK;
   sc->args[i].vector_type = adj.type;
 }
diff --git a/gcc/target.def b/gcc/target.def
index 
6a0cbc454526ee29011451b570354bf234a4eabd..665083ce035da03b40b15f23684ccdacce33c9d3
 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1650,6 +1650,13 @@ non-negative number if it is usable.  In that case, the 
smaller the number is,\n
 the more desirable it is to use it.",
 int, (struct cgraph_node *, machine_mode), NULL)
 
+DEFHOOK
+(adjust_ret_or_param,
+"If defined, this hook should adjust the type of the return or parameter\n\
+@var{type} to be used by the simd clone @var{node}.",
+tree, (struct cgraph_node *, tree, 

Re: [PATCH 6/8] vect: Add vector_mode parameter to simd_clone_usable

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches

Forgot to CC this one to maintainers...

On 30/08/2023 10:14, Andre Vieira (lists) via Gcc-patches wrote:
This patch adds a machine_mode parameter to the TARGET_SIMD_CLONE_USABLE 
hook to enable rejecting SVE modes when the target architecture does not 
support SVE.


gcc/ChangeLog:

 * config/aarch64/aarch64.cc (aarch64_simd_clone_usable): Add mode
 parameter and use it to reject SVE modes when the target architecture does
 not support SVE.
 * config/gcn/gcn.cc (gcn_simd_clone_usable): Add unused mode 
parameter.

 * config/i386/i386.cc (ix86_simd_clone_usable): Likewise.
 * doc/tm.texi (TARGET_SIMD_CLONE_USABLE): Document new parameter.
 * target.def (usable): Add new parameter.
 * tree-vect-stmts.cc (vectorizable_simd_clone_call): Pass vector mode
 to TARGET_SIMD_CLONE_CALL hook.


[PATCH 6/8] vect: Add vector_mode parameter to simd_clone_usable

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches
This patch adds a machine_mode parameter to the TARGET_SIMD_CLONE_USABLE 
hook to enable rejecting SVE modes when the target architecture does not 
support SVE.


gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_simd_clone_usable): Add mode
parameter and use it to reject SVE modes when the target architecture does
not support SVE.
* config/gcn/gcn.cc (gcn_simd_clone_usable): Add unused mode parameter.
* config/i386/i386.cc (ix86_simd_clone_usable): Likewise.
* doc/tm.texi (TARGET_SIMD_CLONE_USABLE): Document new parameter.
* target.def (usable): Add new parameter.
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Pass vector mode
to TARGET_SIMD_CLONE_CALL hook.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
5fb4c863d875871d6de865e72ce360506a3694d2..a13d3fba05f9f9d2989b36c681bc77d71e943e0d
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27498,12 +27498,18 @@ aarch64_simd_clone_adjust (struct cgraph_node *node)
 /* Implement TARGET_SIMD_CLONE_USABLE.  */
 
 static int
-aarch64_simd_clone_usable (struct cgraph_node *node)
+aarch64_simd_clone_usable (struct cgraph_node *node, machine_mode vector_mode)
 {
   switch (node->simdclone->vecsize_mangle)
 {
 case 'n':
-  if (!TARGET_SIMD)
+  if (!TARGET_SIMD
+ || aarch64_sve_mode_p (vector_mode))
+   return -1;
+  return 0;
+case 's':
+  if (!TARGET_SVE
+ || !aarch64_sve_mode_p (vector_mode))
return -1;
   return 0;
 default:
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index 
02f4dedec4214b1eea9e6f5057ed57d7e0db316a..252676273f06500c99df6ae251f0406c618df891
 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -5599,7 +5599,8 @@ gcn_simd_clone_adjust (struct cgraph_node *ARG_UNUSED 
(node))
 /* Implement TARGET_SIMD_CLONE_USABLE.  */
 
 static int
-gcn_simd_clone_usable (struct cgraph_node *ARG_UNUSED (node))
+gcn_simd_clone_usable (struct cgraph_node *ARG_UNUSED (node),
+  machine_mode ARG_UNUSED (mode))
 {
   /* We don't need to do anything here because
  gcn_simd_clone_compute_vecsize_and_simdlen currently only returns one
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 
5d57726e22cea8bcaa8ac8b1b25ac420193f39bb..84f0d5a7cb679e6be92001f59802276635506e97
 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24379,7 +24379,8 @@ ix86_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
slightly less desirable, etc.).  */
 
 static int
-ix86_simd_clone_usable (struct cgraph_node *node)
+ix86_simd_clone_usable (struct cgraph_node *node,
+   machine_mode mode ATTRIBUTE_UNUSED)
 {
   switch (node->simdclone->vecsize_mangle)
 {
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 
95ba56e05ae4a0f11639cc4a21d6736c53ad5ef1..bde22e562ebb9069122eb3b142ab8f4a4ae56a3a
 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6336,11 +6336,13 @@ This hook should add implicit 
@code{attribute(target("..."))} attribute
 to SIMD clone @var{node} if needed.
 @end deftypefn
 
-@deftypefn {Target Hook} int TARGET_SIMD_CLONE_USABLE (struct cgraph_node 
*@var{})
+@deftypefn {Target Hook} int TARGET_SIMD_CLONE_USABLE (struct cgraph_node 
*@var{}, @var{machine_mode})
 This hook should return -1 if SIMD clone @var{node} shouldn't be used
-in vectorized loops in current function, or non-negative number if it is
-usable.  In that case, the smaller the number is, the more desirable it is
-to use it.
+in vectorized loops being vectorized with mode @var{m} in current function, or
+non-negative number if it is usable.  In that case, the smaller the number is,
+the more desirable it is to use it.
+@end deftypefn
+
 @end deftypefn
 
 @deftypefn {Target Hook} int TARGET_SIMT_VF (void)
diff --git a/gcc/target.def b/gcc/target.def
index 
7d684296c17897b4ceecb31c5de1ae8665a8228e..6a0cbc454526ee29011451b570354bf234a4eabd
 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1645,10 +1645,11 @@ void, (struct cgraph_node *), NULL)
 DEFHOOK
 (usable,
 "This hook should return -1 if SIMD clone @var{node} shouldn't be used\n\
-in vectorized loops in current function, or non-negative number if it is\n\
-usable.  In that case, the smaller the number is, the more desirable it is\n\
-to use it.",
-int, (struct cgraph_node *), NULL)
+in vectorized loops being vectorized with mode @var{m} in current function, 
or\n\
+non-negative number if it is usable.  In that case, the smaller the number 
is,\n\
+the more desirable it is to use it.",
+int, (struct cgraph_node *, machine_mode), NULL)
+
 
 HOOK_VECTOR_END (simd_clone)
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
7217f36a250d549b955c874d7c7644d94982b0b5..dc2fc20ef9fe777132308c9e33f7731d62717466
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4195,7 +4195,7 @@ 

[PATCH 5/8] vect: Use inbranch simdclones in masked loops

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches
This patch enables the compiler to use inbranch simdclones when 
generating masked loops in autovectorization.
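
As an illustration (my own example, not a testcase from the patch), with
this change a loop like the following can be vectorized as a fully-masked
loop, passing the loop mask as the mask argument of the inbranch clone:

  #pragma omp declare simd inbranch
  extern int f (int);

  void
  g (int *__restrict a, int *__restrict b, int n)
  {
    for (int i = 0; i < n; ++i)
      a[i] = f (b[i]);
  }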


gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_adjust_argument_types): Make function
compatible with mask parameters in clone.
* tree-vect-stmts.cc (vect_convert): New helper function.
(vect_build_all_ones_mask): Allow vector boolean typed masks.
(vectorizable_simd_clone_call): Enable the use of masked clones in
fully masked loops.

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
a42643400ddcf10961633448b49d4caafb999f12..ef0b9b48c7212900023bc0eaebca5e1f9389db77
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -807,8 +807,14 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
 {
   ipa_adjusted_param adj;
   memset (, 0, sizeof (adj));
-  tree parm = args[i];
-  tree parm_type = node->definition ? TREE_TYPE (parm) : parm;
+  tree parm = NULL_TREE;
+  tree parm_type = NULL_TREE;
+  if(i < args.length())
+   {
+ parm = args[i];
+ parm_type = node->definition ? TREE_TYPE (parm) : parm;
+   }
+
   adj.base_index = i;
   adj.prev_clone_index = i;
 
@@ -1547,7 +1553,7 @@ simd_clone_adjust (struct cgraph_node *node)
  mask = gimple_assign_lhs (g);
  g = gimple_build_assign (make_ssa_name (TREE_TYPE (mask)),
   BIT_AND_EXPR, mask,
-  build_int_cst (TREE_TYPE (mask), 1));
+  build_one_cst (TREE_TYPE (mask)));
  gsi_insert_after (, g, GSI_CONTINUE_LINKING);
  mask = gimple_assign_lhs (g);
}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
664c3b5f7ca48fdb49383fb8a97f407465574479..7217f36a250d549b955c874d7c7644d94982b0b5
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1723,6 +1723,20 @@ check_load_store_for_partial_vectors (loop_vec_info 
loop_vinfo, tree vectype,
 }
 }
 
+/* Return SSA name of the result of the conversion of OPERAND into type TYPE.
+   The conversion statement is inserted at GSI.  */
+
+static tree
+vect_convert (vec_info *vinfo, stmt_vec_info stmt_info, tree type, tree 
operand,
+ gimple_stmt_iterator *gsi)
+{
+  operand = build1 (VIEW_CONVERT_EXPR, type, operand);
+  gassign *new_stmt = gimple_build_assign (make_ssa_name (type),
+  operand);
+  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+  return gimple_get_lhs (new_stmt);
+}
+
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
that needs to be applied to all loads and stores in a vectorized loop.
@@ -2666,7 +2680,8 @@ vect_build_all_ones_mask (vec_info *vinfo,
 {
   if (TREE_CODE (masktype) == INTEGER_TYPE)
 return build_int_cst (masktype, -1);
-  else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+  else if (VECTOR_BOOLEAN_TYPE_P (masktype)
+  || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
 {
   tree mask = build_int_cst (TREE_TYPE (masktype), -1);
   mask = build_vector_from_val (masktype, mask);
@@ -4018,7 +4033,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   size_t i, nargs;
   tree lhs, rtype, ratype;
   vec *ret_ctor_elts = NULL;
-  int arg_offset = 0;
+  int masked_call_offset = 0;
 
   /* Is STMT a vectorizable call?   */
   gcall *stmt = dyn_cast  (stmt_info->stmt);
@@ -4033,7 +4048,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
   fndecl = TREE_OPERAND (fndecl, 0);
   gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
-  arg_offset = 1;
+  masked_call_offset = 1;
 }
   if (fndecl == NULL_TREE)
 return false;
@@ -4065,7 +4080,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 return false;
 
   /* Process function arguments.  */
-  nargs = gimple_call_num_args (stmt) - arg_offset;
+  nargs = gimple_call_num_args (stmt) - masked_call_offset;
 
   /* Bail out if the function has zero arguments.  */
   if (nargs == 0)
@@ -4083,7 +4098,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   thisarginfo.op = NULL_TREE;
   thisarginfo.simd_lane_linear = false;
 
-  op = gimple_call_arg (stmt, i + arg_offset);
+  op = gimple_call_arg (stmt, i + masked_call_offset);
   if (!vect_is_simple_use (op, vinfo, ,
   )
  || thisarginfo.dt == vect_uninitialized_def)
@@ -4161,14 +4176,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 }
 
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  if (!vf.is_constant ())
-{
-  if (dump_enabled_p ())
-   dump_printf_loc 

[PATCH 4/8] vect: don't allow fully masked loops with non-masked simd clones [PR 110485]

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches
When analyzing a loop and choosing a simdclone to use, it is possible to 
choose a simdclone that cannot be used 'inbranch' for a loop that can 
use partial vectors.  This may lead to the vectorizer deciding to use 
partial vectors which are not supported for notinbranch simd clones. 
This patch fixes that by disabling the use of partial vectors once a 
notinbranch simd clone has been selected.


gcc/ChangeLog:

PR tree-optimization/110485
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Disable partial
vectors usage if a notinbranch simdclone has been selected.

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr110485.c: New test.

diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c 
b/gcc/testsuite/gcc.dg/gomp/pr110485.c
new file mode 100644
index 
..ba6817a127f40246071e32ccebf692cc4d121d15
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
@@ -0,0 +1,19 @@
+/* PR 110485 */
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -fdump-tree-vect-details" } */
+/* { dg-additional-options "-march=znver4 --param=vect-partial-vector-usage=1" 
{ target x86_64-*-* } } */
+#pragma omp declare simd notinbranch uniform(p)
+extern double __attribute__ ((const)) bar (double a, double p);
+
+double a[1024];
+double b[1024];
+
+void foo (int n)
+{
+  #pragma omp simd
+  for (int i = 0; i < n; ++i)
+a[i] = bar (b[i], 71.2);
+}
+
+/* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a 
non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
35207de7acb410358220dbe8d1af82215b5091bf..664c3b5f7ca48fdb49383fb8a97f407465574479
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4349,6 +4349,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   ? boolean_true_node : boolean_false_node;
STMT_VINFO_SIMD_CLONE_INFO (stmt_info).safe_push (sll);
  }
+
+  if (!bestn->simdclone->inbranch)
+   {
+ if (dump_enabled_p ()
+ && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+   dump_printf_loc (MSG_NOTE, vect_location,
+"can't use a fully-masked loop because a"
+" non-masked simd clone was selected.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+   }
+
   STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
 /*  vect_model_simple_cost (vinfo, stmt_info, ncopies,


[Patch 3/8] vect: Fix vect_get_smallest_scalar_type for simd clones

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches
The vect_get_smallest_scalar_type helper function was using any argument 
to a simd clone call when trying to determine the smallest scalar type 
that would be vectorized.  This included the function pointer type in a 
MASK_CALL for instance, and would result in the wrong type being 
selected.  Instead this patch special-cases simd clone calls and uses 
only scalar types of the original function that get transformed into 
vector types.
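
For reference, the shape of such a helper (a sketch based on the description
above; the actual simd_clone_call_p in the patch may differ):

  /* Return true if STMT is a call, possibly through IFN_MASK_CALL, to a
     function that has simd clones, storing its cgraph node in *OUT_NODE.  */
  bool
  simd_clone_call_p (gimple *stmt, cgraph_node **out_node)
  {
    gcall *call = dyn_cast <gcall *> (stmt);
    if (!call)
      return false;
    tree fndecl = NULL_TREE;
    if (gimple_call_internal_p (call, IFN_MASK_CALL))
      fndecl = gimple_call_addr_fndecl (gimple_call_arg (call, 0));
    else
      fndecl = gimple_call_fndecl (call);
    if (fndecl == NULL_TREE)
      return false;
    cgraph_node *node = cgraph_node::get (fndecl);
    if (node == NULL || node->simd_clones == NULL)
      return false;
    *out_node = node;
    return true;
  }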


gcc/ChangeLog:

* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Special case
simd clone calls and only use types that are mapped to vectors.
* tree-vect-stmts.cc (simd_clone_call_p): New helper function.
* tree-vectorizer.h (simd_clone_call_p): Declare new function.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-16f.c: Remove unnecessary differentiation
between targets with different pointer sizes.
* gcc.dg/vect/vect-simd-clone-17f.c: Likewise.
* gcc.dg/vect/vect-simd-clone-18f.c: Likewise.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
index 
574698d3e133ecb8700e698fa42a6b05dd6b8a18..7cd29e894d0502a59fadfe67db2db383133022d3
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-16.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
index 
8bb6d19301a67a3eebce522daaf7d54d88f708d7..177521dc44531479fca1f1a1a0f2010f30fa3fb5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-17.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
index 
d34f23f4db8e9c237558cc22fe66b7e02b9e6c20..4dd51381d73c0c7c8ec812f24e5054df038059c5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-18.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 
a3570c45b5209281ac18c1220c3b95398487f389..1bdbea232afc6facddac23269ee3da033eb1ed50
 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -119,6 +119,7 @@ tree
 vect_get_smallest_scalar_type (stmt_vec_info stmt_info, tree scalar_type)
 {
   HOST_WIDE_INT lhs, rhs;
+  cgraph_node *node;
 
   /* During the analysis phase, this function is called on arbitrary
  statements that might not have scalar results.  */
@@ -145,6 +146,23 @@ vect_get_smallest_scalar_type (stmt_vec_info stmt_info, 
tree scalar_type)
scalar_type = rhs_type;
}
 }
+  else if (simd_clone_call_p (stmt_info->stmt, ))
+{
+  auto clone = node->simd_clones->simdclone;
+  for (unsigned int i = 0; i < clone->nargs; ++i)
+   {
+ if (clone->args[i].arg_type == 

[Patch 2/8] parloops: Allow poly nit and bound

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches
Teach parloops how to handle a poly nit and bound ahead of the changes
to enable non-constant simdlen.
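
For instance (my own illustration), with VLA vectors the number of
iterations can be a POLY_INT_CST such as 2 + 2x (x being the runtime
number of extra 128-bit granules) rather than an INTEGER_CST, and the
alt_bound computed as nit + 1 is then a POLY_INT_CST as well.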


gcc/ChangeLog:

* tree-parloops.cc (try_to_transform_to_exit_first_loop_alt): Accept
poly NIT and ALT_BOUND.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
a35f3d5023b06e5ef96eb4222488fcb34dd7bd45..cf713e53d712fb5ad050e274f373adba5a90c5a7
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2531,14 +2531,16 @@ try_transform_to_exit_first_loop_alt (class loop *loop,
   tree nit_type = TREE_TYPE (nit);
 
   /* Figure out whether nit + 1 overflows.  */
-  if (TREE_CODE (nit) == INTEGER_CST)
+  if (TREE_CODE (nit) == INTEGER_CST
+  || TREE_CODE (nit) == POLY_INT_CST)
 {
   if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
{
  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
   nit, build_one_cst (nit_type));
 
- gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+ gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST
+ || TREE_CODE (alt_bound) == POLY_INT_CST);
  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
  return true;
}


[PATCH 1/8] parloops: Copy target and optimizations when creating a function clone

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches


SVE simd clones need to be compiled with an SVE target enabled or the 
argument types will not be created properly. To achieve this we need to 
copy DECL_FUNCTION_SPECIFIC_TARGET from the original function 
declaration to the clones.  I decided it was probably also a good idea 
to copy DECL_FUNCTION_SPECIFIC_OPTIMIZATION in case the original 
function is meant to be compiled with specific optimization options.


gcc/ChangeLog:

* tree-parloops.cc (create_loop_fn): Copy specific target and
optimization options to clone.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
e495bbd65270bdf90bae2c4a2b52777522352a77..a35f3d5023b06e5ef96eb4222488fcb34dd7bd45
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2203,6 +2203,11 @@ create_loop_fn (location_t loc)
   DECL_CONTEXT (t) = decl;
   TREE_USED (t) = 1;
   DECL_ARGUMENTS (decl) = t;
+  DECL_FUNCTION_SPECIFIC_TARGET (decl)
+= DECL_FUNCTION_SPECIFIC_TARGET (act_cfun->decl);
+  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (decl)
+= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (act_cfun->decl);
+
 
   allocate_struct_function (decl, false);
 


aarch64, vect, omp: Add SVE support for simd clones [PR 96342]

2023-08-30 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch series aims to implement support for SVE simd clones when not 
specifying a 'simdlen' clause for AArch64. This patch depends on my 
earlier patch: '[PATCH] aarch64: enable mixed-types for aarch64 simdclones'.


Bootstrapped and regression tested the series on 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu. I also tried building 
the patches separately, but that was before some further clean-up 
restructuring, so will do that again prior to pushing.


Andre Vieira (8):

parloops: Copy target and optimizations when creating a function clone
parloops: Allow poly nit and bound
vect: Fix vect_get_smallest_scalar_type for simd clones
vect: don't allow fully masked loops with non-masked simd clones [PR 110485]
vect: Use inbranch simdclones in masked loops
vect: Add vector_mode parameter to simd_clone_usable
vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
aarch64: Add SVE support for simd clones [PR 96342]


Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-08-29 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch enables the use of mixed-types for simd clones for AArch64, 
adds aarch64 as a target_vect_simd_clones and corrects the way the 
simdlen is chosen for non-specified simdlen clauses according to the 
'Vector Function Application Binary Interface Specification for AArch64'.


Additionally, this patch restricts combinations of simdlen and 
return/argument types that map to vectors larger than 128 bits as we 
currently do not have a way to represent these types in a way that is 
consistent internally and externally.
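
For example (my own illustration of that restriction, not a testcase from
the patch), a declaration such as

  #pragma omp declare simd simdlen(8) notinbranch
  double bar (double x);

asks for 8 x 64-bit lanes, i.e. a 512-bit vector, which Advanced SIMD cannot
represent, so that simdlen/type combination is now rejected instead of
producing a clone we cannot represent consistently.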


As I was writing this patch I was also contemplating a refactor of the
compute_vecsize_and_simdlen target hook.  The current way it works, where
it is called once to get the 'count' and then 'count' times for each of
the respective simdlens, leads to the need to write this function in an
overly complex way.  I was thinking it would be nice to return 
either a vector of simdlens or perhaps a vector of some 'class' that can 
be extended per target. I was thinking something along the lines of:


class clone_config
{
  poly_uint64 simdlen;
  bool inbranch;
  char vecsize_mangle;
  poly_uint64 vecsize_int;
  poly_uint64 vecsize_float;
};

auto_vec clone_configs = 
targetm.simd_clone.compute_vecsize_and_simdlen (node, clone_info, 
base_type, explicit_p);


for (auto config : clone_configs)
{
 clone = simd_clone_struct_alloc (clone_info->nargs
  + ((i & 1) != 0));
 simd_clone_struct_copy (clone, clone_info);
 /* Undo changes targetm.simd_clone.compute_vecsize_and_simdlen
and simd_clone_adjust_argument_types did to the first
clone's info.

Andre: Not sure we'd still need this here...*/
 clone->nargs -= clone_info->inbranch;
 clone->simdlen = orig_simdlen;
 targetm.simd_clone.config_clone (node, clone, config);  <--- new
}


I didn't want to block this patch on that, so I've left it for now, 
@Jakub: what are your thoughts on this?



Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (lane_size): New function.
(aarch64_simd_clone_compute_vecsize_and_simdlen): Determine 
simdlen according to NDS rule
and reject combination of simdlen and types that lead to 
vectors larger than 128 bits.


gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add aarch64 targets to vect_simd_clones.
* c-c++-common/gomp/declare-variant-14.c: Adapt test for aarch64.
* c-c++-common/gomp/pr60823-1.c: Likewise.
* c-c++-common/gomp/pr60823-2.c: Likewise.
* c-c++-common/gomp/pr60823-3.c: Likewise.
* g++.dg/gomp/attrs-10.C: Likewise.
* g++.dg/gomp/declare-simd-1.C: Likewise.
* g++.dg/gomp/declare-simd-3.C: Likewise.
* g++.dg/gomp/declare-simd-4.C: Likewise.
* g++.dg/gomp/declare-simd-7.C: Likewise.
* g++.dg/gomp/declare-simd-8.C: Likewise.
* g++.dg/gomp/pr88182.C: Likewise.
* gcc.dg/declare-simd.c: Likewise.
* gcc.dg/gomp/declare-simd-1.c: Likewise.
* gcc.dg/gomp/declare-simd-3.c: Likewise.
* gcc.dg/gomp/pr87887-1.c: Likewise.
* gcc.dg/gomp/pr87895-1.c: Likewise.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/pr99542.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.
* gcc.dg/vect/vect-simd-clone-1.c: Likewise.
* gcc.dg/vect/vect-simd-clone-2.c: Likewise.
* gcc.dg/vect/vect-simd-clone-4.c: Likewise.
* gcc.dg/vect/vect-simd-clone-5.c: Likewise.
* gcc.dg/vect/vect-simd-clone-8.c: Likewise.
* gfortran.dg/gomp/declare-simd-2.f90: Likewise.
* gfortran.dg/gomp/declare-simd-coarray-lib.f90: Likewise.
* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
* gfortran.dg/gomp/pr79154-1.f90: Likewise.
* gfortran.dg/gomp/pr83977.f90: Likewise.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
7cd230c4602a15980016bdc92e80579be0c07094..5fb4c863d875871d6de865e72ce360506a3694d2
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27274,31 +27274,61 @@ supported_simd_type (tree t)
   return false;
 }
 
-/* Return true for types that currently are supported as SIMD return
-   or argument types.  */
+/* Determine the lane size for the clone argument/return type.  This follows
+   the LS(P) rule in the VFABIA64.  */
 
-static bool
-currently_supported_simd_type (tree t, tree b)
+static unsigned
+lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
 {
-  if (COMPLEX_FLOAT_TYPE_P (t))
-return false;
+  gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
 
-  if (TYPE_SIZE (t) != TYPE_SIZE (b))
-return false;
+  /* For non map-to-vector types that are pointers we use the element type it
+ points to.  */
+  if (POINTER_TYPE_P (type))
+switch (clone_arg_type)
+  {
+  default:
+   break;

Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-08-09 Thread Andre Vieira (lists) via Gcc-patches




On 09/08/2023 17:55, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:


On 08/08/2023 11:51, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:



warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
-   "unsupported return type %qT for % functions",
+   "unsupported return type %qT for simd",
ret_type);


What's the reason for s/%<simd%> functions/simd/, in particular for
dropping the quotes around simd?


It's to align with i386's error message; this helps with testing as I can
then avoid having different tests for the same error.

I asked Jakub which one he preferred, and he gave me an explanation of why
the i386 one was preferable, ... but unfortunately I didn't write it down.


Jakub: do you remember what the reason was?  I don't mind dropping
"function", but it feels weird to drop the quotes around "simd".
Seems like, if we do that, there'll one day be a patch to add
them back. :)


After some IRC scrolling, unfortunately my client doesn't have a fancy 
search :(


avieira> Andre Vieira
jakub: which one do you prefer?
1:59 PM
"unsupported argument type %qT for simd" (i386)
1:59 PM
 "unsupported argument type %qT for % functions", (aarch64)
1:59 PM
Gonna change one to be the same as the other ...
2:04 PM
→ gaius joined  ⇐ lh_ideapad, egallager and jwakely_ quit
2:36 PM 
I'd just go with for simd; % functions isn't an established term, 
it would be either % functions, but we have also simd 
attribute...


Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-08-09 Thread Andre Vieira (lists) via Gcc-patches

Here is my new version, see inline response to your comments.

New cover letter:

This patch enables the use of mixed types for simd clones on AArch64, 
adds aarch64 to the vect_simd_clones effective target, and corrects the 
way the simdlen is chosen when no simdlen clause is specified, following 
the 'Vector Function Application Binary Interface Specification for AArch64'.


gcc/ChangeLog:

* config/aarch64/aarch64.cc (currently_supported_simd_type): 
Remove.
(aarch64_simd_clone_compute_vecsize_and_simdlen): Determine 
simdlen according to NDS rule.

(lane_size): New function.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add aarch64 targets to vect_simd_clones.
* c-c++-common/gomp/declare-variant-14.c: Add aarch64 checks 
and remove warning check.

* g++.dg/gomp/attrs-10.C: Likewise.
* g++.dg/gomp/declare-simd-1.C: Likewise.
* g++.dg/gomp/declare-simd-3.C: Likewise.
* g++.dg/gomp/declare-simd-4.C: Likewise.
* gcc.dg/gomp/declare-simd-3.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.
* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
* c-c++-common/gomp/pr60823-1.c: Remove warning check.
* c-c++-common/gomp/pr60823-3.c: Likewise.
* g++.dg/gomp/declare-simd-7.C: Likewise.
* g++.dg/gomp/declare-simd-8.C: Likewise.
* g++.dg/gomp/pr88182.C: Likewise.
* gcc.dg/declare-simd.c: Likewise.
* gcc.dg/gomp/declare-simd-1.c: Likewise.
* gcc.dg/gomp/pr87895-1.c: Likewise.
* gfortran.dg/gomp/declare-simd-2.f90: Likewise.
* gfortran.dg/gomp/declare-simd-coarray-lib.f90: Likewise.
* gfortran.dg/gomp/pr79154-1.f90: Likewise.
* gfortran.dg/gomp/pr83977.f90: Likewise.
* gcc.dg/gomp/pr87887-1.c: Add warning test.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/pr99542.c: Update warning test.



On 08/08/2023 11:51, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:



warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
-   "unsupported return type %qT for % functions",
+   "unsupported return type %qT for simd",
ret_type);


What's the reason for s/%<simd%> functions/simd/, in particular for
dropping the quotes around simd?


It's to align with i386's error message; this helps with testing as I can 
then avoid having different tests for the same error.


I asked Jakub which one he preferred, and he gave me an explanation of why 
the i386 one was preferable, ... but unfortunately I didn't write it down.





return 0;
  }
  
+  nfs_type = ret_type;


Genuine question, but what does nfs stand for in this context?

Was supposed to be nds... my bad.

I don't think this implements the NDS calculation in the spec:

  The `Narrowest Data Size of f`, or ``NDS(f)``, is defined as the
  minimum of the lane size ``LS(P)`` among all input parameters and
  the return value of ``f``.

   ...

   We then define the `Lane Size of P`, or ``LS(P)``, as follows.

   1. If ``MTV(P)`` is ``false`` and ``P`` is a pointer or reference to
  some type ``T`` for which ``PBV(T)`` is ``true``, ``LS(P) =
  sizeof(T)``.
   2. If ``PBV(T(P))`` is ``true``, ``LS(P) = sizeof(P)``.
   3. Otherwise ``LS(P) = sizeof(uintptr_t)``.

AIUI, (1) means that we need to look at the targets of uniform and
linear scalars[*] that have pointer type, so that e.g. a uniform uint8_t *
pointer should cause NDS to be 1.

[*] i.e. arguments that remain scalar in the vector prototype

(2) means that other types of uniform and linear scalars do contribute.
A uniform uint8_t should cause NDS to be 1.


You are right, I misread the ABI description there.
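
Concretely, the NDS computation needs to be roughly the following 
(untested sketch, reusing the lane_size helper and the argument loop 
from the attached patch, and ignoring the VOID-return case for brevity):

  unsigned nds = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
  for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
       t && t != void_list_node; t = TREE_CHAIN (t), i++)
    {
      tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
      if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_MASK)
	nds = MIN (nds, lane_size (clonei->args[i].arg_type, arg_type));
    }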



Thanks,
Richard

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
7cd230c4602a15980016bdc92e80579be0c07094..458a4dbf76138e329eb99077780089a9b501c046
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27274,28 +27274,57 @@ supported_simd_type (tree t)
   return false;
 }
 
-/* Return true for types that currently are supported as SIMD return
-   or argument types.  */
+/* Determine the lane size for the clone argument/return type.  This follows
+   the LS(P) rule in the VFABIA64.  */
 
-static bool
-currently_supported_simd_type (tree t, tree b)
+static unsigned
+lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
 {
-  if (COMPLEX_FLOAT_TYPE_P (t))
-return false;
+  gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
 
-  if (TYPE_SIZE (t) != TYPE_SIZE (b))
-return false;
+  /* For non map-to-vector types that are pointers we use the element type it
+ points to.  */
+  if (POINTER_TYPE_P (type))
+switch (clone_arg_type)
+  {
+  default:
+   break;
+  case SIMD_CLONE_ARG_TYPE_UNIFORM:
+  case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
+  case SIMD_CLONE_ARG_TYPE_LINEAR_VARIAB

[PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-07-26 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch enables the use of mixed types for simd clones on AArch64 
and adds aarch64 to the vect_simd_clones effective target.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu

gcc/ChangeLog:

* config/aarch64/aarch64.cc (currently_supported_simd_type): 
Remove.
(aarch64_simd_clone_compute_vecsize_and_simdlen): Use NFS type 
to determine simdlen.


gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add aarch64 targets to vect_simd_clones.
* c-c++-common/gomp/declare-variant-14.c: Add aarch64 checks 
and remove warning check.

* g++.dg/gomp/attrs-10.C: Likewise.
* g++.dg/gomp/declare-simd-1.C: Likewise.
* g++.dg/gomp/declare-simd-3.C: Likewise.
* g++.dg/gomp/declare-simd-4.C: Likewise.
* gcc.dg/gomp/declare-simd-3.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.
* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
* c-c++-common/gomp/pr60823-1.c: Remove warning check.
* c-c++-common/gomp/pr60823-3.c: Likewise.
* g++.dg/gomp/declare-simd-7.C: Likewise.
* g++.dg/gomp/declare-simd-8.C: Likewise.
* g++.dg/gomp/pr88182.C: Likewise.
* gcc.dg/declare-simd.c: Likewise.
* gcc.dg/gomp/declare-simd-1.c: Likewise.
* gcc.dg/gomp/pr87895-1.c: Likewise.
* gfortran.dg/gomp/declare-simd-2.f90: Likewise.
* gfortran.dg/gomp/declare-simd-coarray-lib.f90: Likewise.
* gfortran.dg/gomp/pr79154-1.f90: Likewise.
* gfortran.dg/gomp/pr83977.f90: Likewise.
* gcc.dg/gomp/pr87887-1.c: Add warning test.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/pr99542.c: Update warning test.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
560e5431636ef46c41d56faa0c4e95be78f64b50..ac6350a44481628a947a0f20e034acf92cde63ec
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27194,21 +27194,6 @@ supported_simd_type (tree t)
   return false;
 }
 
-/* Return true for types that currently are supported as SIMD return
-   or argument types.  */
-
-static bool
-currently_supported_simd_type (tree t, tree b)
-{
-  if (COMPLEX_FLOAT_TYPE_P (t))
-return false;
-
-  if (TYPE_SIZE (t) != TYPE_SIZE (b))
-return false;
-
-  return supported_simd_type (t);
-}
-
 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
 
 static int
@@ -27217,7 +27202,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
tree base_type, int num,
bool explicit_p)
 {
-  tree t, ret_type;
+  tree t, ret_type, nfs_type;
   unsigned int elt_bits, count;
   unsigned HOST_WIDE_INT const_simdlen;
   poly_uint64 vec_bits;
@@ -27240,55 +27225,61 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
 }
 
   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
+  /* According to AArch64's Vector ABI the type that determines the simdlen is
+ the narrowest of types, so we ignore base_type for AArch64.  */
   if (TREE_CODE (ret_type) != VOID_TYPE
-  && !currently_supported_simd_type (ret_type, base_type))
+  && !supported_simd_type (ret_type))
 {
   if (!explicit_p)
;
-  else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
-   warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
-   "GCC does not currently support mixed size types "
-   "for % functions");
-  else if (supported_simd_type (ret_type))
+  else if (COMPLEX_FLOAT_TYPE_P (ret_type))
warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
"GCC does not currently support return type %qT "
-   "for % functions", ret_type);
+   "for simd", ret_type);
   else
warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
-   "unsupported return type %qT for % functions",
+   "unsupported return type %qT for simd",
ret_type);
   return 0;
 }
 
+  nfs_type = ret_type;
   int i;
   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
   bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
-
   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
t && t != void_list_node; t = TREE_CHAIN (t), i++)
 {
   tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
-
   if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
- && !currently_supported_simd_type (arg_type, base_type))
+ && !supported_simd_type (arg_type))
{
  if (!explicit_p)
;
- else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
+ else if (COMPLEX_FLOAT_TYPE_P (ret_type))
warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
-   "GCC does not currently support mixed size types "
- 

Re: [PATCH] Include insn-opinit.h in PLUGIN_H [PR110610]

2023-07-17 Thread Andre Vieira (lists) via Gcc-patches



On 11/07/2023 23:28, Jeff Law wrote:



On 7/11/23 04:37, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

This patch fixes PR110610 by including OPTABS_H in the INTERNAL_FN_H 
list, as insn-opinit.h is now required by internal-fn.h. This will 
lead to insn-opinit.h, among the other OPTABS_H header files, being 
installed in the plugin directory.


Bootstrapped aarch64-unknown-linux-gnu.

@Jakub: could you check to see if it also addresses PR 110284?


gcc/ChangeLog:

 PR 110610
 * Makefile.in (INTERNAL_FN_H): Add OPTABS_H.
Why use OPTABS_H here?  Isn't the new dependency just on insn-opinit.h 
and insn-codes.h and neither of those #include other headers do they?





Yeah, there was no particular reason other than that I felt the Makefile 
structure sort of lent itself that way.  I checked genopinit.cc and it 
seems insn-opinit.h doesn't include any other header files, only the 
sources do, so I've changed the patch to only add insn-opinit.h to 
INTERNAL_FN_H.


---

This patch fixes PR110610 by including insn-opinit.h in the 
INTERNAL_FN_H list, as insn-opinit.h is now required by internal-fn.h. 
This will lead to insn-opinit.h being installed in the plugin directory.


Bootstrapped aarch64-unknown-linux-gnu.

gcc/ChangeLog:
PR 110610
* Makefile.in (INTERNAL_FN_H): Add insn-opinit.h.diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 
c478ec852013eae65b9f3ec0a443e023c7d8b452..683774ad446d545362644d2dbdc37723eea55bc3
 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -976,7 +976,7 @@ READ_MD_H = $(OBSTACK_H) $(HASHTAB_H) read-md.h
 BUILTINS_DEF = builtins.def sync-builtins.def omp-builtins.def \
gtm-builtins.def sanitizer.def
 INTERNAL_FN_DEF = internal-fn.def
-INTERNAL_FN_H = internal-fn.h $(INTERNAL_FN_DEF)
+INTERNAL_FN_H = internal-fn.h $(INTERNAL_FN_DEF) insn-opinit.h
 TREE_CORE_H = tree-core.h $(CORETYPES_H) all-tree.def tree.def \
c-family/c-common.def $(lang_tree_files) \
$(BUILTINS_DEF) $(INPUT_H) statistics.h \


[PATCH] Include insn-opinit.h in PLUGIN_H [PR110610]

2023-07-11 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch fixes PR110610 by including OPTABS_H in the INTERNAL_FN_H 
list, as insn-opinit.h is now required by internal-fn.h. This will lead 
to insn-opinit.h, among the other OPTABS_H header files, being installed 
in the plugin directory.


Bootstrapped aarch64-unknown-linux-gnu.

@Jakub: could you check to see if it also addresses PR 110284?


gcc/ChangeLog:

PR 110610
* Makefile.in (INTERNAL_FN_H): Add OPTABS_H.

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 
c478ec852013eae65b9f3ec0a443e023c7d8b452..d3ff210ee04414f4e238c087400dd21e1cb0fc18
 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -976,7 +976,7 @@ READ_MD_H = $(OBSTACK_H) $(HASHTAB_H) read-md.h
 BUILTINS_DEF = builtins.def sync-builtins.def omp-builtins.def \
gtm-builtins.def sanitizer.def
 INTERNAL_FN_DEF = internal-fn.def
-INTERNAL_FN_H = internal-fn.h $(INTERNAL_FN_DEF)
+INTERNAL_FN_H = internal-fn.h $(INTERNAL_FN_DEF) $(OPTABS_H)
 TREE_CORE_H = tree-core.h $(CORETYPES_H) all-tree.def tree.def \
c-family/c-common.def $(lang_tree_files) \
$(BUILTINS_DEF) $(INPUT_H) statistics.h \


[PATCH] vect: Treat vector widening IFN calls as 'simple' [PR110436]

2023-07-03 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch makes the vectorizer treat any vector widening IFN as 'simple', 
like it did with the VEC_WIDEN_* tree codes.

I wasn't sure whether I should make all IFNs simple and then exclude 
some (like the GOMP_ ones), or include more than just the new widening 
IFNs.  But since this is the only behaviour that changed with the IFN 
patch, I decided to only special-case the widening IFNs for now.  Let me 
know if you have different thoughts on this.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

gcc/ChangeLog:

PR tree-optimization/110436
* tree-vect-stmts.cc (is_simple_and_all_uses_invariant): Treat widening
IFNs as simple.

gcc/testsuite/ChangeLog:

* gcc.dg/pr110436.c: New test.

diff --git a/gcc/testsuite/gcc.dg/pr110436.c b/gcc/testsuite/gcc.dg/pr110436.c
new file mode 100644
index 
..c146f99fac9f0524eaa3b1230b56e9f94eed5bda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr110436.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "pr83089.c"
+
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
d642d3c257f8d540a8562eedbcd40372b9550959..706055e9af94f0c1500c25faf4bd74fc08bf3cd6
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -296,8 +296,11 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
   tree op;
   ssa_op_iter iter;
 
-  gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
-  if (!stmt)
+  gimple *stmt = stmt_info->stmt;
+  if (!is_gimple_assign (stmt)
+  && !(is_gimple_call (stmt)
+  && gimple_call_internal_p (stmt)
+  && widening_fn_p (gimple_call_combined_fn (stmt))))
 return false;
 
   FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)


Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-23 Thread Andre Vieira (lists) via Gcc-patches

+  /* In order to find out if the loop is of type A or B above look for the
+ loop counter: it will either be incrementing by one per iteration or
+ it will be decrementing by num_of_lanes.  We can find the loop counter
+ in the condition at the end of the loop.  */
+  rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (body));
+  gcc_assert (cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
+ && GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE);

Not sure this should be an assert. If we do encounter a differently 
formed loop, we should bail out of DLSTPing for now but we shouldn't ICE.
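
i.e. something along these lines instead of the assert (untested sketch; 
'return false' standing in for however this helper signals failure):

  rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (body));
  if (!loop_cond
      || !cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
      || GET_CODE (XEXP (PATTERN (loop_cond), 1)) != COMPARE)
    /* Not a loop shape we recognise: don't try to dlstp it.  */
    return false;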



+  /* The loop latch has to be empty.  When compiling all the known MVE LoLs in
+     user applications, none of those with incrementing counters had any real
+     insns in the loop latch.  As such, this function has only been tested with
+     an empty latch and may misbehave or ICE if we somehow get here with an
+     increment in the latch, so, for sanity, error out early.  */
+  rtx_insn *dec_insn = BB_END (body->loop_father->latch);
+  if (NONDEBUG_INSN_P (dec_insn))
+gcc_unreachable ();

Similarly here I'd return false rather than gcc_unreachable ();
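
i.e. something like:

  rtx_insn *dec_insn = BB_END (body->loop_father->latch);
  if (NONDEBUG_INSN_P (dec_insn))
    /* Non-empty latch: bail out of the transformation.  */
    return false;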


+  /* Find where both of those are modified in the loop body bb.  */
+  rtx condcount_reg_set = PATTERN (DF_REF_INSN (df_bb_regno_only_def_find
+				    (body, REGNO (condcount))));
Put = on newline, breaks it down nicer.

+ counter_orig_set = XEXP (PATTERN
+   (DF_REF_INSN
+ (DF_REF_NEXT_REG
+   (DF_REG_DEF_CHAIN
+(REGNO
+  (XEXP (condcount_reg_set, 0)), 
1);

This makes me a bit nervous: can we be certain that the PATTERN of the 
next insn that sets it is indeed a set?  And can we even be sure 
DF_REG_DEF_CHAIN returns something non-null?  I can't imagine why not, 
but maybe there are some constructs it can't follow up on.  It might be 
worth checking these steps and bailing out.




+  /* When we find the vctp instruction: This may be followed by
+  a zero-extend insn to SImode.  If it is, then save the
+  zero-extended REG into vctp_vpr_generated.  If there is no
+  zero-extend, then store the raw output of the vctp.
+  For any VPT-predicated instructions we need to ensure that
+  the VPR they use is the same as the one given here and
+  they often consume the output of a subreg of the SImode
+  zero-extended VPR-reg.  As a result, comparing against the
+  output of the zero-extend is more likely to succeed.
+  This code also guarantees to us that the vctp comes before
+  any instructions that use the VPR within the loop, for the
+  dlstp/letp transform to succeed.  */

Wrong comment indent after first line.

+  rtx_insn *vctp_insn = arm_mve_get_loop_vctp (body);
+  if (!vctp_insn || !arm_mve_loop_valid_for_dlstp (body))
+return GEN_INT (1);

arm_mve_loop_valid_for_dlstp already calls arm_mve_get_loop_vctp; maybe 
have 'arm_mve_loop_valid_for_dlstp' return vctp_insn (or NULL) to 
indicate success or failure, which avoids looping through the BB again.


For the same reason I'd also pass vctp_insn down to 
'arm_mve_check_df_chain_back_for_implic_predic'.
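
i.e. roughly (sketch, assuming the helper is changed to return the vctp 
insn on success and NULL on failure):

  rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (body);
  if (!vctp_insn)
    return GEN_INT (1);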


+ if (GET_CODE (SET_SRC (single_set (next_use1))) == ZERO_EXTEND)
+   {
+ rtx_insn *next_use2 = NULL;

Are we sure single_set can never return 0 here? Maybe worth an extra 
check and bail out if it does?
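
e.g. something like (sketch; the right bail-out action depends on the 
surrounding code):

  rtx next_use1_set = single_set (next_use1);
  if (!next_use1_set)
    /* Conservative bail-out; the exact action depends on the caller.  */
    return true;
  if (GET_CODE (SET_SRC (next_use1_set)) == ZERO_EXTEND)
    ...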


+   /* If the insn pattern requires the use of the VPR value from the
+ vctp as an input parameter.  */
s/as an input parameter./as an input parameter for predication./

+ /* None of registers USE-d by the instruction need can be the VPR
+vctp_vpr_generated.  This blocks the optimisation if there any
+instructions that use the optimised-out VPR value in any way
+other than as a VPT block predicate.  */

Reword this slightly to be less complex:
"This instruction USE-s vctp_vpr_generated for something other than 
predication; this blocks the transformation, as we are not allowed to 
optimise the VPR value away."


Will continue reviewing next week :)

On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:

     Hi all,

     This is the 2/2 patch that contains the functional changes needed
     for MVE Tail Predicated Low Overhead Loops.  See my previous email
     for a general introduction of MVE LOLs.

     This support is added through the already existing loop-doloop
     mechanisms that are used for non-MVE dls/le looping.

     Mid-end changes are:

     1) Relax the loop-doloop mechanism in the mid-end to allow for
    decrement numbers other that -1 and for `count` to be an
    rtx containing a simple REG (which in this case will contain
    the number of elements to be processed), 

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-23 Thread Andre Vieira (lists) via Gcc-patches

+  if (insn != arm_mve_get_loop_vctp (body))
+{

It is probably a good idea to invert the condition here and return false 
early; that helps reduce the indentation in this function.



+   /* Starting from the current insn, scan backwards through the insn
+  chain until BB_HEAD: "for each insn in the BB prior to the current".
+   */

There's a trailing whitespace after insn, but also I'd rewrite this bit. 
The "for each insn in the BB prior to the current" is superfluous and 
even confusing to me. How about:
"Scan backwards from the current INSN through the instruction chain 
until the start of the basic block.  "



 I find 'that previous insn' to be confusing as you don't mention any 
previous insn before. So how about something along the lines of:
'If a previous insn defines a register that INSN uses then return true 
if...'



Do we need to check: 'insn != prev_insn' ? Any reason why you can't 
start the loop with:

'for (rtx_insn *prev_insn = PREV_INSN (insn);'

Now I also found a case where things might go wrong in:
+   /* Look at all the DEFs of that previous insn: if one of them is on
+  the same REG as our current insn, then recurse in order to check
+  that insn's USEs.  If any of these insns return true as
+  MVE_VPT_UNPREDICATED_INSN_Ps, then the whole chain is affected
+  by the change in behaviour from being placed in dlstp/letp loop.
+   */
+   df_ref prev_insn_defs = NULL;
+   FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
+ {
+   if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
+   && insn != prev_insn
+   && body == BLOCK_FOR_INSN (prev_insn)
+   && !arm_mve_vec_insn_is_predicated_with_this_predicate
+(insn, vctp_vpr_generated)
+   && arm_mve_check_df_chain_back_for_implic_predic
+(prev_insn, vctp_vpr_generated))
+ return true;
+ }

The 'body == BLOCK_FOR_INSN (prev_insn)' check is what hinted me at it: 
if a def comes from outside of the BB (so outside of the loop's body), 
then it is by definition not predicated by the vctp.  I think you want to 
check that, if prev_insn defines a register used by insn, you return true 
either when prev_insn isn't in the same BB or when its chain is not 
predicated; i.e. in addition to the existing 
'!arm_mve_vec_insn_is_predicated_with_this_predicate (insn, 
vctp_vpr_generated) && arm_mve_check_df_chain_back_for_implic_predic 
(prev_insn, vctp_vpr_generated)' check, you also check 
'body != BLOCK_FOR_INSN (prev_insn)'.
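
In other words, something along the lines of (just a sketch of my 
reading, untested):

	   df_ref prev_insn_defs = NULL;
	   FOR_EACH_INSN_DEF (prev_insn_defs, prev_insn)
	     {
	       if (DF_REF_REGNO (insn_uses) == DF_REF_REGNO (prev_insn_defs)
		   && insn != prev_insn
		   && (body != BLOCK_FOR_INSN (prev_insn)
		       || (!arm_mve_vec_insn_is_predicated_with_this_predicate
			      (insn, vctp_vpr_generated)
			   && arm_mve_check_df_chain_back_for_implic_predic
			      (prev_insn, vctp_vpr_generated))))
		 return true;
	     }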



I also found some other issues.  This currently gets converted into a 
dlstp/letp loop:

uint16_t  test (uint16_t *a, int n)
{
  uint16_t res =0;
  while (n > 0)
{
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vaddvaq_u16 (res, va);
  res = vaddvaq_p_u16 (res, va, p);
  a += 8;
  n -= 8;
}
  return res;
}

But it shouldn't; this is because across-vector instructions are not 
being handled.  Luckily, in MVE all across-vector instructions have the 
side-effect that they write to a scalar register, even the vshlcq 
instruction (it writes to a scalar carry output).


This did lead me to find an ICE with:

uint16x8_t  test (uint16_t *a, int n)
{
  uint16x8_t res = vdupq_n_u16 (0);
  while (n > 0)
{
  uint16_t carry = 0;
  mve_pred16_t p = vctp16q (n);
  uint16x8_t va = vldrhq_u16 (a);
  res = vshlcq_u16 (va, &carry, 1);
  res = vshlcq_m_u16 (res, &carry, 1, p);
  a += 8;
  n -= 8;
}
  return res;
}

This is because:
+ /* If the USE is outside the loop body bb, or it is inside, but
+is an unpredicated store to memory.  */
+ if (BLOCK_FOR_INSN (insn) != BLOCK_FOR_INSN (next_use_insn)
+|| (arm_mve_vec_insn_is_unpredicated_or_uses_other_predicate
+(next_use_insn, vctp_vpr_generated)
+   && mve_memory_operand
+   (SET_DEST (single_set (next_use_insn)),
+GET_MODE (SET_DEST (single_set (next_use_insn))
+   return true;

Assumes single_set doesn't return 0.

Let's deal with these issues and I'll continue to review.

On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:

     Hi all,

     This is the 2/2 patch that contains the functional changes needed
     for MVE Tail Predicated Low Overhead Loops.  See my previous email
     for a general introduction of MVE LOLs.

     This support is added through the already existing loop-doloop
     mechanisms that are used for non-MVE dls/le looping.

     Mid-end changes are:

     1) Relax the loop-doloop mechanism in the mid-end to allow for
    decrement numbers other that -1 and for `count` to be an
    rtx containing a simple REG (which in this case will contain
    the number of elements to be processed), rather
    than an expression for calculating the number of iterations.
  

Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-06-22 Thread Andre Vieira (lists) via Gcc-patches
Some comments below, all quite minor. I'll continue to review tomorrow, 
I need a fresher brain for arm_mve_check_df_chain_back_for_implic_predic 
 ;)


+static int
+arm_mve_get_vctp_lanes (rtx x)
+{
+  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
+  && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == VCTP_M))

+{
+  switch (GET_MODE (XEXP (x, 1)))
+   {
+ case V16BImode:
+   return 16;
+ case V8BImode:
+   return 8;
+ case V4BImode:
+   return 4;
+ case V2QImode:
+   return 2;
+ default:
+   break;
+   }
+}
+  return 0;
+}

I think you can replace the switch with something along the lines of:
machine_mode mode = GET_MODE (XEXP (x, 1));
return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode) : 0;
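
i.e. the whole helper could become something like (untested sketch; 
.to_constant () is fine here since the VCTP predicate modes are all 
fixed-size):

static int
arm_mve_get_vctp_lanes (rtx x)
{
  if (GET_CODE (x) == SET && GET_CODE (XEXP (x, 1)) == UNSPEC
      && (XINT (XEXP (x, 1), 1) == VCTP || XINT (XEXP (x, 1), 1) == VCTP_M))
    {
      machine_mode mode = GET_MODE (XEXP (x, 1));
      return VECTOR_MODE_P (mode) ? GET_MODE_NUNITS (mode).to_constant () : 0;
    }
  return 0;
}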


+/* Check if an insn requires the use of the VPR_REG, if it does, return the
+   sub-rtx of the VPR_REG.  The `type` argument controls whether
+   this function should:
+   * For type == 0, check all operands, including the OUT operands,
+ and return the first occurance of the VPR_REG.

s/occurance/occurrence/

+ bool requires_vpr;
+  extract_constrain_insn (insn);

indent of requires_vpr is off.

+  if (type == 1 && (recog_data.operand_type[op] == OP_OUT
+   || recog_data.operand_type[op] == OP_INOUT))
+   continue;
+  else if (type == 2 && (recog_data.operand_type[op] == OP_IN
+|| recog_data.operand_type[op] == OP_INOUT))
+   continue;

Why skip INOUT? I guess this will become clear when I see the uses, but 
I'm wondering whether 'only check the input operands.' is clear enough. 
Maybe 'check operands that are input only.' would be more accurate?


+ /* Fetch the reg_class for each entry and check it against the
+  * VPR_REG reg_class.  */

Remove leading * on the second line.

+
+/* Wrapper function of arm_get_required_vpr_reg with type == 1, so return
+   something only if the VPR reg is an input operand to the insn.  */

When talking about a function parameter in comments capitalize (INSN) 
the name. Same for:


+/* Wrapper function of arm_get_required_vpr_reg with type == 2, so return
+   something only if the VPR reg is the retrurn value, an output of, or is
+   clobbered by the insn.  */

+/* Return true if an insn is an MVE instruction that VPT-predicable, but in
+   its unpredicated form, or if it is predicated, but on a predicate other
+   than vpr_reg.  */

In this one also 'is a MVE instruction that is VPT-predicable' would be 
better I think.



On 15/06/2023 12:47, Stamatis Markianos-Wright via Gcc-patches wrote:
>  Hi all,
>
>  This is the 2/2 patch that contains the functional changes needed
>  for MVE Tail Predicated Low Overhead Loops.  See my previous email
>  for a general introduction of MVE LOLs.
>
>  This support is added through the already existing loop-doloop
>  mechanisms that are used for non-MVE dls/le looping.
>
>  Mid-end changes are:
>
>  1) Relax the loop-doloop mechanism in the mid-end to allow for
> decrement numbers other that -1 and for `count` to be an
> rtx containing a simple REG (which in this case will contain
> the number of elements to be processed), rather
> than an expression for calculating the number of iterations.
>  2) Added a new df utility function: `df_bb_regno_only_def_find` that
> will return the DEF of a REG only if it is DEF-ed once within the
> basic block.
>
>  And many things in the backend to implement the above optimisation:
>
>  3)  Implement the `arm_predict_doloop_p` target hook to instruct the
>  mid-end about Low Overhead Loops (MVE or not), as well as
>  `arm_loop_unroll_adjust` which will prevent unrolling of any 
loops
>  that are valid for becoming MVE Tail_Predicated Low Overhead 
Loops
>  (unrolling can transform a loop in ways that invalidate the 
dlstp/

>  letp tranformation logic and the benefit of the dlstp/letp loop
>  would be considerably higher than that of unrolling)
>  4)  Appropriate changes to the define_expand of doloop_end, new
>  patterns for dlstp and letp, new iterators,  unspecs, etc.
>  5) `arm_mve_loop_valid_for_dlstp` and a number of checking 
functions:

> * `arm_mve_dlstp_check_dec_counter`
> * `arm_mve_dlstp_check_inc_counter`
> * `arm_mve_check_reg_origin_is_num_elems`
> * `arm_mve_check_df_chain_back_for_implic_predic`
> * `arm_mve_check_df_chain_fwd_for_implic_predic_impact`
> This all, in smoe way or another, are running checks on the loop
> structure in order to determine if the loop is valid for 
dlstp/letp

> transformation.
>  6) `arm_attempt_dlstp_transform`: (called from the define_expand of
>  doloop_end) this function re-checks for the loop's 
suitability for


Re: [PATCH] inline: improve internal function costs

2023-06-12 Thread Andre Vieira (lists) via Gcc-patches




On 05/06/2023 04:04, Jan Hubicka wrote:

On Thu, 1 Jun 2023, Andre Vieira (lists) wrote:


Hi,

This is a follow-up of the internal function patch to add widening and
narrowing patterns.  This patch improves the inliner cost estimation for
internal functions.


I have no idea why calls are special in IPA analyze_function_body
and so I cannot say whether treating all internal fn calls as
non-calls is correct there.  Honza?


The reason is that normal statements are acconted as part of the
function body, while calls have their costs attached to call edges
(so it can be adjusted when call is inlined to otherwise optimized).

However since internal functions have no cgraph edges, this looks like
a bug that we do not test it.  (the code was written before internal
calls was introduced).



This sounds to me like you agree with my approach of treating internal 
calls differently from regular calls.


I wonder if we don't want to have is_noninternal_gimple_call that could
be used by IPA code to test whether cgraph edge should exist for
the statement.


I'm happy to add such a helper function.  @richi, @rsandifo: are you OK with that?
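
Something as simple as the following is what I had in mind (sketch):

/* Return true if STMT is a call statement that is not a call to an
   internal function.  */

static inline bool
is_noninternal_gimple_call (const gimple *stmt)
{
  return is_gimple_call (stmt) && !gimple_call_internal_p (stmt);
}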


The tree-inline.cc change is OK though (you can push that separately).

The rest is OK too.
Honza


Thanks,
Richard.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

gcc/ChangeLog:

 * ipa-fnsummary.cc (analyze_function_body): Correctly handle
 non-zero costed internal functions.
 * tree-inline.cc (estimate_num_insns): Improve costing for internal
 functions.



--
Richard Biener 
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)


vect: Don't pass subtype to vect_widened_op_tree where not needed [PR 110142]

2023-06-07 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch fixes an issue introduced by 
g:2f482a07365d9f4a94a56edd13b7f01b8f78b5a0, where a subtype was being 
passed to vect_widened_op_tree when no subtype was to be used.  This 
led to an erroneous use of IFN_VEC_WIDEN_MINUS.


gcc/ChangeLog:

* tree-vect-patterns.cc (vect_recog_widen_op_pattern): Don't pass
subtype to vect_widened_op_tree and remove the subtype parameter.
(vect_recog_widen_plus_pattern): Remove subtype parameter and don't
pass it to the call to vect_recog_widen_op_pattern.
(vect_recog_widen_minus_pattern): Likewise.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr110142.c: New test.

diff --git a/gcc/testsuite/gcc.dg/vect/pr110142.c 
b/gcc/testsuite/gcc.dg/vect/pr110142.c
new file mode 100644
index 
..a88dbe400f46a33a53649298345c24c569e2f567
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr110142.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3" } */
+void test(short *x, unsigned short *y, int n)
+{
+  for (int i = 0; i < n; i++)
+  x[i] = (y[i] - x[i]) >> 1;
+}
+
+/* { dg-final { scan-tree-dump-not "widen_minus" "vect" } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
dc102c919352a0328cf86eabceb3a38c41a7e4fd..599a027f9b2feb8971c1ee017b6457bc297c86c2
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1405,15 +1405,14 @@ static gimple *
 vect_recog_widen_op_pattern (vec_info *vinfo,
 stmt_vec_info last_stmt_info, tree *type_out,
 tree_code orig_code, code_helper wide_code,
-bool shift_p, const char *name,
-optab_subtype *subtype = NULL)
+bool shift_p, const char *name)
 {
   gimple *last_stmt = last_stmt_info->stmt;
 
   vect_unpromoted_value unprom[2];
   tree half_type;
   if (!vect_widened_op_tree (vinfo, last_stmt_info, orig_code, orig_code,
-shift_p, 2, unprom, &half_type, subtype))
+shift_p, 2, unprom, &half_type))
 
 return NULL;
 
@@ -1484,13 +1483,11 @@ static gimple *
 vect_recog_widen_op_pattern (vec_info *vinfo,
 stmt_vec_info last_stmt_info, tree *type_out,
 tree_code orig_code, internal_fn wide_ifn,
-bool shift_p, const char *name,
-optab_subtype *subtype = NULL)
+bool shift_p, const char *name)
 {
   combined_fn ifn = as_combined_fn (wide_ifn);
   return vect_recog_widen_op_pattern (vinfo, last_stmt_info, type_out,
- orig_code, ifn, shift_p, name,
- subtype);
+ orig_code, ifn, shift_p, name);
 }
 
 
@@ -1513,11 +1510,9 @@ static gimple *
 vect_recog_widen_plus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
   tree *type_out)
 {
-  optab_subtype subtype;
   return vect_recog_widen_op_pattern (vinfo, last_stmt_info, type_out,
  PLUS_EXPR, IFN_VEC_WIDEN_PLUS,
- false, "vect_recog_widen_plus_pattern",
- &subtype);
+ false, "vect_recog_widen_plus_pattern");
 }
 
 /* Try to detect subtraction on widened inputs, converting MINUS_EXPR
@@ -1526,11 +1521,9 @@ static gimple *
 vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
   tree *type_out)
 {
-  optab_subtype subtype;
   return vect_recog_widen_op_pattern (vinfo, last_stmt_info, type_out,
  MINUS_EXPR, IFN_VEC_WIDEN_MINUS,
- false, "vect_recog_widen_minus_pattern",
- &subtype);
+ false, "vect_recog_widen_minus_pattern");
 }
 
 /* Function vect_recog_ctz_ffs_pattern


Re: [PATCH] modula2: Fix bootstrap

2023-06-07 Thread Andre Vieira (lists) via Gcc-patches

Thanks Jakub!

I do need those includes, and sorry I broke your bootstrap; it didn't 
show up on my aarch64-unknown-linux-gnu bootstrap, so I'm guessing the 
rules there were just run in a different order.  Glad you were able to fix it :)


On 06/06/2023 22:28, Jakub Jelinek wrote:

Hi!

internal-fn.h since yesterday includes insn-opinit.h, which is a generated
header.
One of my bootstraps today failed because some m2 sources started compiling
before insn-opinit.h has been generated.

Normally, gcc/Makefile.in has
# In order for parallel make to really start compiling the expensive
# objects from $(OBJS) as early as possible, build all their
# prerequisites strictly before all objects.
$(ALL_HOST_OBJS) : | $(generated_files)

rule which ensures that all the generated files are generated before
any $(ALL_HOST_OBJS) objects start, but use order-only dependency for
this because we don't want to rebuild most of the objects whenever one
generated header is regenerated.  After the initial build in an empty
directory we'll have .deps/ files contain the detailed dependencies.

$(ALL_HOST_OBJS) includes even some FE files, I think in the m2 case
would be m2_OBJS, but m2/Make-lang.in doesn't define those.

The following patch just adds a similar rule to m2/Make-lang.in.
Another option would be to set m2_OBJS variable in m2/Make-lang.in to
something, but not really sure to which exactly and why it isn't
done.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2023-06-06  Jakub Jelinek  

* Make-lang.in: Build $(generated_files) before building
all $(GM2_C_OBJS).

--- gcc/m2/Make-lang.in.jj  2023-05-04 09:31:27.289948109 +0200
+++ gcc/m2/Make-lang.in 2023-06-06 21:38:26.655336041 +0200
@@ -511,6 +511,8 @@ GM2_LIBS_BOOT = m2/gm2-compiler-boot
  m2/gm2-libs-boot/libgm2.a \
  $(GM2-BOOT-O)
  
+$(GM2_C_OBJS) : | $(generated_files)

+
  cc1gm2$(exeext): m2/stage1/cc1gm2$(exeext) $(m2.prev)
cp -p $< $@
  



Jakub



Re: [PATCH] inline: improve internal function costs

2023-06-02 Thread Andre Vieira (lists) via Gcc-patches




On 02/06/2023 10:13, Richard Biener wrote:

On Thu, 1 Jun 2023, Andre Vieira (lists) wrote:


Hi,

This is a follow-up of the internal function patch to add widening and
narrowing patterns.  This patch improves the inliner cost estimation for
internal functions.


I have no idea why calls are special in IPA analyze_function_body
and so I cannot say whether treating all internal fn calls as
non-calls is correct there.  Honza?

The tree-inline.cc change is OK though (you can push that separately).
I can't though; it ICEs during libgcc compilation (and on other tests in 
the testsuite).  The estimate function is used by IPA to compute size, 
and without the changes there it hits an assert because 
estimate_num_insns no longer matches what IPA records in its 
size_time_table.


I'll wait for Honza to comment.


Thanks,
Richard.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

gcc/ChangeLog:

 * ipa-fnsummary.cc (analyze_function_body): Correctly handle
 non-zero costed internal functions.
 * tree-inline.cc (estimate_num_insns): Improve costing for internal
 functions.





[PATCH] gimple-range: implement widen plus range

2023-06-01 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch adds gimple-range information for the new IFN_VEC_WIDEN_PLUS* 
internal functions, identical to what VEC_WIDEN_PLUS did.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

gcc/ChangeLog:

* gimple-range-op.cc (gimple_range_op_handler::maybe_non_standard):
Add support for IFN_VEC_WIDEN_PLUS*.

diff --git a/gcc/gimple-range-op.cc b/gcc/gimple-range-op.cc
index 
59c47e2074ddc73065468fe92274c260bd5bac48..7a84931d6204a56549cf1563114d8db7a2e26a6a
 100644
--- a/gcc/gimple-range-op.cc
+++ b/gcc/gimple-range-op.cc
@@ -1187,6 +1187,7 @@ gimple_range_op_handler::maybe_non_standard ()
 {
   range_operator *signed_op = ptr_op_widen_mult_signed;
   range_operator *unsigned_op = ptr_op_widen_mult_unsigned;
+  bool signed1, signed2, signed_ret;
   if (gimple_code (m_stmt) == GIMPLE_ASSIGN)
 switch (gimple_assign_rhs_code (m_stmt))
   {
@@ -1196,32 +1197,58 @@ gimple_range_op_handler::maybe_non_standard ()
  m_op1 = gimple_assign_rhs1 (m_stmt);
  m_op2 = gimple_assign_rhs2 (m_stmt);
  tree ret = gimple_assign_lhs (m_stmt);
- bool signed1 = TYPE_SIGN (TREE_TYPE (m_op1)) == SIGNED;
- bool signed2 = TYPE_SIGN (TREE_TYPE (m_op2)) == SIGNED;
- bool signed_ret = TYPE_SIGN (TREE_TYPE (ret)) == SIGNED;
-
- /* Normally these operands should all have the same sign, but
-some passes and violate this by taking mismatched sign args.  At
-the moment the only one that's possible is mismatch inputs and
-unsigned output.  Once ranger supports signs for the operands we
-can properly fix it,  for now only accept the case we can do
-correctly.  */
- if ((signed1 ^ signed2) && signed_ret)
-   return;
-
- m_valid = true;
- if (signed2 && !signed1)
-   std::swap (m_op1, m_op2);
-
- if (signed1 || signed2)
-   m_int = signed_op;
- else
-   m_int = unsigned_op;
+ signed1 = TYPE_SIGN (TREE_TYPE (m_op1)) == SIGNED;
+ signed2 = TYPE_SIGN (TREE_TYPE (m_op2)) == SIGNED;
+ signed_ret = TYPE_SIGN (TREE_TYPE (ret)) == SIGNED;
  break;
}
default:
- break;
+ return;
+  }
+  else if (gimple_code (m_stmt) == GIMPLE_CALL
+  && gimple_call_internal_p (m_stmt)
+  && gimple_get_lhs (m_stmt) != NULL_TREE)
+switch (gimple_call_internal_fn (m_stmt))
+  {
+  case IFN_VEC_WIDEN_PLUS:
+  case IFN_VEC_WIDEN_PLUS_LO:
+  case IFN_VEC_WIDEN_PLUS_HI:
+  case IFN_VEC_WIDEN_PLUS_EVEN:
+  case IFN_VEC_WIDEN_PLUS_ODD:
+ {
+   signed_op = ptr_op_widen_plus_signed;
+   unsigned_op = ptr_op_widen_plus_unsigned;
+   m_valid = false;
+   m_op1 = gimple_call_arg (m_stmt, 0);
+   m_op2 = gimple_call_arg (m_stmt, 1);
+   tree ret = gimple_get_lhs (m_stmt);
+   signed1 = TYPE_SIGN (TREE_TYPE (m_op1)) == SIGNED;
+   signed2 = TYPE_SIGN (TREE_TYPE (m_op2)) == SIGNED;
+   signed_ret = TYPE_SIGN (TREE_TYPE (ret)) == SIGNED;
+   break;
+ }
+  default:
+   return;
   }
+  else
+return;
+
+  /* Normally these operands should all have the same sign, but some passes
+ and violate this by taking mismatched sign args.  At the moment the only
+ one that's possible is mismatch inputs and unsigned output.  Once ranger
+ supports signs for the operands we can properly fix it,  for now only
+ accept the case we can do correctly.  */
+  if ((signed1 ^ signed2) && signed_ret)
+return;
+
+  m_valid = true;
+  if (signed2 && !signed1)
+std::swap (m_op1, m_op2);
+
+  if (signed1 || signed2)
+m_int = signed_op;
+  else
+m_int = unsigned_op;
 }
 
 // Set up a gimple_range_op_handler for any built in function which can be


[PATCH] inline: improve internal function costs

2023-06-01 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This is a follow-up of the internal function patch to add widening and 
narrowing patterns.  This patch improves the inliner cost estimation for 
internal functions.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

gcc/ChangeLog:

* ipa-fnsummary.cc (analyze_function_body): Correctly handle
non-zero costed internal functions.
* tree-inline.cc (estimate_num_insns): Improve costing for internal
functions.

diff --git a/gcc/ipa-fnsummary.cc b/gcc/ipa-fnsummary.cc
index 
b328bb8ce14b0725f6e5607da9d1e2f61e9baf62..449961fe44e4d86bf61e625dff0759d58e1e80ba
 100644
--- a/gcc/ipa-fnsummary.cc
+++ b/gcc/ipa-fnsummary.cc
@@ -2862,16 +2862,19 @@ analyze_function_body (struct cgraph_node *node, bool 
early)
 to happen, but we cannot do that for call statements
 because edges are accounted specially.  */
 
- if (*(is_gimple_call (stmt) ? _predicate : ) != false)
+ if (*(is_gimple_call (stmt) && !gimple_call_internal_p (stmt)
+   ? _predicate : ) != false)
{
  time += final_time;
  size += this_size;
}
 
  /* We account everything but the calls.  Calls have their own
-size/time info attached to cgraph edges.  This is necessary
-in order to make the cost disappear after inlining.  */
- if (!is_gimple_call (stmt))
+size/time info attached to cgraph edges.  This is necessary
+in order to make the cost disappear after inlining.  The only
+exceptions are internal calls.  */
+ if (!is_gimple_call (stmt)
+ || gimple_call_internal_p (stmt))
{
  if (prob)
{
diff --git a/gcc/tree-inline.cc b/gcc/tree-inline.cc
index 
99efddc36c8906a797583a569424336e961c35d1..bac84d277254703369c27993dcad048de8d4ff70
 100644
--- a/gcc/tree-inline.cc
+++ b/gcc/tree-inline.cc
@@ -4427,7 +4427,48 @@ estimate_num_insns (gimple *stmt, eni_weights *weights)
tree decl;
 
if (gimple_call_internal_p (stmt))
- return 0;
+ {
+   switch (gimple_call_internal_fn (stmt))
+ {
+ default:
+   return 1;
+
+ case IFN_GOMP_TARGET_REV:
+ case IFN_GOMP_USE_SIMT:
+ case IFN_GOMP_SIMT_ENTER_ALLOC:
+ case IFN_GOMP_SIMT_EXIT:
+ case IFN_GOMP_SIMT_LANE:
+ case IFN_GOMP_SIMT_VF:
+ case IFN_GOMP_SIMT_LAST_LANE:
+ case IFN_GOMP_SIMT_ORDERED_PRED:
+ case IFN_GOMP_SIMT_VOTE_ANY:
+ case IFN_GOMP_SIMT_XCHG_BFLY:
+ case IFN_GOMP_SIMT_XCHG_IDX:
+ case IFN_GOMP_SIMD_LANE:
+ case IFN_GOMP_SIMD_VF:
+ case IFN_GOMP_SIMD_LAST_LANE:
+ case IFN_GOMP_SIMD_ORDERED_START:
+ case IFN_GOMP_SIMD_ORDERED_END:
+ case IFN_BUILTIN_EXPECT:
+ case IFN_ANNOTATE:
+ case IFN_NOP:
+ case IFN_UNIQUE:
+ case IFN_DEFERRED_INIT:
+ case IFN_ASSUME:
+   return 0;
+
+ case IFN_UBSAN_NULL:
+ case IFN_UBSAN_BOUNDS:
+ case IFN_UBSAN_VPTR:
+ case IFN_UBSAN_CHECK_ADD:
+ case IFN_UBSAN_CHECK_SUB:
+ case IFN_UBSAN_CHECK_MUL:
+ case IFN_UBSAN_PTR:
+ case IFN_UBSAN_OBJECT_SIZE:
+   /* Estimating a compare and jump.  */
+   return 2;
+ }
+ }
else if ((decl = gimple_call_fndecl (stmt))
 && fndecl_built_in_p (decl))
  {


Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-06-01 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This is the updated patch and cover letter. Patches for inline and 
gimple-op changes will follow soon.


DEF_INTERNAL_WIDENING_OPTAB_FN and DEF_INTERNAL_NARROWING_OPTAB_FN 
are like DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN 
respectively, with the exception that they provide convenience wrappers 
for a single vector-to-vector conversion, a hi/lo split or an even/odd 
split.  Each definition will require either a signed and an unsigned 
optab (for widening) or a single optab (for narrowing) for each of the 
five functions it creates.


For example, for widening addition DEF_INTERNAL_WIDENING_OPTAB_FN will 
create five internal functions: IFN_VEC_WIDEN_PLUS, IFN_VEC_WIDEN_PLUS_HI, 
IFN_VEC_WIDEN_PLUS_LO, IFN_VEC_WIDEN_PLUS_EVEN and IFN_VEC_WIDEN_PLUS_ODD, 
each requiring two optabs, one for signed and one for unsigned.

 Aarch64 implements the hi/lo split optabs:
 IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_add_hi_ -> (u/s)addl2
 IFN_VEC_WIDEN_PLUS_LO  -> vec_widen_add_lo_ -> (u/s)addl

This gives the same functionality as the previous 
WIDEN_PLUS/WIDEN_MINUS tree codes which are expanded into 
VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.
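
As an illustration (not the exact testcase), the kind of loop the new 
tests look for is:

#include <stdint.h>

void
widen_add (int32_t *restrict out, const int16_t *restrict a,
	   const int16_t *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = (int32_t) a[i] + (int32_t) b[i];
}

At -O3 on aarch64 this is expected to match vect_recog_widen_plus_pattern, 
be vectorized via IFN_VEC_WIDEN_PLUS_LO/IFN_VEC_WIDEN_PLUS_HI and end up 
using saddl/saddl2.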


gcc/ChangeLog:

2023-04-25  Andre Vieira  
Joel Hutton  
Tamar Christina  

* config/aarch64/aarch64-simd.md 
(vec_widen_addl_lo_): Rename

this ...
(vec_widen_add_lo_): ... to this.
(vec_widen_addl_hi_): Rename this ...
(vec_widen_add_hi_): ... to this.
(vec_widen_subl_lo_): Rename this ...
(vec_widen_sub_lo_): ... to this.
(vec_widen_subl_hi_): Rename this ...
(vec_widen_sub_hi_): ...to this.
* doc/generic.texi: Document new IFN codes.
	* internal-fn.cc (ifn_cmp): Function to compare ifn's for 
sorting/searching.

(lookup_hilo_internal_fn): Add lookup function.
(commutative_binary_fn_p): Add widen_plus fn's.
(widening_fn_p): New function.
(narrowing_fn_p): New function.
(direct_internal_fn_optab): Change visibility.
* internal-fn.def (DEF_INTERNAL_WIDENING_OPTAB_FN): Macro to define an
internal_fn that expands into multiple internal_fns for widening.
(DEF_INTERNAL_NARROWING_OPTAB_FN): Likewise but for narrowing.
(IFN_VEC_WIDEN_PLUS, IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO,
 IFN_VEC_WIDEN_PLUS_EVEN, IFN_VEC_WIDEN_PLUS_ODD,
 IFN_VEC_WIDEN_MINUS, IFN_VEC_WIDEN_MINUS_HI, 
IFN_VEC_WIDEN_MINUS_LO,
 IFN_VEC_WIDEN_MINUS_ODD, IFN_VEC_WIDEN_MINUS_EVEN): Define 
widening

plus,minus functions.
* internal-fn.h (direct_internal_fn_optab): Declare new prototype.
(lookup_hilo_internal_fn): Likewise.
(widening_fn_p): Likewise.
(Narrowing_fn_p): Likewise.
* optabs.cc (commutative_optab_p): Add widening plus optabs.
* optabs.def (OPTAB_D): Define widen add, sub optabs.
* tree-vect-patterns.cc (vect_recog_widen_op_pattern): Support
patterns with a hi/lo or even/odd split.
(vect_recog_sad_pattern): Refactor to use new IFN codes.
(vect_recog_widen_plus_pattern): Likewise.
(vect_recog_widen_minus_pattern): Likewise.
(vect_recog_average_pattern): Likewise.
* tree-vect-stmts.cc (vectorizable_conversion): Add support for
_HILO IFNs.
(supportable_widening_operation): Likewise.
* tree.def (WIDEN_SUM_EXPR): Update example to use new IFNs.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vect-widen-add.c: Test that new
IFN_VEC_WIDEN_PLUS is being used.
* gcc.target/aarch64/vect-widen-sub.c: Test that new
IFN_VEC_WIDEN_MINUS is being used.

On 22/05/2023 14:06, Richard Biener wrote:

On Thu, 18 May 2023, Andre Vieira (lists) wrote:


How about this?

Not sure about the DEF_INTERNAL documentation I rewrote in internal-fn.def,
was struggling to word these, so improvements welcome!


The even/odd variant optabs are also commutative_optab_p, so is
the vec_widen_sadd without hi/lo or even/odd.

+/* { dg-options "-O3 -save-temps -fdump-tree-vect-all" } */

do you really want -all?  I think you want -details

+  else if (widening_fn_p (ifn)
+  || narrowing_fn_p (ifn))
+   {
+ tree lhs = gimple_get_lhs (stmt);
+ if (!lhs)
+   {
+ error ("vector IFN call with no lhs");
+ debug_generic_stmt (fn);

that's an error because ...?  Maybe we want to verify this
for all ECF_CONST|ECF_NOTHROW (or pure instead of const) internal
function calls, but I wouldn't add any verification as part
of this patch (not special to widening/narrowing fns either).

 if (gimple_call_internal_p (stmt))
- return 0;
+ {
+   internal_fn fn = gimple_call_internal_fn (stmt);
+   switch (fn)
+ {
+ case IFN_VEC_WIDEN_PLUS_HI:
+ case IFN_VEC_WIDEN_PLUS_LO:
+

Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-18 Thread Andre Vieira (lists) via Gcc-patches

How about this?

Not sure about the DEF_INTERNAL documentation I rewrote in 
internal-fn.def; I was struggling to word these, so improvements are welcome!


gcc/ChangeLog:

2023-04-25  Andre Vieira  
Joel Hutton  
Tamar Christina  

* config/aarch64/aarch64-simd.md 
(vec_widen_addl_lo_): Rename

this ...
(vec_widen_add_lo_): ... to this.
(vec_widen_addl_hi_): Rename this ...
(vec_widen_add_hi_): ... to this.
(vec_widen_subl_lo_): Rename this ...
(vec_widen_sub_lo_): ... to this.
(vec_widen_subl_hi_): Rename this ...
(vec_widen_sub_hi_): ...to this.
* doc/generic.texi: Document new IFN codes.
	* internal-fn.cc (ifn_cmp): Function to compare ifn's for 
sorting/searching.

(lookup_hilo_internal_fn): Add lookup function.
(commutative_binary_fn_p): Add widen_plus fn's.
(widening_fn_p): New function.
(narrowing_fn_p): New function.
(direct_internal_fn_optab): Change visibility.
* internal-fn.def (DEF_INTERNAL_WIDENING_OPTAB_FN): Macro to define an
internal_fn that expands into multiple internal_fns for widening.
(DEF_INTERNAL_NARROWING_OPTAB_FN): Likewise but for narrowing.
(IFN_VEC_WIDEN_PLUS, IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO,
 IFN_VEC_WIDEN_PLUS_EVEN, IFN_VEC_WIDEN_PLUS_ODD,
 IFN_VEC_WIDEN_MINUS, IFN_VEC_WIDEN_MINUS_HI, 
IFN_VEC_WIDEN_MINUS_LO,
 IFN_VEC_WIDEN_MINUS_ODD, IFN_VEC_WIDEN_MINUS_EVEN): Define 
widening

plus,minus functions.
* internal-fn.h (direct_internal_fn_optab): Declare new prototype.
(lookup_hilo_internal_fn): Likewise.
(widening_fn_p): Likewise.
(Narrowing_fn_p): Likewise.
* optabs.cc (commutative_optab_p): Add widening plus optabs.
* optabs.def (OPTAB_D): Define widen add, sub optabs.
* tree-cfg.cc (verify_gimple_call): Add checks for widening ifns.
* tree-inline.cc (estimate_num_insns): Return same
cost for widen add and sub IFNs as previous tree_codes.
* tree-vect-patterns.cc (vect_recog_widen_op_pattern): Support
patterns with a hi/lo or even/odd split.
(vect_recog_sad_pattern): Refactor to use new IFN codes.
(vect_recog_widen_plus_pattern): Likewise.
(vect_recog_widen_minus_pattern): Likewise.
(vect_recog_average_pattern): Likewise.
* tree-vect-stmts.cc (vectorizable_conversion): Add support for
_HILO IFNs.
(supportable_widening_operation): Likewise.
* tree.def (WIDEN_SUM_EXPR): Update example to use new IFNs.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vect-widen-add.c: Test that new
IFN_VEC_WIDEN_PLUS is being used.
* gcc.target/aarch64/vect-widen-sub.c: Test that new
IFN_VEC_WIDEN_MINUS is being used.

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
bfc98a8d943467b33390defab9682f44efab5907..ffbbecb9409e1c2835d658c2a8855cd0e955c0f2
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4626,7 +4626,7 @@
   [(set_attr "type" "neon__long")]
 )
 
-(define_expand "vec_widen_addl_lo_"
+(define_expand "vec_widen_add_lo_"
   [(match_operand: 0 "register_operand")
(ANY_EXTEND: (match_operand:VQW 1 "register_operand"))
(ANY_EXTEND: (match_operand:VQW 2 "register_operand"))]
@@ -4638,7 +4638,7 @@
   DONE;
 })
 
-(define_expand "vec_widen_addl_hi_"
+(define_expand "vec_widen_add_hi_"
   [(match_operand: 0 "register_operand")
(ANY_EXTEND: (match_operand:VQW 1 "register_operand"))
(ANY_EXTEND: (match_operand:VQW 2 "register_operand"))]
@@ -4650,7 +4650,7 @@
   DONE;
 })
 
-(define_expand "vec_widen_subl_lo_"
+(define_expand "vec_widen_sub_lo_"
   [(match_operand: 0 "register_operand")
(ANY_EXTEND: (match_operand:VQW 1 "register_operand"))
(ANY_EXTEND: (match_operand:VQW 2 "register_operand"))]
@@ -4662,7 +4662,7 @@
   DONE;
 })
 
-(define_expand "vec_widen_subl_hi_"
+(define_expand "vec_widen_sub_hi_"
   [(match_operand: 0 "register_operand")
(ANY_EXTEND: (match_operand:VQW 1 "register_operand"))
(ANY_EXTEND: (match_operand:VQW 2 "register_operand"))]
diff --git a/gcc/doc/generic.texi b/gcc/doc/generic.texi
index 
8b2882da4fe7da07d22b4e5384d049ba7d3907bf..5e36dac2b1a10257616f12cdfb0b12d0f2879ae9
 100644
--- a/gcc/doc/generic.texi
+++ b/gcc/doc/generic.texi
@@ -1811,10 +1811,16 @@ a value from @code{enum annot_expr_kind}, the third is 
an @code{INTEGER_CST}.
 @tindex VEC_RSHIFT_EXPR
 @tindex VEC_WIDEN_MULT_HI_EXPR
 @tindex VEC_WIDEN_MULT_LO_EXPR
-@tindex VEC_WIDEN_PLUS_HI_EXPR
-@tindex VEC_WIDEN_PLUS_LO_EXPR
-@tindex VEC_WIDEN_MINUS_HI_EXPR
-@tindex VEC_WIDEN_MINUS_LO_EXPR
+@tindex IFN_VEC_WIDEN_PLUS
+@tindex IFN_VEC_WIDEN_PLUS_HI
+@tindex IFN_VEC_WIDEN_PLUS_LO
+@tindex IFN_VEC_WIDEN_PLUS_EVEN
+@tindex IFN_VEC_WIDEN_PLUS_ODD
+@tindex IFN_VEC_WIDEN_MINUS
+@tindex IFN_VEC_WIDEN_MINUS_HI
+@tindex 

Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-15 Thread Andre Vieira (lists) via Gcc-patches



On 15/05/2023 12:01, Richard Biener wrote:

On Mon, 15 May 2023, Richard Sandiford wrote:


Richard Biener  writes:

On Fri, 12 May 2023, Richard Sandiford wrote:


Richard Biener  writes:

On Fri, 12 May 2023, Andre Vieira (lists) wrote:


I have dealt with, I think..., most of your comments. There's quite a few
changes, I think it's all a bit simpler now. I made some other changes to the
costing in tree-inline.cc and gimple-range-op.cc in which I try to preserve
the same behaviour as we had with the tree codes before. Also added some extra
checks to tree-cfg.cc that made sense to me.

I am still regression testing the gimple-range-op change, as that was a last
minute change, but the rest survived a bootstrap and regression test on
aarch64-unknown-linux-gnu.

cover letter:

This patch replaces the existing tree_code widen_plus and widen_minus
patterns with internal_fn versions.

DEF_INTERNAL_OPTAB_WIDENING_HILO_FN and DEF_INTERNAL_OPTAB_NARROWING_HILO_FN
are like DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN respectively
except they provide convenience wrappers for defining conversions that require
a hi/lo split.  Each definition for  will require optabs for _hi and _lo
and each of those will also require a signed and unsigned version in the case
of widening. The hi/lo pair is necessary because the widening and narrowing
operations take n narrow elements as inputs and return n/2 wide elements as
outputs. The 'lo' operation operates on the first n/2 elements of input. The
'hi' operation operates on the second n/2 elements of input. Defining an
internal_fn along with hi/lo variations allows a single internal function to
be returned from a vect_recog function that will later be expanded to hi/lo.


  For example:
  IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
for aarch64: IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_add_hi_ ->
(u/s)addl2
IFN_VEC_WIDEN_PLUS_LO  -> vec_widen_add_lo_
-> (u/s)addl

This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS tree
codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.
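
To make the n versus n/2 shape concrete, here is a small standalone C
illustration of just the lane semantics described above (my own example,
not GCC code or the patch's output): eight int16_t elements per operand
produce two vectors of four int32_t results, one from the low half and
one from the high half.

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  /* n = 8 narrow elements per operand.  */
  int16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int16_t b[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  /* Each widening add produces n/2 = 4 wide elements.  */
  int32_t lo[4], hi[4];

  for (int i = 0; i < 4; i++)
    {
      lo[i] = (int32_t) a[i] + b[i];          /* 'lo': elements 0..3  */
      hi[i] = (int32_t) a[i + 4] + b[i + 4];  /* 'hi': elements 4..7  */
    }

  for (int i = 0; i < 4; i++)
    printf ("lo[%d] = %d  hi[%d] = %d\n", i, lo[i], i, hi[i]);
  return 0;
}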


What I still don't understand is how we are so narrowly focused on
HI/LO?  We need a combined scalar IFN for pattern selection (not
sure why that's now called _HILO, I expected no suffix).  Then there's
three possibilities the target can implement this:

  1) with a widen_[su]add instruction - I _think_ that's what
 RISCV is going to offer since it is a target where vector modes
 have "padding" (aka you cannot subreg a V2SI to get V4HI).  Instead
 RVV can do a V4HI to V4SI widening and widening add/subtract
 using vwadd[u] and vwsub[u] (the HI->SI widening is actually
 done with a widening add of zero - eh).
 IIRC GCN is the same here.


SVE currently does this too, but the addition and widening are
separate operations.  E.g. in principle there's no reason why
you can't sign-extend one operand, zero-extend the other, and
then add the result together.  Or you could extend them from
different sizes (QI and HI).  All of those are supported
(if the costing allows them).
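
A hand-written illustration of that mixed case (mixed_widen_add is my own
name, not taken from the patch or testsuite): one operand is a signed
8-bit value, the other an unsigned 16-bit value, and both are extended
before an ordinary 32-bit addition, so no single same-sign, same-size
widen-add pattern covers it on its own.

#include <stdint.h>

void
mixed_widen_add (int32_t *restrict out, const int8_t *restrict a,
                 const uint16_t *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    /* a[i] is sign-extended, b[i] is zero-extended, and the addition
       itself is an ordinary 32-bit add.  */
    out[i] = a[i] + b[i];
}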


I see.  So why does the target expose widen_[su]add at all?


It shouldn't (need to) do that.  I don't think we should have an optab
for the unsplit operation.

At least on SVE, we really want the extensions to be fused with loads
(where possible) rather than with arithmetic.

We can still do the widening arithmetic in one go.  It's just that
fusing with the loads works for the mixed-sign and mixed-size cases,
and can handle more than just doubling the element size.


If the target has operations to do combined extending and adding (or
whatever), then at the moment we rely on combine to generate them.

So I think this case is separate from Andre's work.  The addition
itself is just an ordinary addition, and any widening happens by
vectorising a CONVERT/NOP_EXPR.


  2) with a widen_[su]add{_lo,_hi} combo - that's what the tree
 codes currently support (exclusively)
  3) similar, but widen_[su]add{_even,_odd}

that said, things like decomposes_to_hilo_fn_p look to paint us into
a 2) corner without good reason.
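
For reference, the even/odd split in 3) partitions the same narrow lanes
differently from the hi/lo split in 2); a small standalone C sketch of
just the lane selection (my own illustration, not how any target or GCC
implements it):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  int16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int16_t b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
  int32_t even[4], odd[4];

  for (int i = 0; i < 4; i++)
    {
      even[i] = (int32_t) a[2 * i] + b[2 * i];          /* lanes 0,2,4,6 */
      odd[i]  = (int32_t) a[2 * i + 1] + b[2 * i + 1];  /* lanes 1,3,5,7 */
    }

  for (int i = 0; i < 4; i++)
    printf ("even[%d] = %d  odd[%d] = %d\n", i, even[i], i, odd[i]);
  return 0;
}

Both decompositions compute the same multiset of wide results; they only
differ in which narrow lanes each half-operation consumes.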


I suppose one question is: how much of the patch is really specific
to HI/LO, and how much is just grouping two halves together?


Yep, that I don't know for sure.


  The nice
thing about the internal-fn grouping macros is that, if (3) is
implemented in future, the structure will strongly encourage even/odd
pairs to be supported for all operations that support hi/lo.  That is,
I would expect the grouping macros to be extended to define even/odd
ifns alongside hi/lo ones, rather than adding separate definitions
for even/odd functions.

If so, at least from the internal-fn.* side of things, I think the question
is whether it's OK to stick with hilo names for now, or whether we should
use more forward-looking names.


I think for parts that are independent we could use a more
forward-looking n

Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches




On 12/05/2023 14:28, Richard Biener wrote:

On Fri, 12 May 2023, Andre Vieira (lists) wrote:


I have dealt with, I think..., most of your comments. There's quite a few
changes, I think it's all a bit simpler now. I made some other changes to the
costing in tree-inline.cc and gimple-range-op.cc in which I try to preserve
the same behaviour as we had with the tree codes before. Also added some extra
checks to tree-cfg.cc that made sense to me.

I am still regression testing the gimple-range-op change, as that was a last
minute change, but the rest survived a bootstrap and regression test on
aarch64-unknown-linux-gnu.

cover letter:

This patch replaces the existing tree_code widen_plus and widen_minus
patterns with internal_fn versions.

DEF_INTERNAL_OPTAB_WIDENING_HILO_FN and DEF_INTERNAL_OPTAB_NARROWING_HILO_FN
are like DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN respectively
except they provide convenience wrappers for defining conversions that require
a hi/lo split.  Each definition for  will require optabs for _hi and _lo
and each of those will also require a signed and unsigned version in the case
of widening. The hi/lo pair is necessary because the widening and narrowing
operations take n narrow elements as inputs and return n/2 wide elements as
outputs. The 'lo' operation operates on the first n/2 elements of input. The
'hi' operation operates on the second n/2 elements of input. Defining an
internal_fn along with hi/lo variations allows a single internal function to
be returned from a vect_recog function that will later be expanded to hi/lo.


  For example:
  IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
for aarch64: IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_add_hi_ ->
(u/s)addl2
IFN_VEC_WIDEN_PLUS_LO  -> vec_widen_add_lo_
-> (u/s)addl

This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS tree
codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.


What I still don't understand is how we are so narrowly focused on
HI/LO?  We need a combined scalar IFN for pattern selection (not
sure why that's now called _HILO, I expected no suffix).  Then there's
three possibilities the target can implement this:

  1) with a widen_[su]add instruction - I _think_ that's what
 RISCV is going to offer since it is a target where vector modes
 have "padding" (aka you cannot subreg a V2SI to get V4HI).  Instead
 RVV can do a V4HI to V4SI widening and widening add/subtract
 using vwadd[u] and vwsub[u] (the HI->SI widening is actually
 done with a widening add of zero - eh).
 IIRC GCN is the same here.
  2) with a widen_[su]add{_lo,_hi} combo - that's what the tree
 codes currently support (exclusively)
  3) similar, but widen_[su]add{_even,_odd}

that said, things like decomposes_to_hilo_fn_p look to paint us into
a 2) corner without good reason.


I was kind of just keeping the naming; I had forgotten to mention that I 
was also going to add _EVENODD, but you are right, the pattern selection 
IFN does not need to be restrictive.


And then at supportable_widening_operation we could check what the 
target offers support for (either 1, 2 or 3). We can then actually just 
get rid of decomposes_to_hilo_fn_p and assume that for all narrowing or 
widening IFNs there are optabs (that may or may not be implemented by a 
target) for all three variants.


Having said that, that means we should have an optab to cover 1, which 
should probably just have the original name. Let me write it out...


Say we have an IFN_VEC_WIDEN_PLUS pattern and assume it's signed; 
supportable_widening_operation would then first check whether the target 
supports vec_widen_sadd_optab for, say, V8HI -> V8SI? RISC-V would take 
this path, I guess?


If the target doesn't then it could check for support for:
vec_widen_sadd_lo_optab V4HI -> V4SI
vec_widen_sadd_hi_optab V4HI -> V4SI

AArch64 Advanced SIMD would implement this.

If the target still didn't support this it would check for (not sure 
about the modes here):

vec_widen_sadd_even_optab VNx8HI -> VNx4SI
vec_widen_sadd_odd_optab VNx8HI -> VNx4SI

This is the one SVE would implement.
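
Put together, the proposed query order could look roughly like the
self-contained sketch below. target_supports and the string optab names
are made-up stand-ins for the real optab machinery, and the pretend
target, like Advanced SIMD, only provides the hi/lo pair:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Pretend target: only the hi/lo pair is implemented.  */
static bool
target_supports (const char *optab)
{
  return strcmp (optab, "vec_widen_sadd_lo") == 0
         || strcmp (optab, "vec_widen_sadd_hi") == 0;
}

enum widen_kind { WIDEN_NONE, WIDEN_SINGLE, WIDEN_HILO, WIDEN_EVENODD };

/* Query order sketched above, for a signed IFN_VEC_WIDEN_PLUS.  */
static enum widen_kind
pick_widen_plus_lowering (void)
{
  if (target_supports ("vec_widen_sadd"))            /* 1) single optab   */
    return WIDEN_SINGLE;
  if (target_supports ("vec_widen_sadd_lo")
      && target_supports ("vec_widen_sadd_hi"))      /* 2) hi/lo pair     */
    return WIDEN_HILO;
  if (target_supports ("vec_widen_sadd_even")
      && target_supports ("vec_widen_sadd_odd"))     /* 3) even/odd pair  */
    return WIDEN_EVENODD;
  return WIDEN_NONE;
}

int
main (void)
{
  enum widen_kind k = pick_widen_plus_lowering ();
  printf ("chosen lowering: %s\n",
          k == WIDEN_SINGLE ? "single"
          : k == WIDEN_HILO ? "hi/lo"
          : k == WIDEN_EVENODD ? "even/odd" : "none");
  return 0;
}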


So that would mean that I'd probably end up rewriting
#define DEF_INTERNAL_OPTAB_WIDENING_FN (NAME, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)

as:
for 1)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME, FLAGS, SELECTOR, SOPTAB, UOPTAB, 
TYPE)


for 2)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_LO, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_HI, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)


for 3)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_EVEN, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_ODD, FLAGS, SELECTOR, SOPTAB, 
UOPTAB, TYPE)


And the same for narrowing (but with DEF_INTERNAL_OPTAB_FN instead of 
SIGNED_OPTAB).
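
A toy, compilable version of that fan-out is below. DEF_WIDENING_FN and
internal_fn_sketch are invented names, and the inner macro here only
creates enum entries; the real DEF_INTERNAL_SIGNED_OPTAB_FN takes more
arguments and does far more, so treat this purely as an illustration of
the NAME / NAME##_LO / NAME##_HI / NAME##_EVEN / NAME##_ODD expansion.

#include <stdio.h>

/* Toy stand-in: the real macro also takes FLAGS, SELECTOR, optabs, TYPE.  */
#define DEF_INTERNAL_SIGNED_OPTAB_FN(NAME) IFN_##NAME,

/* One wrapper definition fans out into 1) the combined fn, 2) the hi/lo
   pair and 3) the even/odd pair.  */
#define DEF_WIDENING_FN(NAME) \
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME) \
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_LO) \
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_HI) \
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_EVEN) \
  DEF_INTERNAL_SIGNED_OPTAB_FN (NAME##_ODD)

enum internal_fn_sketch
{
  DEF_WIDENING_FN (VEC_WIDEN_PLUS)
  DEF_WIDENING_FN (VEC_WIDEN_MINUS)
  IFN_SKETCH_LAST
};

int
main (void)
{
  printf ("%d variant ids defined before IFN_SKETCH_LAST\n",
          (int) IFN_SKETCH_LAST);
  return 0;
}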


So each widening and narrowing IFN would have optabs for all its 
variants and each target impleme

Re: [PATCH 3/3] Remove widen_plus/minus_expr tree codes

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches
Moved the 'changes' from this patch back to the second so it's all just 
about removing code that we no longer use. I don't really know why Joel 
formatted the patches this way, but I thought I'd keep it as is for now.


cover letter:

This patch removes the old widen plus/minus tree codes which have been
replaced by internal functions.

gcc/ChangeLog:

2023-05-12  Andre Vieira  
Joel Hutton  

* cfgexpand.cc (expand_debug_expr): Remove old tree codes.
* doc/generic.texi: Likewise.
* expr.cc (expand_expr_real_2): Likewise.
* gimple-pretty-print.cc (dump_binary_rhs): Likewise.
* gimple-range-op.cc (gimple_range_op_handler::maybe_non_standard):
Likewise.
* optabs-tree.cc (optab_for_tree_code): Likewise.
(supportable_half_widening_operation): Likewise.
* optabs.cc (commutative_optab_p): Likewise.
* optabs.def (OPTAB_D): Likewise.
* tree-cfg.cc (verify_gimple_assign_binary): Likewise.
* tree-inline.cc (estimate_operator_cost): Likewise.
(op_symbol_code): Likewise.
* tree-pretty-print.cc (dump_generic_node): Remove tree code definition.
* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Likewise.
(vect_analyze_data_ref_accesses): Likewise.
* tree-vect-generic.cc (expand_vector_operations_1): Likewise.
* tree-vect-stmts.cc (vectorizable_conversion): Likewise.
(supportable_widening_operation): Likewise.
* tree.def (WIDEN_PLUS_EXPR, WIDEN_MINUS_EXPR, VEC_WIDEN_PLUS_HI_EXPR,
VEC_WIDEN_PLUS_LO_EXPR, VEC_WIDEN_MINUS_HI_EXPR,
VEC_WIDEN_MINUS_LO_EXPR): Likewise.

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index 
1a1b26b1c6c23ce273bcd08dc9a973f777174007..25b1558dcb941ea491a19aeeb2cd8f4d2dbdf7c6
 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -5365,10 +5365,6 @@ expand_debug_expr (tree exp)
 case VEC_WIDEN_MULT_ODD_EXPR:
 case VEC_WIDEN_LSHIFT_HI_EXPR:
 case VEC_WIDEN_LSHIFT_LO_EXPR:
-case VEC_WIDEN_PLUS_HI_EXPR:
-case VEC_WIDEN_PLUS_LO_EXPR:
-case VEC_WIDEN_MINUS_HI_EXPR:
-case VEC_WIDEN_MINUS_LO_EXPR:
 case VEC_PERM_EXPR:
 case VEC_DUPLICATE_EXPR:
 case VEC_SERIES_EXPR:
@@ -5405,8 +5401,6 @@ expand_debug_expr (tree exp)
 case WIDEN_MULT_EXPR:
 case WIDEN_MULT_PLUS_EXPR:
 case WIDEN_MULT_MINUS_EXPR:
-case WIDEN_PLUS_EXPR:
-case WIDEN_MINUS_EXPR:
   if (SCALAR_INT_MODE_P (GET_MODE (op0))
  && SCALAR_INT_MODE_P (mode))
{
@@ -5419,10 +5413,6 @@ expand_debug_expr (tree exp)
op1 = simplify_gen_unary (ZERO_EXTEND, mode, op1, inner_mode);
  else
op1 = simplify_gen_unary (SIGN_EXTEND, mode, op1, inner_mode);
- if (TREE_CODE (exp) == WIDEN_PLUS_EXPR)
-   return simplify_gen_binary (PLUS, mode, op0, op1);
- else if (TREE_CODE (exp) == WIDEN_MINUS_EXPR)
-   return simplify_gen_binary (MINUS, mode, op0, op1);
  op0 = simplify_gen_binary (MULT, mode, op0, op1);
  if (TREE_CODE (exp) == WIDEN_MULT_EXPR)
return op0;
diff --git a/gcc/doc/generic.texi b/gcc/doc/generic.texi
index 
0fd7e6cce8bbd4ecb8027b702722adcf6c32eb55..a23d57af20610e0bb4809f06fb0c91253ae56d11
 100644
--- a/gcc/doc/generic.texi
+++ b/gcc/doc/generic.texi
@@ -1815,10 +1815,6 @@ a value from @code{enum annot_expr_kind}, the third is 
an @code{INTEGER_CST}.
 @tindex IFN_VEC_WIDEN_PLUS_LO
 @tindex IFN_VEC_WIDEN_MINUS_HI
 @tindex IFN_VEC_WIDEN_MINUS_LO
-@tindex VEC_WIDEN_PLUS_HI_EXPR
-@tindex VEC_WIDEN_PLUS_LO_EXPR
-@tindex VEC_WIDEN_MINUS_HI_EXPR
-@tindex VEC_WIDEN_MINUS_LO_EXPR
 @tindex VEC_UNPACK_HI_EXPR
 @tindex VEC_UNPACK_LO_EXPR
 @tindex VEC_UNPACK_FLOAT_HI_EXPR
@@ -1892,33 +1888,6 @@ vector of @code{N/2} products.  In the case of
 vector are subtracted from the low @code{N/2} of the first to produce the
 vector of @code{N/2} products.
 
-@item VEC_WIDEN_PLUS_HI_EXPR
-@itemx VEC_WIDEN_PLUS_LO_EXPR
-These nodes represent widening vector addition of the high and low parts of
-the two input vectors, respectively.  Their operands are vectors that contain
-the same number of elements (@code{N}) of the same integral type. The result
-is a vector that contains half as many elements, of an integral type whose size
-is twice as wide.  In the case of @code{VEC_WIDEN_PLUS_HI_EXPR} the high
-@code{N/2} elements of the two vectors are added to produce the vector of
-@code{N/2} products.  In the case of @code{VEC_WIDEN_PLUS_LO_EXPR} the low
-@code{N/2} elements of the two vectors are added to produce the vector of
-@code{N/2} products.
-
-@item VEC_WIDEN_MINUS_HI_EXPR
-@itemx VEC_WIDEN_MINUS_LO_EXPR
-These nodes represent widening vector subtraction of the high and low parts of
-the two input vectors, respectively.  Their operands are vectors that contain
-the same number of elements (@code{N}) of the same integral type. The high/low
-elements of the second vector are subtracted from the high/low elements 

Re: [PATCH 2/3] Refactor widen_plus as internal_fn

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches
I have dealt with, I think..., most of your comments. There's quite a 
few changes, I think it's all a bit simpler now. I made some other 
changes to the costing in tree-inline.cc and gimple-range-op.cc in which 
I try to preserve the same behaviour as we had with the tree codes 
before. Also added some extra checks to tree-cfg.cc that made sense to me.


I am still regression testing the gimple-range-op change, as that was a 
last minute change, but the rest survived a bootstrap and regression 
test on aarch64-unknown-linux-gnu.


cover letter:

This patch replaces the existing tree_code widen_plus and widen_minus
patterns with internal_fn versions.

DEF_INTERNAL_OPTAB_WIDENING_HILO_FN and 
DEF_INTERNAL_OPTAB_NARROWING_HILO_FN are like 
DEF_INTERNAL_SIGNED_OPTAB_FN and DEF_INTERNAL_OPTAB_FN respectively 
except they provide convenience wrappers for defining conversions that 
require a hi/lo split.  Each definition for  will require optabs 
for _hi and _lo and each of those will also require a signed and 
unsigned version in the case of widening. The hi/lo pair is necessary 
because the widening and narrowing operations take n narrow elements as 
inputs and return n/2 wide elements as outputs. The 'lo' operation 
operates on the first n/2 elements of input. The 'hi' operation operates 
on the second n/2 elements of input. Defining an internal_fn along with 
hi/lo variations allows a single internal function to be returned from a 
vect_recog function that will later be expanded to hi/lo.



 For example:
 IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
for aarch64: IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_add_hi_ -> 
(u/s)addl2
   IFN_VEC_WIDEN_PLUS_LO  -> 
vec_widen_add_lo_ -> (u/s)addl


This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS 
tree codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.
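
For reference, a stand-alone loop of the shape this pattern targets
(widen_add is my own reduction, not the testsuite file listed below;
with vectorization enabled on aarch64 the expectation is that the lo/hi
patterns, and hence (u/s)addl/(u/s)addl2, are used, though the exact
code generation depends on compiler version and flags):

#include <stdint.h>

void
widen_add (int32_t *restrict out, const int16_t *restrict a,
           const int16_t *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    /* Each 16-bit element is widened to 32 bits before the add.  */
    out[i] = (int32_t) a[i] + b[i];
}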


gcc/ChangeLog:

2023-05-12  Andre Vieira  
Joel Hutton  
Tamar Christina  

* config/aarch64/aarch64-simd.md (vec_widen_addl_lo_): Rename this ...
(vec_widen_add_lo_): ... to this.
(vec_widen_addl_hi_): Rename this ...
(vec_widen_add_hi_): ... to this.
(vec_widen_subl_lo_): Rename this ...
(vec_widen_sub_lo_): ... to this.
(vec_widen_subl_hi_): Rename this ...
(vec_widen_sub_hi_): ...to this.
* doc/generic.texi: Document new IFN codes.
* internal-fn.cc (DEF_INTERNAL_OPTAB_WIDENING_HILO_FN): Macro to define 
an
internal_fn that expands into multiple internal_fns for widening.
(DEF_INTERNAL_OPTAB_NARROWING_HILO_FN): Likewise but for narrowing.
(ifn_cmp): Function to compare ifn's for sorting/searching.
(lookup_hilo_internal_fn): Add lookup function.
(commutative_binary_fn_p): Add widen_plus fn's.
(widening_fn_p): New function.
(narrowing_fn_p): New function.
(decomposes_to_hilo_fn_p): New function.
(direct_internal_fn_optab): Change visibility.
* internal-fn.def (DEF_INTERNAL_OPTAB_WIDENING_HILO_FN): Define widening
plus,minus functions.
(VEC_WIDEN_PLUS): Replacement for VEC_WIDEN_PLUS_EXPR tree code.
(VEC_WIDEN_MINUS): Replacement for VEC_WIDEN_MINUS_EXPR tree code.
* internal-fn.h (GCC_INTERNAL_FN_H): Add headers.
(direct_internal_fn_optab): Declare new prototype.
(lookup_hilo_internal_fn): Likewise.
(widening_fn_p): Likewise.
(narrowing_fn_p): Likewise.
(decomposes_to_hilo_fn_p): Likewise.
* optabs.cc (commutative_optab_p): Add widening plus optabs.
* optabs.def (OPTAB_D): Define widen add, sub optabs.
* tree-cfg.cc (verify_gimple_call): Add checks for new widen
add and sub IFNs.
* tree-inline.cc (estimate_num_insns): Return same
cost for widen add and sub IFNs as previous tree_codes.
* tree-vect-patterns.cc (vect_recog_widen_op_pattern): Support
patterns with a hi/lo split.
(vect_recog_sad_pattern): Refactor to use new IFN codes.
(vect_recog_widen_plus_pattern): Likewise.
(vect_recog_widen_minus_pattern): Likewise.
(vect_recog_average_pattern): Likewise.
* tree-vect-stmts.cc (vectorizable_conversion): Add support for
_HILO IFNs.
(supportable_widening_operation): Likewise.
* tree.def (WIDEN_SUM_EXPR): Update example to use new IFNs.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vect-widen-add.c: Test that new
IFN_VEC_WIDEN_PLUS is being used.
* gcc.target/aarch64/vect-widen-sub.c: Test that new
IFN_VEC_WIDEN_MINUS is being used.

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
bfc98a8d943467b33390defab9682f44efab5907..ffbbecb9409e1c2835d658c2a8855cd0e955c0f2
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4626,7 +4626,7 @@
   [(set_attr "type" 

Re: [PATCH 1/3] Refactor to allow internal_fn's

2023-05-12 Thread Andre Vieira (lists) via Gcc-patches

Hi,

I think I tackled all of your comments, let me know if I missed something.


gcc/ChangeLog:

2023-05-12  Andre Vieira  
Joel Hutton  

* tree-vect-patterns.cc (vect_gimple_build): New Function.
(vect_recog_widen_op_pattern): Refactor to use code_helper.
* tree-vect-stmts.cc (vect_gen_widened_results_half): Likewise.
(vect_create_vectorized_demotion_stmts): Likewise.
(vect_create_vectorized_promotion_stmts): Likewise.
(vect_create_half_widening_stmts): Likewise.
(vectorizable_conversion): Likewise.
(vectorizable_call): Likewise.
(supportable_widening_operation): Likewise.
(supportable_narrowing_operation): Likewise.
(simple_integer_narrowing): Likewise.
* tree-vectorizer.h (supportable_widening_operation): Likewise.
(supportable_narrowing_operation): Likewise.
(vect_gimple_build): New function prototype.
* tree.h (code_helper::safe_as_tree_code): New function.
(code_helper::safe_as_fn_code): New function.

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
33a8b2bb60601dc1a67de62a56bbf3c355e12dbd..1778af0242898e3dc73d94d22a5b8505628a53b5
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -25,6 +25,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "rtl.h"
 #include "tree.h"
 #include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimple-fold.h"
 #include "ssa.h"
 #include "expmed.h"
 #include "optabs-tree.h"
@@ -1392,7 +1394,7 @@ vect_recog_sad_pattern (vec_info *vinfo,
 static gimple *
 vect_recog_widen_op_pattern (vec_info *vinfo,
 stmt_vec_info last_stmt_info, tree *type_out,
-tree_code orig_code, tree_code wide_code,
+tree_code orig_code, code_helper wide_code,
 bool shift_p, const char *name)
 {
   gimple *last_stmt = last_stmt_info->stmt;
@@ -1435,7 +1437,7 @@ vect_recog_widen_op_pattern (vec_info *vinfo,
   vecctype = get_vectype_for_scalar_type (vinfo, ctype);
 }
 
-  enum tree_code dummy_code;
+  code_helper dummy_code;
   int dummy_int;
   auto_vec dummy_vec;
   if (!vectype
@@ -1456,8 +1458,7 @@ vect_recog_widen_op_pattern (vec_info *vinfo,
   2, oprnd, half_type, unprom, vectype);
 
   tree var = vect_recog_temp_ssa_var (itype, NULL);
-  gimple *pattern_stmt = gimple_build_assign (var, wide_code,
- oprnd[0], oprnd[1]);
+  gimple *pattern_stmt = vect_gimple_build (var, wide_code, oprnd[0], 
oprnd[1]);
 
   if (vecctype != vecitype)
 pattern_stmt = vect_convert_output (vinfo, last_stmt_info, ctype,
@@ -6808,3 +6809,20 @@ vect_pattern_recog (vec_info *vinfo)
   /* After this no more add_stmt calls are allowed.  */
   vinfo->stmt_vec_info_ro = true;
 }
+
+/* Build a GIMPLE_ASSIGN or GIMPLE_CALL with the tree_code,
+   or internal_fn contained in ch, respectively.  */
+gimple *
+vect_gimple_build (tree lhs, code_helper ch, tree op0, tree op1)
+{
+  gcc_assert (op0 != NULL_TREE);
+  if (ch.is_tree_code ())
+return gimple_build_assign (lhs, (tree_code) ch, op0, op1);
+
+  gcc_assert (ch.is_internal_fn ());
+  gimple* stmt = gimple_build_call_internal (as_internal_fn ((combined_fn) ch),
+op1 == NULL_TREE ? 1 : 2,
+op0, op1);
+  gimple_call_set_lhs (stmt, lhs);
+  return stmt;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
61a2da4ecee9c449c1469cab3c4cfa1a782471d5..d152ae9ab10b361b88c0f839d6951c43b954750a
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3261,13 +3261,13 @@ vectorizable_bswap (vec_info *vinfo,
 
 static bool
 simple_integer_narrowing (tree vectype_out, tree vectype_in,
- tree_code *convert_code)
+ code_helper *convert_code)
 {
   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype_out))
   || !INTEGRAL_TYPE_P (TREE_TYPE (vectype_in)))
 return false;
 
-  tree_code code;
+  code_helper code;
   int multi_step_cvt = 0;
   auto_vec  interm_types;
   if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in,
@@ -3481,7 +3481,7 @@ vectorizable_call (vec_info *vinfo,
   tree callee = gimple_call_fndecl (stmt);
 
   /* First try using an internal function.  */
-  tree_code convert_code = ERROR_MARK;
+  code_helper convert_code = MAX_TREE_CODES;
   if (cfn != CFN_LAST
   && (modifier == NONE
  || (modifier == NARROW
@@ -3667,8 +3667,8 @@ vectorizable_call (vec_info *vinfo,
  continue;
}
  new_temp = make_ssa_name (vec_dest);
- new_stmt = gimple_build_assign (new_temp, convert_code,
- prev_res, half_res);
+ new_stmt = vect_gimple_build 

Re: [PATCH 3/3] Remove widen_plus/minus_expr tree codes

2023-05-10 Thread Andre Vieira (lists) via Gcc-patches




On 03/05/2023 13:29, Richard Biener wrote:

On Fri, 28 Apr 2023, Andre Vieira (lists) wrote:


This is a rebase of Joel's previous patch.

This patch removes the old widen plus/minus tree codes which have been
replaced by internal functions.


I guess that's obvious then.  I wonder what we do to internal
fns in debug stmts?  Looks like we throw those away and do not
generate debug stmts from calls.

See the comment above the removed lines in expand_debug_expr:
 /* Vector stuff.  For most of the codes we don't have rtl codes.  */

And it then just returns NULL for those expr's. So the behaviour there 
remains unchanged, not saying we couldn't do anything but I don





Given you remove handling of the scalar WIDEN_PLUS/MINUS_EXPR
codes everywhere, do we want to add checking code that the scalar
IFNs do not appear in the IL?  For at least some cases there
are corresponding functions handling internal functions that
you could have amended otherwise.


I am making some changes to PATCH 2 of this series; in the new version I 
am adding some extra code to the gimple checks, one of which is to error 
if it comes across an IFN that decomposes to HILO, as that should only 
occur as an intermediate representation within the vect pass.


Richard.


gcc/ChangeLog:

2023-04-28  Andre Vieira  
 Joel Hutton  

* doc/generic.texi: Remove old tree codes.
* expr.cc (expand_expr_real_2): Remove old tree code cases.
* gimple-pretty-print.cc (dump_binary_rhs): Likewise.
* optabs-tree.cc (optab_for_tree_code): Likewise.
(supportable_half_widening_operation): Likewise.
* tree-cfg.cc (verify_gimple_assign_binary): Likewise.
* tree-inline.cc (estimate_operator_cost): Likewise.
(op_symbol_code): Likewise.
* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Likewise.
(vect_analyze_data_ref_accesses): Likewise.
* tree-vect-generic.cc (expand_vector_operations_1): Likewise.
* cfgexpand.cc (expand_debug_expr): Likewise.
* tree-vect-stmts.cc (vectorizable_conversion): Likewise.
(supportable_widening_operation): Likewise.
* gimple-range-op.cc (gimple_range_op_handler::maybe_non_standard):
Likewise.
* tree-vect-patterns.cc (vect_widened_op_tree): Refactor to replace
usage in vect_recog_sad_pattern.
(vect_recog_sad_pattern): Replace tree code widening pattern with
internal function.
(vect_recog_average_pattern): Likewise.
* tree-pretty-print.cc (dump_generic_node): Remove tree code
definition.
* tree.def (WIDEN_PLUS_EXPR, WIDEN_MINUS_EXPR, VEC_WIDEN_PLUS_HI_EXPR,
VEC_WIDEN_PLUS_LO_EXPR, VEC_WIDEN_MINUS_HI_EXPR,
VEC_WIDEN_MINUS_LO_EXPR): Likewise





Re: [PATCH 1/3] Refactor to allow internal_fn's

2023-05-04 Thread Andre Vieira (lists) via Gcc-patches




On 03/05/2023 12:55, Richard Biener wrote:

On Fri, 28 Apr 2023, Andre Vieira (lists) wrote:


Hi,

I'm posting the patches separately now with ChangeLogs.

I made the suggested changes and tried to simplify the code a bit further.
Where internal to tree-vect-stmts I changed most functions to use code_helper
to avoid having to check at places we didn't need to. I was trying to simplify
things further by also modifying supportable_half_widening_operation and
supportable_convert_operation but the result of that was that I ended up
moving the code to cast to tree code inside them rather than at the call site
and it didn't look simpler, so I left those. Though if we did make those
changes we'd no longer need to keep around the tc1 variable in
vectorizable_conversion... Let me know what you think.


I see that

-  else if (CONVERT_EXPR_CODE_P (code)
+  else if (CONVERT_EXPR_CODE_P (code.safe_as_tree_code ())

is convenient (as much as I dislike safe_as_tree_code).  Isn't
the following

-  if (!CONVERT_EXPR_CODE_P (code))
+  if (!CONVERT_EXPR_CODE_P ((tree_code) code))
  return false;
For some reason I thought the code could only reach here if code was a 
tree code, but I guess if we have an ifn and the modes aren't the same 
as the wide_vectype it would fall through to this, which would fail for 
an ifn. I am wondering whether it needs to, though; the multi-step 
widening should also work for ifns, no? We'd need to adapt it to not use 
c1, c2 but hi, lo in the ifn case, I guess, and then use a different 
optab lookup too?


Though I'm thinking maybe this should be a follow-up and we just don't 
have that 'feature' for now, the feature being support for multi-step 
conversion of the new widening IFNs.


Re: [ping][vect-patterns] Refactor widen_plus/widen_minus as internal_fns

2023-04-28 Thread Andre Vieira (lists) via Gcc-patches




On 25/04/2023 13:30, Richard Biener wrote:

On Mon, 24 Apr 2023, Richard Sandiford wrote:


Richard Biener  writes:

On Thu, Apr 20, 2023 at 3:24 PM Andre Vieira (lists) via Gcc-patches
 wrote:


Rebased all three patches and made some small changes to the second one:
- removed sub and abd optabs from commutative_optab_p, I suspect this
was a copy-paste mistake,
- removed what I believe to be a superfluous switch case in
vectorizable_conversion, the one that was here:
+  if (code.is_fn_code ())
+ {
+  internal_fn ifn = as_internal_fn (code.as_fn_code ());
+  int ecf_flags = internal_fn_flags (ifn);
+  gcc_assert (ecf_flags & ECF_MULTI);
+
+  switch (code.as_fn_code ())
+   {
+   case CFN_VEC_WIDEN_PLUS:
+ break;
+   case CFN_VEC_WIDEN_MINUS:
+ break;
+   case CFN_LAST:
+   default:
+ return false;
+   }
+
+  internal_fn lo, hi;
+  lookup_multi_internal_fn (ifn, , );
+  *code1 = as_combined_fn (lo);
+  *code2 = as_combined_fn (hi);
+  optab1 = lookup_multi_ifn_optab (lo, !TYPE_UNSIGNED (vectype));
+  optab2 = lookup_multi_ifn_optab (hi, !TYPE_UNSIGNED (vectype));
   }

I don't think we need to check that they are a specific fn code, as we
look up optabs, and if they succeed then surely we can vectorize?

OK for trunk?


In the first patch I see some uses of safe_as_tree_code like

+  if (ch.is_tree_code ())
+return op1 == NULL_TREE ? gimple_build_assign (lhs,
ch.safe_as_tree_code (),
+  op0) :
+ gimple_build_assign (lhs, ch.safe_as_tree_code (),
+  op0, op1);
+  else
+  {
+internal_fn fn = as_internal_fn (ch.safe_as_fn_code ());
+gimple* stmt;

where the context actually requires a valid tree code.  Please change those
to force to tree code / ifn code.  Just use explicit casts here and the other
places that are similar.  Before the as_internal_fn just put a
gcc_assert (ch.is_internal_fn ()).


Also, doesn't the above ?: simplify to the "else" arm?  Null trailing
arguments would be ignored for unary operators.

I wasn't sure what to make of the op0 handling:


+/* Build a GIMPLE_ASSIGN or GIMPLE_CALL with the tree_code,
+   or internal_fn contained in ch, respectively.  */
+gimple *
+vect_gimple_build (tree lhs, code_helper ch, tree op0, tree op1)
+{
+  if (op0 == NULL_TREE)
+return NULL;


Can that happen, and if so, does returning null make sense?
Maybe an assert would be safer.


Yeah, I was hoping to have a look whether the new gimple_build
overloads could be used to make this all better (but hoped we can
finally get this series in in some way).

Richard.


Yeah, in the newest version of the first patch of the series I found 
that most of the time I can get away with only needing to distinguish 
between tree_code and internal_fn when building gimple, for which it 
currently uses vect_gimple_build. It does feel like that could easily be 
a gimple function, though.


Having said that, as I partially mention in the patch, I didn't rewrite 
the optabs-tree supportable_half_widening and supportable_conversion (or 
whatever they are called) because those also at some point need to 
access the stmt, and there is a massive difference in how we handle 
gassigns and gcalls from that perspective. But maybe we can generalize 
that too somehow...


Anyway, have a look at the new versions (posted just some minutes after 
the email I'm replying to, haha! timing :P)


[PATCH 2/3] Refactor widen_plus as internal_fn

2023-04-28 Thread Andre Vieira (lists) via Gcc-patches

This patch replaces the existing tree_code widen_plus and widen_minus
patterns with internal_fn versions.

DEF_INTERNAL_OPTAB_HILO_FN is like DEF_INTERNAL_OPTAB_FN except it 
provides convenience wrappers for defining conversions that require a 
hi/lo split, like widening and narrowing operations.  Each definition 
for  will require an optab named  and two other optabs that 
you specify for signed and unsigned. The hi/lo pair is necessary because 
the widening operations take n narrow elements as inputs and return n/2 
wide elements as outputs. The 'lo' operation operates on the first n/2 
elements of input. The 'hi' operation operates on the second n/2 
elements of input. Defining an internal_fn along with hi/lo variations 
allows a single internal function to be returned from a vect_recog 
function that will later be expanded to hi/lo.


DEF_INTERNAL_OPTAB_HILO_FN is used in internal-fn.def to register a 
widening internal_fn. It is defined differently in different places and 
internal-fn.def is sourced from those places so the parameters given can 
be reused.
  internal-fn.c: defined to expand to hi/lo signed/unsigned optabs, 
later defined to generate the  'expand_' functions for the hi/lo 
versions of the fn.
  internal-fn.def: defined to invoke DEF_INTERNAL_OPTAB_FN for the 
original and hi/lo variants of the internal_fn


 For example:
 IFN_VEC_WIDEN_PLUS -> IFN_VEC_WIDEN_PLUS_HI, IFN_VEC_WIDEN_PLUS_LO
for aarch64: IFN_VEC_WIDEN_PLUS_HI   -> vec_widen_addl_hi_ -> 
(u/s)addl2
   IFN_VEC_WIDEN_PLUS_LO  -> 
vec_widen_addl_lo_ -> (u/s)addl


This gives the same functionality as the previous WIDEN_PLUS/WIDEN_MINUS 
tree codes which are expanded into VEC_WIDEN_PLUS_LO, VEC_WIDEN_PLUS_HI.


gcc/ChangeLog:

2023-04-28  Andre Vieira  
Joel Hutton  
Tamar Christina  

* internal-fn.cc (INCLUDE_MAP): Include maps for use in optab
lookup.
(DEF_INTERNAL_OPTAB_HILO_FN): Macro to define an internal_fn that
expands into multiple internal_fns (for widening).
(ifn_cmp): Function to compare ifn's for sorting/searching.
(lookup_hilo_ifn_optab): Add lookup function.
(lookup_hilo_internal_fn): Add lookup function.
(commutative_binary_fn_p): Add widen_plus fn's.
(widening_fn_p): New function.
(decomposes_to_hilo_fn_p): New function.
* internal-fn.def (DEF_INTERNAL_OPTAB_HILO_FN): Define widening
plus,minus functions.
(VEC_WIDEN_PLUS): Replacement for VEC_WIDEN_PLUS tree code.
(VEC_WIDEN_MINUS): Replacement for VEC_WIDEN_MINUS tree code.
* internal-fn.h (GCC_INTERNAL_FN_H): Add headers.
(lookup_hilo_ifn_optab): Add prototype.
(lookup_hilo_internal_fn): Likewise.
(widening_fn_p): Likewise.
(decomposes_to_hilo_fn_p): Likewise.
* optabs.cc (commutative_optab_p): Add widening plus, minus optabs.
* optabs.def (OPTAB_CD): widen add, sub optabs
* tree-vect-patterns.cc (vect_recog_widen_op_pattern): Support
patterns with a hi/lo split.
(vect_recog_widen_plus_pattern): Refactor to return
IFN_VECT_WIDEN_PLUS.
(vect_recog_widen_minus_pattern): Refactor to return new
IFN_VEC_WIDEN_MINUS.
* tree-vect-stmts.cc (vectorizable_conversion): Add widen plus/minus
ifn
support.
(supportable_widening_operation): Add widen plus/minus ifn support.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vect-widen-add.c: Test that new
IFN_VEC_WIDEN_PLUS is being used.
* gcc.target/aarch64/vect-widen-sub.c: Test that new
IFN_VEC_WIDEN_MINUS is being used.

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 
6e81dc05e0e0714256759b0594816df451415a2d..e4d815cd577d266d2bccf6fb68d62aac91a8b4cf
 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -17,6 +17,7 @@ You should have received a copy of the GNU General Public 
License
 along with GCC; see the file COPYING3.  If not see
 .  */
 
+#define INCLUDE_MAP
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
@@ -70,6 +71,26 @@ const int internal_fn_flags_array[] = {
   0
 };
 
+const enum internal_fn internal_fn_hilo_keys_array[] = {
+#undef DEF_INTERNAL_OPTAB_HILO_FN
+#define DEF_INTERNAL_OPTAB_HILO_FN(NAME, FLAGS, OPTAB, SOPTAB, UOPTAB, TYPE) \
+  IFN_##NAME##_LO, \
+  IFN_##NAME##_HI,
+#include "internal-fn.def"
+  IFN_LAST
+#undef DEF_INTERNAL_OPTAB_HILO_FN
+};
+
+const optab internal_fn_hilo_values_array[] = {
+#undef DEF_INTERNAL_OPTAB_HILO_FN
+#define DEF_INTERNAL_OPTAB_HILO_FN(NAME, FLAGS, OPTAB, SOPTAB, UOPTAB, TYPE) \
+  SOPTAB##_lo_optab, UOPTAB##_lo_optab, \
+  SOPTAB##_hi_optab, UOPTAB##_hi_optab,
+#include "internal-fn.def"
+  unknown_optab, unknown_optab
+#undef DEF_INTERNAL_OPTAB_HILO_FN
+};
+
 /* Return the internal function called NAME, or IFN_LAST if there's
no such function.  */
 
@@ -90,6 +111,61 @@ lookup_internal_fn (const char *name)
   return 

[PATCH 1/3] Refactor to allow internal_fn's

2023-04-28 Thread Andre Vieira (lists) via Gcc-patches

Hi,

I'm posting the patches separately now with ChangeLogs.

I made the suggested changes and tried to simplify the code a bit 
further. Where internal to tree-vect-stmts I changed most functions to 
use code_helper to avoid having to check at places we didn't need to. I 
was trying to simplify things further by also modifying 
supportable_half_widening_operation and supportable_convert_operation 
but the result of that was that I ended up moving the code to cast to 
tree code inside them rather than at the call site and it didn't look 
simpler, so I left those. Though if we did make those changes we'd no 
longer need to keep around the tc1 variable in 
vectorizable_conversion... Let me know what you think.


gcc/ChangeLog:

2023-04-28  Andre Vieira  
Joel Hutton  

* tree-vect-patterns.cc (vect_gimple_build): New Function.
(vect_recog_widen_op_pattern): Refactor to use code_helper.
* tree-vect-stmts.cc (vect_gen_widened_results_half): Likewise.
(vect_create_vectorized_demotion_stmts): Likewise.
(vect_create_vectorized_promotion_stmts): Likewise.
(vect_create_half_widening_stmts): Likewise.
(vectorizable_conversion): Likewise.
(vectorizable_call): Likewise.
(supportable_widening_operation): Likewise.
(supportable_narrowing_operation): Likewise.
(simple_integer_narrowing): Likewise.
* tree-vectorizer.h (supportable_widening_operation): Likewise.
(supportable_narrowing_operation): Likewise.
(vect_gimple_build): New function prototype.
* tree.h (code_helper::safe_as_tree_code): New function.
(code_helper::safe_as_fn_code): New function.

diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
8802141cd6edb298866025b8a55843eae1f0eb17..b35023adade94c1996cd076c4b7419560e819c6b
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -25,6 +25,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "rtl.h"
 #include "tree.h"
 #include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimple-fold.h"
 #include "ssa.h"
 #include "expmed.h"
 #include "optabs-tree.h"
@@ -1391,7 +1393,7 @@ vect_recog_sad_pattern (vec_info *vinfo,
 static gimple *
 vect_recog_widen_op_pattern (vec_info *vinfo,
 stmt_vec_info last_stmt_info, tree *type_out,
-tree_code orig_code, tree_code wide_code,
+tree_code orig_code, code_helper wide_code,
 bool shift_p, const char *name)
 {
   gimple *last_stmt = last_stmt_info->stmt;
@@ -1434,7 +1436,7 @@ vect_recog_widen_op_pattern (vec_info *vinfo,
   vecctype = get_vectype_for_scalar_type (vinfo, ctype);
 }
 
-  enum tree_code dummy_code;
+  code_helper dummy_code;
   int dummy_int;
   auto_vec dummy_vec;
   if (!vectype
@@ -1455,8 +1457,7 @@ vect_recog_widen_op_pattern (vec_info *vinfo,
   2, oprnd, half_type, unprom, vectype);
 
   tree var = vect_recog_temp_ssa_var (itype, NULL);
-  gimple *pattern_stmt = gimple_build_assign (var, wide_code,
- oprnd[0], oprnd[1]);
+  gimple *pattern_stmt = vect_gimple_build (var, wide_code, oprnd[0], 
oprnd[1]);
 
   if (vecctype != vecitype)
 pattern_stmt = vect_convert_output (vinfo, last_stmt_info, ctype,
@@ -6406,3 +6407,20 @@ vect_pattern_recog (vec_info *vinfo)
   /* After this no more add_stmt calls are allowed.  */
   vinfo->stmt_vec_info_ro = true;
 }
+
+/* Build a GIMPLE_ASSIGN or GIMPLE_CALL with the tree_code,
+   or internal_fn contained in ch, respectively.  */
+gimple *
+vect_gimple_build (tree lhs, code_helper ch, tree op0, tree op1)
+{
+  gcc_assert (op0 != NULL_TREE);
+  if (ch.is_tree_code ())
+return gimple_build_assign (lhs, (tree_code) ch, op0, op1);
+
+  gcc_assert (ch.is_internal_fn ());
+  gimple* stmt = gimple_build_call_internal (as_internal_fn ((combined_fn) ch),
+op1 == NULL_TREE ? 1 : 2,
+op0, op1);
+  gimple_call_set_lhs (stmt, lhs);
+  return stmt;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
6b7dbfd4a231baec24e740ffe0ce0b0bf7a1de6b..ce47f4940fa9a1baca4ba1162065cfc3b4072eba
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -3258,13 +3258,13 @@ vectorizable_bswap (vec_info *vinfo,
 
 static bool
 simple_integer_narrowing (tree vectype_out, tree vectype_in,
- tree_code *convert_code)
+ code_helper *convert_code)
 {
   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype_out))
   || !INTEGRAL_TYPE_P (TREE_TYPE (vectype_in)))
 return false;
 
-  tree_code code;
+  code_helper code;
   int multi_step_cvt = 0;
   auto_vec  interm_types;
   if (!supportable_narrowing_operation (NOP_EXPR, vectype_out, vectype_in,
@@ -3478,7 +3478,7 @@ vectorizable_call (vec_info *vinfo,
   tree callee = 
