[PATCH 18/19][GCC-8] aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]

2020-04-16 Thread Andre Vieira (lists)

The following testcase ICEs, because aarch64_gen_compare_reg_maybe_ze emits
invalid RTL.
For y_mode [QH]Imode it expects y to be of that mode (or CONST_INT that fits
into that mode) and x being SImode; for non-CONST_INT y it zero extends y
into SImode and compares that against x, for CONST_INT y it zero extends y
into SImode.  The problem is that when the zero extended constant isn't
usable directly, it forces it into a REG, but with y_mode mode, and then
compares against y.  That is wrong, because it should force it into a SImode
REG and compare that way.
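
Schematically, the fix is (a minimal sketch of the relevant GCC internals, not
the exact sources; it assumes the function later forces operands that are not
valid plus-operands into a register, as on mainline):

  if (CONST_INT_P (y))
    {
      /* Mask the constant down to y_mode...  */
      y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
      /* ...but from here on the comparison is a full SImode one, so a
         later force_reg (y_mode, y) must produce a SImode REG rather
         than a [QH]Imode one.  */
      y_mode = SImode;
    }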

2020-04-16  Andre Vieira 

    Backport from mainline
    2020-04-02  Jakub Jelinek 

    PR target/94435
    * config/aarch64/aarch64.c (aarch64_gen_compare_reg_maybe_ze): For
    y_mode E_[QH]Imode and y being a CONST_INT, change y_mode to SImode.

    * gcc.target/aarch64/pr94435.c: New test.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 21124b5a3479dd388eb767402e080e2181153467..6bac63402e508027e77a9f4557cb10c578ea7c2c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1556,7 +1556,10 @@ aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
   if (y_mode == E_QImode || y_mode == E_HImode)
 {
   if (CONST_INT_P (y))
-   y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+   {
+ y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
+ y_mode = SImode;
+   }
   else
{
  rtx t, cc_reg;
diff --git a/gcc/testsuite/gcc.target/aarch64/pr94435.c b/gcc/testsuite/gcc.target/aarch64/pr94435.c
new file mode 100644
index 0000000000000000000000000000000000000000..5713c14d5f90b1d42f92d040e9030ecc03c97d51
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr94435.c
@@ -0,0 +1,25 @@
+/* PR target/94435 */
+/* { dg-do compile } */
+/* { dg-options "-march=armv8-a+nolse -moutline-atomics" } */
+
+int b, c, d, e, f, h;
+short g;
+int foo (int) __attribute__ ((__const__));
+
+void
+bar (void)
+{
+  while (1)
+{
+  while (1)
+   {
+ __atomic_load_n (&g, 0);
+ if (foo (2))
+   __sync_val_compare_and_swap (&c, 0, f);
+ b = 1;
+ if (h == e)
+   break;
+   }
+  __sync_val_compare_and_swap (&g, -1, f);
+}
+}


[PATCH 17/19][GCC-8] aarch64: Fix bootstrap with old binutils [PR93053]

2020-04-16 Thread Andre Vieira (lists)

As reported in the PR, GCC 10 (and also 9.3.1, but not 9.3.0) fails to build
when using older binutils which lack LSE support, because those instructions
are used in libgcc.
Thanks to Kyrylo's hint, the following patches (hopefully) allow it to build
even with older binutils, by using the .inst directive when LSE support isn't
available in the assembler.
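
As an illustration of how the fallback composes an instruction word (a sketch;
the B and M values are the ones defined in the diff below, and the arithmetic
can be checked with a plain C program):

#include <stdio.h>

int
main (void)
{
  unsigned int base = 0x08a07c41;  /* CAS base encoding, size bits clear.  */
  unsigned int B = 0x80000000;     /* SIZE == 4: 32-bit operands.  */
  unsigned int M = 0x408000;       /* MODEL == 4: acquire + release.  */

  /* Prints ".inst 0x88e0fc41", which is the encoding of
     "casal w0, w1, [x2]".  */
  printf (".inst 0x%08x\n", base + B + M);
  return 0;
}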

2020-04-16 Andre Vieira 

    Backport from mainline
    2020-04-15  Jakub Jelinek 

    PR target/93053
    * configure.ac (LIBGCC_CHECK_AS_LSE): Add HAVE_AS_LSE checking.
    * config/aarch64/lse.S: Include auto-target.h, if HAVE_AS_LSE
    is not defined, use just .arch armv8-a.
    (B, M, N, OPN): Define.
    (COMMENT): New .macro.
    (CAS, CASP, SWP, LDOP): Use .inst directive if HAVE_AS_LSE is not
    defined.  Otherwise, move the operands right after the glue? and
    comment out operands where the macros are used.
    * configure: Regenerated.
    * config.in: Regenerated.

diff --git a/libgcc/config.in b/libgcc/config.in
index 59a3d8daf52e72e548d3d9425d6043d5e0c663ad..5be5321d2584392bac1ec3af779cd96823212902 100644
--- a/libgcc/config.in
+++ b/libgcc/config.in
@@ -10,6 +10,9 @@
*/
 #undef HAVE_AS_CFI_SECTIONS
 
+/* Define to 1 if the assembler supports LSE. */
+#undef HAVE_AS_LSE
+
 /* Define to 1 if the target assembler supports thread-local storage. */
 #undef HAVE_CC_TLS
 
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index c7979382ad7770b61bb1c64d32ba2395963a9d7a..f7f1c19587beaec2ccb6371378d54d50139ba1c9 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -48,8 +48,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  * separately to minimize code size.
  */
 
+#include "auto-target.h"
+
 /* Tell the assembler to accept LSE instructions.  */
+#ifdef HAVE_AS_LSE
.arch armv8-a+lse
+#else
+   .arch armv8-a
+#endif
 
 /* Declare the symbol gating the LSE implementations.  */
.hidden __aarch64_have_lse_atomics
@@ -58,12 +64,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #if SIZE == 1
 # define S b
 # define UXT   uxtb
+# define B 0x00000000
 #elif SIZE == 2
 # define S h
 # define UXT   uxth
+# define B 0x40000000
 #elif SIZE == 4 || SIZE == 8 || SIZE == 16
 # define S
 # define UXT   mov
+# if SIZE == 4
+#  define B 0x80000000
+# elif SIZE == 8
+#  define B 0xc0000000
+# endif
 #else
 # error
 #endif
@@ -72,18 +85,26 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 # define SUFF  _relax
 # define A
 # define L
+# define M 0x000000
+# define N 0x000000
 #elif MODEL == 2
 # define SUFF  _acq
 # define A a
 # define L
+# define M 0x400000
+# define N 0x800000
 #elif MODEL == 3
 # define SUFF  _rel
 # define A
 # define L l
+# define M 0x008000
+# define N 0x400000
 #elif MODEL == 4
 # define SUFF  _acq_rel
 # define A a
 # define L l
+# define M 0x408000
+# define N 0xc00000
 #else
 # error
 #endif
@@ -144,9 +165,13 @@ STARTFN NAME(cas)
JUMP_IF_NOT_LSE 8f
 
 #if SIZE < 16
-#define CAS    glue4(cas, A, L, S)
+#ifdef HAVE_AS_LSE
+# define CAS   glue4(cas, A, L, S) s(0), s(1), [x2]
+#else
+# define CAS   .inst 0x08a07c41 + B + M
+#endif
 
-   CAS s(0), s(1), [x2]
+   CAS /* s(0), s(1), [x2] */
ret
 
 8: UXT s(tmp0), s(0)
@@ -160,9 +185,13 @@ STARTFN NAME(cas)
 #else
 #define LDXP   glue3(ld, A, xp)
 #define STXP   glue3(st, L, xp)
-#define CASP   glue3(casp, A, L)
+#ifdef HAVE_AS_LSE
+# define CASP  glue3(casp, A, L)   x0, x1, x2, x3, [x4]
+#else
+# define CASP  .inst 0x48207c82 + M
+#endif
 
-   CASP    x0, x1, x2, x3, [x4]
+   CASP    /* x0, x1, x2, x3, [x4] */
ret
 
 8: mov x(tmp0), x0
@@ -181,12 +210,16 @@ ENDFN NAME(cas)
 #endif
 
 #ifdef L_swp
-#define SWP    glue4(swp, A, L, S)
+#ifdef HAVE_AS_LSE
+# define SWP   glue4(swp, A, L, S) s(0), s(0), [x1]
+#else
+# define SWP   .inst 0x38208020 + B + N
+#endif
 
STARTFN NAME(swp)
JUMP_IF_NOT_LSE 8f
 
-   SWP s(0), s(0), [x1]
+   SWP /* s(0), s(0), [x1] */
ret
 
 8: mov s(tmp0), s(0)
@@ -204,24 +237,32 @@ ENDFN NAME(swp)
 #ifdef L_ldadd
 #define LDNM   ldadd
 #define OP add
+#define OPN    0x0000
 #elif defined(L_ldclr)
 #define LDNM   ldclr
 #define OP bic
+#define OPN    0x1000
 #elif defined(L_ldeor)
 #define LDNM   ldeor
 #define OP eor
+#define OPN    0x2000
 #elif defined(L_ldset)
 #define LDNM   ldset
 #define OP orr
+#define OPN    0x3000
 #else
 #error
 #endif
-#define LDOP   glue4(LDNM, A, L, S)
+#ifdef HAVE_AS_LSE
+# define LDOP  glue4(LDNM, A, L, S) s(0), s(0), [x1]
+#else
+# define LDOP  .inst 0x38200020 + OPN + B + N
+#endif
 
STARTFN NAME(LDNM)
JUMP_IF_NOT_LSE 8f
 
-   LDOP    s(0), s(0), [x1]
+   LDOP    /* s(0), s(0), [x1] */

[PATCH 14/19][GCC-8] aarch64: Fix store-exclusive in load-operate LSE helpers

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2019-09-25  Richard Henderson 

    PR target/91834
    * config/aarch64/lse.S (LDNM): Ensure STXR output does not
    overlap the inputs.

diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index a5f6673596c73c497156a6f128799cc43b400504..c7979382ad7770b61bb1c64d32ba2395963a9d7a 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -227,8 +227,8 @@ STARTFN NAME(LDNM)
 8: mov s(tmp0), s(0)
0: LDXR    s(0), [x1]
OP  s(tmp1), s(0), s(tmp0)
-   STXR    w(tmp1), s(tmp1), [x1]
-   cbnz    w(tmp1), 0b
+   STXR    w(tmp2), s(tmp1), [x1]
+   cbnz    w(tmp2), 0b
ret
 
 ENDFN  NAME(LDNM)


[PATCH 10/19][GCC-8] aarch64: Add out-of-line functions for LSE atomics

2020-04-16 Thread Andre Vieira (lists)

This is the libgcc part of the interface -- providing the functions.
Rationale is provided at the top of libgcc/config/aarch64/lse.S.
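
For reference, the user-visible contract these helpers back is ordinary atomic
code (a sketch; the exact helper symbol, e.g. __aarch64_ldadd4_acq_rel, is an
implementation detail chosen by the compiler from the operation, size and
memory model):

#include <stdatomic.h>

int
fetch_add (atomic_int *p, int v)
{
  /* With -moutline-atomics this compiles to a call into the libgcc
     helpers below, which test __aarch64_have_lse_atomics at run time
     and execute either an LSE instruction or an LL/SC loop.  */
  return atomic_fetch_add (p, v);
}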

2020-04-16  Andre Vieira 

    Backport from mainline
    2019-09-19  Richard Henderson 

    * config/aarch64/lse-init.c: New file.
    * config/aarch64/lse.S: New file.
    * config/aarch64/t-lse: New file.
    * config.host: Add t-lse to all aarch64 tuples.

diff --git a/libgcc/config.host b/libgcc/config.host
index b12c86267dac9da8da9e1ab4123d5171c3e07f40..e436ade1a68c6cd918d2f370b14d61682cb9fd59 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -337,23 +337,27 @@ aarch64*-*-elf | aarch64*-*-rtems*)
extra_parts="$extra_parts crtbegin.o crtend.o crti.o crtn.o"
extra_parts="$extra_parts crtfastmath.o"
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
md_unwind_header=aarch64/aarch64-unwind.h
;;
 aarch64*-*-freebsd*)
extra_parts="$extra_parts crtfastmath.o"
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
md_unwind_header=aarch64/freebsd-unwind.h
;;
 aarch64*-*-fuchsia*)
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp"
;;
 aarch64*-*-linux*)
extra_parts="$extra_parts crtfastmath.o"
md_unwind_header=aarch64/linux-unwind.h
tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
+   tmake_file="${tmake_file} ${cpu_type}/t-lse t-slibgcc-libgcc"
tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
;;
 alpha*-*-linux*)
diff --git a/libgcc/config/aarch64/lse-init.c b/libgcc/config/aarch64/lse-init.c
new file mode 100644
index 0000000000000000000000000000000000000000..33d2914747994a1e07dcae906f0352e64045ab02
--- /dev/null
+++ b/libgcc/config/aarch64/lse-init.c
@@ -0,0 +1,45 @@
+/* Out-of-line LSE atomics for AArch64 architecture, Init.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Linaro Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Define the symbol gating the LSE implementations.  */
+_Bool __aarch64_have_lse_atomics
+  __attribute__((visibility("hidden"), nocommon));
+
+/* Disable initialization of __aarch64_have_lse_atomics during bootstrap.  */
+#ifndef inhibit_libc
+# include <sys/auxv.h>
+
+/* Disable initialization if the system headers are too old.  */
+# if defined(AT_HWCAP) && defined(HWCAP_ATOMICS)
+
+static void __attribute__((constructor))
+init_have_lse_atomics (void)
+{
+  unsigned long hwcap = getauxval (AT_HWCAP);
+  __aarch64_have_lse_atomics = (hwcap & HWCAP_ATOMICS) != 0;
+}
+
+# endif /* HWCAP */
+#endif /* inhibit_libc */
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
new file mode 100644
index 0000000000000000000000000000000000000000..a5f6673596c73c497156a6f128799cc43b400504
--- /dev/null
+++ b/libgcc/config/aarch64/lse.S
@@ -0,0 +1,235 @@
+/* Out-of-line LSE atomics for AArch64 architecture.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   Contributed by Linaro Ltd.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */

[PATCH 13/19][GCC-8] Aarch64: Fix shrinkwrapping interactions with atomics

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline
    2020-01-17  Wilco Dijkstra 

    PR target/92692
    * config/aarch64/atomics.md (aarch64_compare_and_swap<mode>):
    Use epilogue_completed rather than reload_completed.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 28a1dbc4231009333c2e766d9d3aead54a491631..0ee8d2efac05877d610981b719bd02afdf93a832 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -104,7 +104,7 @@
(clobber (match_scratch:SI 7 "=&r"))]
   ""
   "#"
-  "&& reload_completed"
+  "&& epilogue_completed"
   [(const_int 0)]
   {
 aarch64_split_compare_and_swap (operands);


[PATCH 8/19][GCC-8] aarch64: Implement TImode compare-and-swap

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline.
    2019-09-19  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_gen_compare_reg): Add support
    for NE comparison of TImode values.
    (aarch64_emit_load_exclusive): Add support for TImode.
    (aarch64_emit_store_exclusive): Likewise.
    (aarch64_split_compare_and_swap): Disable strong_zero_p for TImode.
    * config/aarch64/atomics.md (atomic_compare_and_swapti):
    Change iterator from ALLI to ALLI_TI.
    (atomic_compare_and_swapti): New.
    (atomic_compare_and_swapti_lse): New.
    (aarch64_load_exclusive_pair): New.
    (aarch64_store_exclusive_pair): New.
    * config/aarch64/iterators.md (ALLI_TI): New iterator.
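
A user-level example of what this enables (a sketch; a 16-byte
compare-and-swap, which with this patch expands through the new TImode
patterns, using CASP or an LDXP/STXP loop):

#include <stdbool.h>

bool
cas_128 (__int128 *ptr, __int128 *expected, __int128 desired)
{
  return __atomic_compare_exchange_n (ptr, expected, desired, false,
                                      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}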

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 317571e018c4f96046799675e042cdfaabb5b94a..09e78313489d266daaca9eba3647f150534893f6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1517,10 +1517,33 @@ emit_set_insn (rtx x, rtx y)
 rtx
 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 {
-  machine_mode mode = SELECT_CC_MODE (code, x, y);
-  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
+  machine_mode cmp_mode = GET_MODE (x);
+  machine_mode cc_mode;
+  rtx cc_reg;
 
-  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
+  if (cmp_mode == TImode)
+{
+  gcc_assert (code == NE);
+
+  cc_mode = CCmode;
+  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+
+  rtx x_lo = operand_subword (x, 0, 0, TImode);
+  rtx y_lo = operand_subword (y, 0, 0, TImode);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
+
+  rtx x_hi = operand_subword (x, 1, 0, TImode);
+  rtx y_hi = operand_subword (y, 1, 0, TImode);
+  emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
+gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
+GEN_INT (AARCH64_EQ)));
+}
+  else
+{
+  cc_mode = SELECT_CC_MODE (code, x, y);
+  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
+  emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
+}
   return cc_reg;
 }
 
@@ -14145,40 +14168,54 @@ static void
 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
 rtx mem, rtx model_rtx)
 {
-  rtx (*gen) (rtx, rtx, rtx);
-
-  switch (mode)
+  if (mode == TImode)
+emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
+   gen_highpart (DImode, rval),
+   mem, model_rtx));
+  else
 {
-case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
-case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
-case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
-case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
-default:
-  gcc_unreachable ();
-}
+  rtx (*gen) (rtx, rtx, rtx);
+
+  switch (mode)
+   {
+   case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
+   case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
+   case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
+   case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
+   default:
+ gcc_unreachable ();
+   }
 
-  emit_insn (gen (rval, mem, model_rtx));
+  emit_insn (gen (rval, mem, model_rtx));
+}
 }
 
 /* Emit store exclusive.  */
 
 static void
 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
- rtx rval, rtx mem, rtx model_rtx)
+ rtx mem, rtx rval, rtx model_rtx)
 {
-  rtx (*gen) (rtx, rtx, rtx, rtx);
-
-  switch (mode)
+  if (mode == TImode)
+emit_insn (gen_aarch64_store_exclusive_pair
+  (bval, mem, operand_subword (rval, 0, 0, TImode),
+   operand_subword (rval, 1, 0, TImode), model_rtx));
+  else
 {
-case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
-case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
-case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
-case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
-default:
-  gcc_unreachable ();
-}
+  rtx (*gen) (rtx, rtx, rtx, rtx);
+
+  switch (mode)
+   {
+   case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
+   case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
+   case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
+   case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
+   default:
+ gcc_unreachable ();
+   }
 
-  emit_insn (gen (bval, rval, mem, model_rtx));
+  emit_insn (gen (bval, mem, rval, model_rtx));
+}
 }
 
 /* Mark the previous jump instruction as unlikely.  */
@@ -14197,16 +14234,6 @@ aarch64_expand_compare_and_swap (rtx operands[])
 {
   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
   machine_mode mode, r_mode;
-  typedef rtx (*gen_atomic_cas_fn) (rtx, rtx, rtx, rtx);
-  

[PATCH 7/19][GCC-8] aarch64: Extend %R for integer registers

2020-04-16 Thread Andre Vieira (lists)

2020-04-16  Andre Vieira 

    Backport from mainline.
    2019-09-19  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_print_operand): Allow integer
    registers with %R.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1068cfd899a759c506e3217e1e2c19cd778b4372..317571e018c4f96046799675e042cdfaabb5b94a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -6627,7 +6627,7 @@ sizetochar (int size)
 'S/T/U/V':Print a FP/SIMD register name for a register list.
The register printed is the FP/SIMD register name
of X + 0/1/2/3 for S/T/U/V.
- 'R':  Print a scalar FP/SIMD register name + 1.
+ 'R':  Print a scalar Integer/FP/SIMD register name + 1.
  'X':  Print bottom 16 bits of integer constant in hex.
  'w/x':Print a general register name or the zero register
(32-bit or 64-bit).
@@ -6813,12 +6813,13 @@ aarch64_print_operand (FILE *f, rtx x, int code)
   break;
 
 case 'R':
-  if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
-   {
- output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
- return;
-   }
-  asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+  if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
+   asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
+  else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
+   asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
+  else
+   output_operand_lossage ("incompatible register operand for '%%%c'",
+   code);
   break;
 
 case 'X':


Re: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

2020-04-16 Thread Andre Vieira (lists)

On 16/04/2020 13:24, Andre Vieira (lists) wrote:

Hi,

This series backports all the patches and fixes regarding outline 
atomics to the gcc-8 branch.


Bootstrapped the series for aarch64-linux-gnu and regression tested.
Is this OK for gcc-8?

Andre Vieira (19):
aarch64: Add early clobber for aarch64_store_exclusive
aarch64: Simplify LSE cas generation
aarch64: Improve cas generation
aarch64: Improve swp generation
aarch64: Improve atomic-op lse generation
aarch64: Remove early clobber from ATOMIC_LDOP scratch
aarch64: Extend %R for integer registers
aarch64: Implement TImode compare-and-swap
aarch64: Tidy aarch64_split_compare_and_swap
aarch64: Add out-of-line functions for LSE atomics
Add visibility to libfunc constructors
aarch64: Implement -moutline-atomics
Aarch64: Fix shrinkwrapping interactions with atomics (PR92692)
aarch64: Fix store-exclusive in load-operate LSE helpers
aarch64: Configure for sys/auxv.h in libgcc for lse-init.c
aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]
aarch64: Fix bootstrap with old binutils [PR93053]
aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]
re PR target/90724 (ICE with __sync_bool_compare_and_swap with -march=armv8.2-a+sve)


Hmm, something went wrong when sending these: I had tried to make the 
N/19 patches reply to this one, but failed. Also, I was pretty sure I 
had CC'ed Kyrill and Richard S.


Adding them now.



[PATCH][GCC][Arm]MVE: Fix -Wall testisms

2020-04-07 Thread Andre Vieira (lists)

Hi,

This patch fixes some testisms I found when testing using -Wall/-Werror.

Is this OK for trunk?

gcc/testsuite/ChangeLog:
2020-04-07  Andre Vieira  

    * gcc.target/arm/mve/intrinsics/vuninitializedq_float.c: Fix testism.
    * gcc.target/arm/mve/intrinsics/vuninitializedq_float1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/vuninitializedq_int.c: Likewise.
    * gcc.target/arm/mve/intrinsics/vuninitializedq_int1.c: Likewise.

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float.c
index 3b9c0a740976854e7189ab23a6a8b2764c9b888a..52bad05b6219621ada414dc74ab2deebdd1c93e3 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float.c
@@ -4,11 +4,12 @@
 
 #include "arm_mve.h"
 
+float16x8_t fa;
+float32x4_t fb;
+
 void
 foo ()
 {
-  float16x8_t fa;
-  float32x4_t fb;
   fa = vuninitializedq_f16 ();
   fb = vuninitializedq_f32 ();
 }
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float1.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float1.c
index 0c94608af41fc30c65b959759704033a76bb879f..c6724a52074c6ce0361fdba66c4add831e8c13db 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_float1.c
@@ -4,13 +4,14 @@
 
 #include "arm_mve.h"
 
+float16x8_t fa, faa;
+float32x4_t fb, fbb;
+
 void
 foo ()
 {
-  float16x8_t fa, faa;
-  float32x4_t fb, fbb;
   fa = vuninitializedq (faa);
   fb = vuninitializedq (fbb);
 }
 
-/* { dg-final { scan-assembler-times "vstrb.8" } */
+/* { dg-final { scan-assembler-times "vstrb.8" 6 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int.c
index 9ae17e240083a66e7c20c16ae06b99463c213bf9..13a0109a9b5380cd83f48154df231081ddb8f08e 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int.c
@@ -3,18 +3,18 @@
 /* { dg-additional-options "-O0" } */
 
 #include "arm_mve.h"
+int8x16_t a;
+int16x8_t b;
+int32x4_t c;
+int64x2_t d;
+uint8x16_t ua;
+uint16x8_t ub;
+uint32x4_t uc;
+uint64x2_t ud;
 
 void
 foo ()
 {
-  int8x16_t a;
-  int16x8_t b;
-  int32x4_t c;
-  int64x2_t d;
-  uint8x16_t ua;
-  uint16x8_t ub;
-  uint32x4_t uc;
-  uint64x2_t ud;
   a = vuninitializedq_s8 ();
   b = vuninitializedq_s16 ();
   c = vuninitializedq_s32 ();
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int1.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int1.c
index e8c1f1019c95af6d871cda9c9142c346ff3b49ae..a321398709e65ee7daadfab9c6089116baccde83 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vuninitializedq_int1.c
@@ -4,17 +4,18 @@
 
 #include "arm_mve.h"
 
+int8x16_t a, aa;
+int16x8_t b, bb;
+int32x4_t c, cc;
+int64x2_t d, dd;
+uint8x16_t ua, uaa;
+uint16x8_t ub, ubb;
+uint32x4_t uc, ucc;
+uint64x2_t ud, udd;
+
 void
 foo ()
 {
-  int8x16_t a, aa;
-  int16x8_t b, bb;
-  int32x4_t c, cc;
-  int64x2_t d, dd;
-  uint8x16_t ua, uaa;
-  uint16x8_t ub, ubb;
-  uint32x4_t uc, ucc;
-  uint64x2_t ud, udd;
   a = vuninitializedq (aa);
   b = vuninitializedq (bb);
   c = vuninitializedq (cc);


Re: [PATCH][GCC][Arm]: MVE: Fix v[id]wdup's

2020-04-07 Thread Andre Vieira (lists)

On 07/04/2020 11:57, Christophe Lyon wrote:

On Tue, 7 Apr 2020 at 12:40, Andre Vieira (lists) wrote:

Hi,

This patch fixes v[id]wdup intrinsics. They had two issues:
1) the predicated versions did not link the incoming inactive vector
parameter to the output
2) The backend didn't enforce the wrap limit operand be in an odd register.

1) was fixed like we did for all other predicated intrinsics
2) requires a temporary hack where we pass the value in the top end of
DImode operand. The proper fix would be to add a register CLASS but this
interacted badly with other existing targets codegen.  We will look to
fix this properly in GCC 11.

Regression tested on arm-none-eabi.


Hi Andre,

How did you find problem 1? I suspect you are using an internal
simulator since qemu does not support MVE yet?
And you probably have runtime tests to exhibit this failure?

Hi Christophe,

I actually found 1) because I was fixing 2). Though yes, I am trying to 
complement testing using an internal simulator and running tests in 
Arm's CMSIS DSP Library (https://github.com/ARM-software/CMSIS_5) that 
use MVE.


Cheers,
Andre

Thanks,

Christophe


Is this OK for trunk?

gcc/ChangeLog:
2020-04-07  Andre Vieira  

  * config/arm/arm_mve.h: Fix v[id]wdup intrinsics.
  * config/arm/mve.md: Fix v[id]wdup patterns.



[PATCH][GCC][Arm]: MVE: Add C++ polymorphism and fix some more issues

2020-04-07 Thread Andre Vieira (lists)

Hi,

This patch adds C++ polymorphism for the MVE intrinsics, by using the 
native C++ polymorphic functions when C++ is used.


It also moves the PRESERVE name macro definitions to the right place so 
that the variants without the '__arm_' prefix are not available if we 
define the PRESERVE NAMESPACE macro.


This patch further fixes two testisms that were brought to light by C++ 
testing added in this patch.
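
A small example of what should now work identically in C and C++ (a sketch; 
in C the dispatch goes through the _Generic-based macros, in C++ through the 
native overloads this patch adds):

#include "arm_mve.h"

uint32x4_t
scale (uint32x4_t v, uint32_t s)
{
  return vmulq (v, s);  /* resolves to vmulq_n_u32 in both languages */
}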


Regression tested on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-04-07  Andre Vieira  

    * config/arm/arm_mve.h: Add C++ polymorphism and fix
    preserve MACROs.

gcc/testsuite/ChangeLog:
2020-04-07  Andre Vieira  

    * g++.target/arm/mve.exp: New.
    * gcc.target/arm/mve/intrinsics/vcmpneq_n_f16.c: Fix testism.
    * gcc.target/arm/mve/intrinsics/vcmpneq_n_f32.c: Likewise.



[PATCH][GCC][Arm]: MVE Fix immediate constraints on some vector instructions

2020-04-07 Thread Andre Vieira (lists)

Hi,

This patch fixes the immediate checks on the vcvt and vqshr(u)n[bt] 
instructions.  It also removes 'arm_mve_immediate_check', as the check 
was wrong and its error message is not much better than the constraint 
one, which admittedly isn't great either.


Regression tested on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-04-07  Andre Vieira 

    * config/arm/arm.c (arm_mve_immediate_check): Remove.
    * config/arm/mve.md (mve_vcvtq_n_to_f_*, mve_vcvtq_n_from_f_*,
    mve_vqshrnbq_n_*, mve_vqshrntq_n_*,
    mve_vqshrunbq_n_s*, mve_vqshruntq_n_s*,
    mve_vcvtq_m_n_from_f_*, mve_vcvtq_m_n_to_f_*,
    mve_vqshrnbq_m_n_*, mve_vqrshruntq_m_n_s*, mve_vqshrunbq_m_n_s*,
    mve_vqshruntq_m_n_s*): Fixed immediate constraints.

gcc/testsuite/ChangeLog:
2020-04-07  Andre Vieira 

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 1af9d5cf145f6d01e364a1afd7ceb3df5da86c9a..cd0a49cdb63690d794981a73e1e7e0d47f6d1987 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -32693,31 +32693,6 @@ arm_simd_check_vect_par_cnst_half_p (rtx op, machine_mode mode,
   return true;
 }
 
-/* To check op's immediate values matches the mode of the defined insn.  */
-bool
-arm_mve_immediate_check (rtx op, machine_mode mode, bool val)
-{
-  if (val)
-{
-  if (((GET_CODE (op) == CONST_INT) && (INTVAL (op) <= 7)
-  && (mode == E_V16QImode))
- || ((GET_CODE (op) == CONST_INT) && (INTVAL (op) <= 15)
-  && (mode == E_V8HImode))
- || ((GET_CODE (op) == CONST_INT) && (INTVAL (op) <= 31)
-  && (mode == E_V4SImode)))
-   return true;
-}
-  else
-{
-  if (((GET_CODE (op) == CONST_INT) && (INTVAL (op) <= 7)
-  && (mode == E_V8HImode))
- || ((GET_CODE (op) == CONST_INT) && (INTVAL (op) <= 15)
-  && (mode == E_V4SImode)))
-   return true;
-}
-  return false;
-}
-
 /* Can output mi_thunk for all cases except for non-zero vcall_offset
in Thumb1.  */
 static bool
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 4a506cc3861534b4ddc30ba8f4f3c4ec28a8cc69..3c75f9ebc70d5765a59934b944955c757b6b2195 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -401,8 +401,10 @@ (define_int_attr mode1 [(VCTP8Q "8") (VCTP16Q "16") (VCTP32Q "32")
(VCTP64Q "64") (VCTP8Q_M "8") (VCTP16Q_M "16")
(VCTP32Q_M "32") (VCTP64Q_M "64")])
 (define_mode_attr MVE_pred2 [(V16QI "mve_imm_8") (V8HI "mve_imm_16")
-(V4SI "mve_imm_32")])
-(define_mode_attr MVE_constraint2 [(V16QI "Rb") (V8HI "Rd") (V4SI "Rf")])
+(V4SI "mve_imm_32")
+(V8HF "mve_imm_16") (V4SF "mve_imm_32")])
+(define_mode_attr MVE_constraint2 [(V16QI "Rb") (V8HI "Rd") (V4SI "Rf")
+   (V8HF "Rd") (V4SF "Rf")])
 (define_mode_attr MVE_LANES [(V16QI "16") (V8HI "8") (V4SI "4")])
 (define_mode_attr MVE_constraint [ (V16QI "Ra") (V8HI "Rc") (V4SI "Re")])
 (define_mode_attr MVE_pred [ (V16QI "mve_imm_7") (V8HI "mve_imm_15")
@@ -1330,7 +1332,7 @@ (define_insn "mve_vcvtq_n_to_f_<mode>"
   [
(set (match_operand:MVE_0 0 "s_register_operand" "=w")
(unspec:MVE_0 [(match_operand: 1 "s_register_operand" "w")
-  (match_operand:SI 2 "mve_imm_16" "Rd")]
+  (match_operand:SI 2 "<MVE_pred2>" "<MVE_constraint2>")]
 VCVTQ_N_TO_F))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
@@ -1389,7 +1391,7 @@ (define_insn "mve_vcvtq_n_from_f_<mode>"
   [
(set (match_operand:MVE_5 0 "s_register_operand" "=w")
(unspec:MVE_5 [(match_operand: 1 "s_register_operand" "w")
-  (match_operand:SI 2 "mve_imm_16" "Rd")]
+  (match_operand:SI 2 "<MVE_pred2>" "<MVE_constraint2>")]
 VCVTQ_N_FROM_F))
   ]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
@@ -5484,7 +5486,7 @@ (define_insn "mve_vqshrnbq_n_<supf><mode>"
(set (match_operand: 0 "s_register_operand" "=w")
(unspec: [(match_operand: 1 
"s_register_operand" "0")
   (match_operand:MVE_5 2 "s_register_operand" "w")
-  (match_operand:SI 3 "" "")]
+  (match_operand:SI 3 "" "")]
 VQSHRNBQ_N))
   ]
   "TARGET_HAVE_MVE"
@@ -5500,7 +5502,7 @@ (define_insn "mve_vqshrntq_n_<supf><mode>"
(set (match_operand: 0 "s_register_operand" "=w")
(unspec: [(match_operand: 1 
"s_register_operand" "0")
   (match_operand:MVE_5 2 "s_register_operand" "w")
-  (match_operand:SI 3 "mve_imm_8" "Rb")]
+  (match_operand:SI 3 "" "")]
 VQSHRNTQ_N))
   ]
   "TARGET_HAVE_MVE"
@@ -5516,7 +5518,7 @@ (define_insn "mve_vqshrunbq_n_s<mode>"
(set (match_operand: 0 "s_register_operand" "=w")
(unspec: [(match_operand: 1 
"s_register_operand" "0")
   (match_operand:MVE_5 2 "s_register_operand" "w")
-  (match_operand:SI 3 "immediate_operand" "i")]
+  (match_operand:SI 3 "" "")]
 VQSHRUNBQ_N_S))
   ]
   "TARGET_HAVE_MVE"
@@ -5532,7 

[PATCH][GCC][Arm]: MVE: Fix vec extracts to memory

2020-04-07 Thread Andre Vieira (lists)

Hi,

This patch fixes vec extracts to memory that can arise from code as seen 
in the testcase added. It does so by allowing mem operands in the set of 
the mve_vec_extract patterns; given the only constraint is '=r', this 
leads to the scalar value being written to a register and then stored to 
memory using a scalar store pattern.
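
The shape of code this covers (a sketch along the lines of the new test 
below):

#include "arm_mve.h"

uint32_t g;

void
store_lane (uint32x4_t v)
{
  /* The lane is moved to a core register ('=r'), then written back
     with a normal scalar store.  */
  g = vgetq_lane_u32 (v, 1);
}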


Regression tested on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-04-07  Andre Vieira  

    * config/arm/mve.md (mve_vec_extract*): Allow memory operands in set.


gcc/testsuite/ChangeLog:
2020-04-07  Andre Vieira  

    * gcc.target/arm/mve/intrinsics/mve_vec_extracts_from_memory.c: New test.


diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 3c75f9ebc70d5765a59934b944955c757b6b2195..c49c14c4240838ce086f424f58726e2e94cf190e 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -10993,7 +10993,7 @@ (define_insn "mve_vld4q"
 ;; [vgetq_lane_u, vgetq_lane_s, vgetq_lane_f])
 ;;
 (define_insn "mve_vec_extract"
- [(set (match_operand: 0 "s_register_operand" "=r")
+ [(set (match_operand: 0 "nonimmediate_operand" "=r")
(vec_select:
 (match_operand:MVE_VLD_ST 1 "s_register_operand" "w")
 (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
@@ -11011,7 +11011,7 @@ (define_insn "mve_vec_extract<mode><V_elem_l>"
  [(set_attr "type" "mve_move")])
 
 (define_insn "mve_vec_extractv2didi"
- [(set (match_operand:DI 0 "s_register_operand" "=r")
+ [(set (match_operand:DI 0 "nonimmediate_operand" "=r")
(vec_select:DI
 (match_operand:V2DI 1 "s_register_operand" "w")
 (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
@@ -11024,7 +11024,7 @@ (define_insn "mve_vec_extractv2didi"
   if (elt == 0)
return "vmov\t%Q0, %R0, %e1";
   else
-   return "vmov\t%J0, %K0, %f1";
+   return "vmov\t%Q0, %R0, %f1";
 }
  [(set_attr "type" "mve_move")])
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vec_extracts_from_memory.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vec_extracts_from_memory.c
new file mode 100644
index 0000000000000000000000000000000000000000..12f2f2d38d3c2e189a9c06f21fc63e2c23e2e721
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vec_extracts_from_memory.c
@@ -0,0 +1,40 @@
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O3" } */
+
+#include "arm_mve.h"
+
+uint8x16_t *vu8;
+int8x16_t *vs8;
+uint16x8_t *vu16;
+int16x8_t *vs16;
+uint32x4_t *vu32;
+int32x4_t *vs32;
+uint64x2_t *vu64;
+int64x2_t *vs64;
+float16x8_t *vf16;
+float32x4_t *vf32;
+uint8_t u8;
+uint16_t u16;
+uint32_t u32;
+uint64_t u64;
+int8_t s8;
+int16_t s16;
+int32_t s32;
+int64_t s64;
+float16_t f16;
+float32_t f32;
+
+void foo (void)
+{
+  u8 = vgetq_lane (*vu8, 1);
+  u16 = vgetq_lane (*vu16, 1);
+  u32 = vgetq_lane (*vu32, 1);
+  u64 = vgetq_lane (*vu64, 1);
+  s8 = vgetq_lane (*vs8, 1);
+  s16 = vgetq_lane (*vs16, 1);
+  s32 = vgetq_lane (*vs32, 1);
+  s64 = vgetq_lane (*vs64, 1);
+  f16 = vgetq_lane (*vf16, 1);
+  f32 = vgetq_lane (*vf32, 1);
+}


[PATCH][GCC][Arm]: MVE: Fixes for pointers used in intrinsics for c++

2020-04-07 Thread Andre Vieira (lists)

Hi,

This patch fixes the passing of some pointers to builtins that expect 
slightly different pointer types.  In C this didn't prove to be an issue, 
but when compiling for C++ gcc complains.
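
For example (a sketch of the failure mode: the builtin behind this intrinsic 
is declared with a __builtin_neon_di * parameter, and C++, unlike C, rejects 
the implicit conversion from int64_t *):

#include "arm_mve.h"

void
scatter (int64_t *base, uint64x2_t offsets, int64x2_t values)
{
  /* Compiled as C++, this failed with a pointer-conversion error
     before the casts below were added.  */
  vstrdq_scatter_offset_s64 (base, offsets, values);
}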


Regression tested on arm-none-eabi.

Is this OK for trunk?

2020-04-07  Andre Vieira  

    * config/arm/arm_mve.h: Cast some pointers to expected types.

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 44cb0d4a92f7f64d08c17722944d20bd6ea7048a..49c7fb95f17347d283c4df34a6875d686a3e3f09 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -12897,56 +12897,56 @@ __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_offset_p_s64 (int64_t * __base, uint64x2_t __offset, 
int64x2_t __value, mve_pred16_t __p)
 {
-  __builtin_mve_vstrdq_scatter_offset_p_sv2di (__base, __offset, __value, __p);
+  __builtin_mve_vstrdq_scatter_offset_p_sv2di ((__builtin_neon_di *) __base, 
__offset, __value, __p);
 }
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_offset_p_u64 (uint64_t * __base, uint64x2_t __offset, 
uint64x2_t __value, mve_pred16_t __p)
 {
-  __builtin_mve_vstrdq_scatter_offset_p_uv2di (__base, __offset, __value, __p);
+  __builtin_mve_vstrdq_scatter_offset_p_uv2di ((__builtin_neon_di *) __base, 
__offset, __value, __p);
 }
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_offset_s64 (int64_t * __base, uint64x2_t __offset, 
int64x2_t __value)
 {
-  __builtin_mve_vstrdq_scatter_offset_sv2di (__base, __offset, __value);
+  __builtin_mve_vstrdq_scatter_offset_sv2di ((__builtin_neon_di *) __base, 
__offset, __value);
 }
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_offset_u64 (uint64_t * __base, uint64x2_t __offset, 
uint64x2_t __value)
 {
-  __builtin_mve_vstrdq_scatter_offset_uv2di (__base, __offset, __value);
+  __builtin_mve_vstrdq_scatter_offset_uv2di ((__builtin_neon_di *) __base, 
__offset, __value);
 }
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_shifted_offset_p_s64 (int64_t * __base, uint64x2_t 
__offset, int64x2_t __value, mve_pred16_t __p)
 {
-  __builtin_mve_vstrdq_scatter_shifted_offset_p_sv2di (__base, __offset, 
__value, __p);
+  __builtin_mve_vstrdq_scatter_shifted_offset_p_sv2di ((__builtin_neon_di *) 
__base, __offset, __value, __p);
 }
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_shifted_offset_p_u64 (uint64_t * __base, uint64x2_t 
__offset, uint64x2_t __value, mve_pred16_t __p)
 {
-  __builtin_mve_vstrdq_scatter_shifted_offset_p_uv2di (__base, __offset, 
__value, __p);
+  __builtin_mve_vstrdq_scatter_shifted_offset_p_uv2di ((__builtin_neon_di *) 
__base, __offset, __value, __p);
 }
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_shifted_offset_s64 (int64_t * __base, uint64x2_t 
__offset, int64x2_t __value)
 {
-  __builtin_mve_vstrdq_scatter_shifted_offset_sv2di (__base, __offset, 
__value);
+  __builtin_mve_vstrdq_scatter_shifted_offset_sv2di ((__builtin_neon_di *) 
__base, __offset, __value);
 }
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vstrdq_scatter_shifted_offset_u64 (uint64_t * __base, uint64x2_t 
__offset, uint64x2_t __value)
 {
-  __builtin_mve_vstrdq_scatter_shifted_offset_uv2di (__base, __offset, 
__value);
+  __builtin_mve_vstrdq_scatter_shifted_offset_uv2di ((__builtin_neon_di *) 
__base, __offset, __value);
 }
 
 __extension__ extern __inline void
@@ -18968,14 +18968,14 @@ __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrhq_gather_shifted_offset_f16 (float16_t const * __base, uint16x8_t 
__offset)
 {
-  return __builtin_mve_vldrhq_gather_shifted_offset_fv8hf (__base, __offset);
+  return __builtin_mve_vldrhq_gather_shifted_offset_fv8hf ((__builtin_neon_hi 
*) __base, __offset);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrhq_gather_shifted_offset_z_f16 (float16_t const * __base, uint16x8_t 
__offset, mve_pred16_t __p)
 {
-  return __builtin_mve_vldrhq_gather_shifted_offset_z_fv8hf (__base, __offset, 
__p);
+  return __builtin_mve_vldrhq_gather_shifted_offset_z_fv8hf 
((__builtin_neon_hi *) __base, __offset, __p);
 }
 
 __extension__ extern __inline float32x4_t
@@ -19010,84 +19010,84 @@ __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vldrwq_gather_shifted_offset_f32 (float32_t const * __base, uint32x4_t 

[PATCH][GCC][Arm]: MVE: Fix constant load pattern

2020-04-07 Thread Andre Vieira (lists)

Hi,

This patch fixes the constant load pattern for MVE; it was not 
accounting correctly for label + offset cases.


Added a test that ICE'd before, and removed the scan-assemblers for the 
mve_vector* tests as they were too fragile.
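
The kind of source that trips the label + offset case (a sketch, not the 
exact new test: the constant vector is emitted to the literal pool and 
addressed as label + offset):

#include "arm_mve.h"

uint32x4_t
load_const (void)
{
  const uint32_t arr[4] = { 1, 2, 3, 4 };
  return vld1q_u32 (arr);
}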


Bootstrapped on arm-linux-gnueabihf and regression tested on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-04-07  Andre Vieira  

    * config/arm/arm.c (output_move_neon): Deal with label + offset cases.

    * config/arm/mve.md (*mve_mov<mode>): Handle const vectors.

gcc/testsuite/ChangeLog:
2020-04-07  Andre Vieira  

    * gcc.target/arm/mve/intrinsics/mve_load_from_array.c: New test.
    * gcc.target/arm/mve/intrinsics/mve_vector_float.c: Remove scan-assembler.

    * gcc.target/arm/mve/intrinsics/mve_vector_float1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int2.c: Likewise.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index d5207e0d8f07f9be5265fc6d175c148c6cdd53cb..1af9d5cf145f6d01e364a1afd7ceb3df5da86c9a 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -20122,52 +20122,43 @@ output_move_neon (rtx *operands)
  break;
}
   /* Fall through.  */
-case LABEL_REF:
 case PLUS:
+  addr = XEXP (addr, 0);
+  /* Fall through.  */
+case LABEL_REF:
   {
int i;
int overlap = -1;
-   if (TARGET_HAVE_MVE && !BYTES_BIG_ENDIAN
-   && GET_CODE (addr) != LABEL_REF)
+   for (i = 0; i < nregs; i++)
  {
-   sprintf (buff, "v%srw.32\t%%q0, %%1", load ? "ld" : "st");
-   ops[0] = reg;
-   ops[1] = mem;
-   output_asm_insn (buff, ops);
- }
-   else
- {
-   for (i = 0; i < nregs; i++)
+   /* We're only using DImode here because it's a convenient
+  size.  */
+   ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
+   ops[1] = adjust_address (mem, DImode, 8 * i);
+   if (reg_overlap_mentioned_p (ops[0], mem))
  {
-   /* We're only using DImode here because it's a convenient
-  size.  */
-   ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * i);
-   ops[1] = adjust_address (mem, DImode, 8 * i);
-   if (reg_overlap_mentioned_p (ops[0], mem))
- {
-   gcc_assert (overlap == -1);
-   overlap = i;
- }
-   else
- {
-   if (TARGET_HAVE_MVE && GET_CODE (addr) == LABEL_REF)
- sprintf (buff, "v%sr.64\t%%P0, %%1", load ? "ld" : "st");
-   else
- sprintf (buff, "v%sr%%?\t%%P0, %%1", load ? "ld" : "st");
-   output_asm_insn (buff, ops);
- }
+   gcc_assert (overlap == -1);
+   overlap = i;
  }
-   if (overlap != -1)
+   else
  {
-   ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * overlap);
-   ops[1] = adjust_address (mem, SImode, 8 * overlap);
if (TARGET_HAVE_MVE && GET_CODE (addr) == LABEL_REF)
- sprintf (buff, "v%sr.32\t%%P0, %%1", load ? "ld" : "st");
+ sprintf (buff, "v%sr.64\t%%P0, %%1", load ? "ld" : "st");
else
  sprintf (buff, "v%sr%%?\t%%P0, %%1", load ? "ld" : "st");
output_asm_insn (buff, ops);
  }
  }
+   if (overlap != -1)
+ {
+   ops[0] = gen_rtx_REG (DImode, REGNO (reg) + 2 * overlap);
+   ops[1] = adjust_address (mem, SImode, 8 * overlap);
+   if (TARGET_HAVE_MVE && GET_CODE (addr) == LABEL_REF)
+ sprintf (buff, "v%sr.32\t%%P0, %%1", load ? "ld" : "st");
+   else
+ sprintf (buff, "v%sr%%?\t%%P0, %%1", load ? "ld" : "st");
+   output_asm_insn (buff, ops);
+ }
 
 return "";
   }
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index d1028f4542b4972b4080e46544c86d625d77383a..10abc3fae3709891346b63213afb1fe3754af41a 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -695,9 +695,9 @@ (define_insn "*mve_mov<mode>"
 case 2:
   return "vmov\t%Q0, %R0, %e1  @ \;vmov\t%J0, %K0, %f1";
 case 4:
-  if ((TARGET_HAVE_MVE_FLOAT && VALID_MVE_SF_MODE (mode))
- || (MEM_P (operands[1])
- && GET_CODE (XEXP (operands[1], 0)) == LABEL_REF))
+  if (MEM_P (operands[1])
+ && (GET_CODE (XEXP (operands[1], 0)) == LABEL_REF
+ || GET_CODE (XEXP (operands[1], 0)) == CONST))
return output_move_neon (operands);
   else
return "vldrb.8 %q0, %E1";
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_load_from_array.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_load_from_array.c
new file mode 100644
index 

[PATCH][GCC][Arm]: MVE Don't use lsll for 32-bit shifts scalar

2020-04-07 Thread Andre Vieira (lists)

Hi,

After fixing the v[id]wdups by moving the wrap parameter into the top 
end of a DImode operand using a shift, I noticed we were using lsll for 
32-bit shifts in scalars where we don't need to: we can simply do a 
move, which is much better if we don't need to use the bottom part.


We can solve this in a better way, but for now this will do.
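
The case in question (a sketch):

unsigned long long
shift32 (unsigned long long x)
{
  /* A 64-bit shift by exactly 32: the low word becomes the high word
     and the low word is zeroed -- a register move, no lsll needed.  */
  return x << 32;
}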

Regression tested on arm-none-eabi.

Is this OK for trunk?

2020-04-07  Andre Vieira  

    * config/arm/arm.md (ashldi3): Don't use lsll for constant 32-bit
    shifts.

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 1a7ea0d701e5677965574d877d0fe4b2f5bc149f..6d5560398dae3d0ace0342b4907542d2a6865f70 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -4422,7 +4422,8 @@ (define_expand "ashldi3"
 operands[2] = force_reg (SImode, operands[2]);
 
   /* Armv8.1-M Mainline double shifts are not expanded.  */
-  if (arm_reg_or_long_shift_imm (operands[2], GET_MODE (operands[2])))
+  if (arm_reg_or_long_shift_imm (operands[2], GET_MODE (operands[2]))
+ && (REG_P (operands[2]) || INTVAL(operands[2]) != 32))
 {
  if (!reg_overlap_mentioned_p(operands[0], operands[1]))
emit_insn (gen_movdi (operands[0], operands[1]));


[PATCH][GCC][Arm]: MVE: Fix v[id]wdup's

2020-04-07 Thread Andre Vieira (lists)

Hi,

This patch fixes v[id]wdup intrinsics. They had two issues:
1) the predicated versions did not link the incoming inactive vector 
parameter to the output

2) The backend didn't enforce the wrap limit operand be in an odd register.

1) was fixed like we did for all other predicated intrinsics
2) requires a temporary hack where we pass the value in the top end of 
DImode operand. The proper fix would be to add a register CLASS but this 
interacted badly with other existing targets codegen.  We will look to 
fix this properly in GCC 11.
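
In isolation, the temporary encoding used throughout the diff below (a 
sketch): the 32-bit wrap limit __b travels in the upper half of a 64-bit 
value, so the backend can split the DImode operand into an even/odd register 
pair and find the limit in the odd register:

  uint64_t __c = ((uint64_t) __b) << 32;  /* __b now lives in bits 63:32 */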


Regression tested on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-04-07  Andre Vieira  

    * config/arm/arm_mve.h: Fix v[id]wdup intrinsics.
    * config/arm/mve.md: Fix v[id]wdup patterns.

diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index e31c2e8fdc4f500bf9408d05ad86e151397627f7..47eead71d9515b4103a5b66999a3f9357dc3c3be 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -13585,29 +13585,33 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_n_u8 (uint8x16_t __inactive, uint32_t __a, uint32_t __b, const 
int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_n_u32 (uint32x4_t __inactive, uint32_t __a, uint32_t __b, 
const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv4si (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv4si (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_n_u16 (uint16x8_t __inactive, uint32_t __a, uint32_t __b, 
const int __imm, mve_pred16_t __p)
 {
-  return __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, __a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, __a, __c, __imm, __p);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_wb_u8 (uint8x16_t __inactive, uint32_t * __a, uint32_t __b, 
const int __imm, mve_pred16_t __p)
 {
-  uint8x16_t __res =  __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, *__a, __b, 
__imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint8x16_t __res =  __builtin_mve_vdwdupq_m_n_uv16qi (__inactive, *__a, __c, 
__imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv16qi (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13615,8 +13619,9 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_wb_u32 (uint32x4_t __inactive, uint32_t * __a, uint32_t __b, 
const int __imm, mve_pred16_t __p)
 {
-  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__inactive, *__a, __b, 
__imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint32x4_t __res =  __builtin_mve_vdwdupq_m_n_uv4si (__inactive, *__a, __c, 
__imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv4si (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13624,8 +13629,9 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_m_wb_u16 (uint16x8_t __inactive, uint32_t * __a, uint32_t __b, 
const int __imm, mve_pred16_t __p)
 {
-  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, *__a, __b, 
__imm, __p);
-  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__inactive, *__a, __b, __imm, __p);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  uint16x8_t __res =  __builtin_mve_vdwdupq_m_n_uv8hi (__inactive, *__a, __c, 
__imm, __p);
+  *__a = __builtin_mve_vdwdupq_m_wb_uv8hi (__inactive, *__a, __c, __imm, __p);
   return __res;
 }
 
@@ -13633,29 +13639,33 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_n_u8 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_vdwdupq_n_uv16qi (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_n_uv16qi (__a, __c, __imm);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vdwdupq_n_u32 (uint32_t __a, uint32_t __b, const int __imm)
 {
-  return __builtin_mve_vdwdupq_n_uv4si (__a, __b, __imm);
+  uint64_t __c = ((uint64_t) __b) << 32;
+  return __builtin_mve_vdwdupq_n_uv4si (__a, __c, __imm);
 }
 
 __extension__ 

Re: [PATCH][GCC][Arm]: MVE: Fix constant load pattern

2020-04-07 Thread Andre Vieira (lists)
The diff looks weird, but this only removes the first if 
(TARGET_HAVE_MVE...) block and updates the variable 'addr', which is 
only used in the subsequent (TARGET_HAVE_MVE...) blocks.  So it doesn't 
change NEON codegen.


It's unfortunate the diff looks so complicated :(

Cheers,
Andre

On 07/04/2020 11:52, Kyrylo Tkachov wrote:



-Original Message-
From: Andre Vieira (lists) 
Sent: 07 April 2020 11:35
To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm]: MVE: Fix constant load pattern

Hi,

This patch fixes the constant load pattern for MVE, this was not
accounting correctly for label + offset cases.

Added test that ICE'd before and removed the scan assemblers for the
mve_vector* tests as they were too fragile.

Bootstrapped on arm-linux-gnueabihf and regression tested on arm-none-
eabi.

Is this OK for trunk?

This makes me a bit nervous as it touches the common output_move_neon code ☹ 
but it looks like it mostly shuffles things around.
Ok for trunk but please watch out for fallout.
Thanks,
Kyrill


gcc/ChangeLog:
2020-04-07  Andre Vieira  

      * config/arm/arm.c (output_move_neon): Deal with label + offset
cases.
      * config/arm/mve.md (*mve_mov): Handle const vectors.

gcc/testsuite/ChangeLog:
2020-04-07  Andre Vieira  

      * gcc.target/arm/mve/intrinsics/mve_load_from_array.c: New test.
      * gcc.target/arm/mve/intrinsics/mve_vector_float.c: Remove
scan-assembler.
      * gcc.target/arm/mve/intrinsics/mve_vector_float1.c: Likewise.
      * gcc.target/arm/mve/intrinsics/mve_vector_int1.c: Likewise.
      * gcc.target/arm/mve/intrinsics/mve_vector_int2.c: Likewise.


Re: [testsuite][arm] Fix cmse-15.c expected output

2020-04-07 Thread Andre Vieira (lists)

On 06/04/2020 16:12, Christophe Lyon via Gcc-patches wrote:

Hi,

While checking Martin's fix for PR ipa/94445, he made me realize that
the cmse-15.c testcase still fails at -Os because ICF means that we
generate
nonsecure2:
 b   nonsecure0

which is OK, but does not match the currently expected
nonsecure2:
...
 bl  __gnu_cmse_nonsecure_call

(see https://gcc.gnu.org/pipermail/gcc-patches/2020-April/543190.html)

The test has already different expectations for v8-M and v8.1-M.

I've decided to try to use check-function-bodies to account for the
different possibilities:
- v8-M vs v8.1-M via two different prefixes
- code generation variants (-0?) via multiple regexps

I've tested that the test now passes with --target-board=-march=armv8-m.main
and --target-board=-march=armv8.1-m.main.

I feel this is a bit too much of a burden for the purpose; maybe there's
a better way of handling all these alternatives (in particular, there's
a lot of duplication, since the expected code for the secure* functions
is the same for v8-M and v8.1-M).

OK?

Thanks,

Christophe

Hi Christophe,

This check-function-bodies functionality is pretty sweet; I assume the 
( A | B ) syntax checks for either of them?
If so that looks like a good improvement. Ideally we'd also check the 
clearing for the v8.1-M cases, but that wasn't there before either and 
they would need again splitting for -mfloat-abi=soft+softfp and 
-mfloat-abi=hard.



So yeah this LGTM but you need approval from a port/global maintainer.

Cheers,
Andre


Re: [PATCH][GCC][Arm]: MVE: Fix polymorphism for scalars and constants

2020-04-07 Thread Andre Vieira (lists)

Now with the zipped patch so it reaches the mailing list.

Sorry for that.

On 07/04/2020 09:57, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 07 April 2020 09:57
To: Kyrylo Tkachov ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH][GCC][Arm]: MVE: Fix polymorphism for scalars and
constants

Hi,

I rebased this patch and made some extra fixes.

This patch merges some polymorphic functions that were incorrectly
separating scalar variants. It also simplifies the way we detect
scalars and constants in mve_typeid.

I also fixed some polymorphic intrinsics that were splitting off scalar cases.

Regression tested for arm-none-eabi.

Is this OK for trunk?

Ok.
Thanks,
Kyrill


2020-04-07  Andre Vieira  

      * config/arm/arm_mve.h (vsubq_n): Merge with...
      (vsubq): ... this.
      (vmulq_n): Merge with...
      (vmulq): ... this.
      (__ARM_mve_typeid): Simplify scalar and constant detection.

2020-04-07  Andre Vieira  

      * gcc.target/arm/mve/intrinsics/vmulq_n_f16.c: Fix test.
      * gcc.target/arm/mve/intrinsics/vmulq_n_f32.c: Likewise.
      * gcc.target/arm/mve/intrinsics/vmulq_n_s16.c: Likewise.
      * gcc.target/arm/mve/intrinsics/vmulq_n_s32.c: Likewise.
      * gcc.target/arm/mve/intrinsics/vmulq_n_s8.c: Likewise.
      * gcc.target/arm/mve/intrinsics/vmulq_n_u16.c: Likewise.
      * gcc.target/arm/mve/intrinsics/vmulq_n_u32.c: Likewise.
      * gcc.target/arm/mve/intrinsics/vmulq_n_u8.c: Likewise.

On 02/04/2020 10:58, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 02 April 2020 09:22
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm]: MVE: Fix polymorphism for scalars and
constants

Hi,

This patch merges some polymorphic functions that were incorrectly
separating scalar variants. It also simplifies the way we detect
scalars and constants in mve_typeid.

Regression tested for arm-none-eabi.

Is this OK for trunk?

Ok.
Thanks,
Kyrill


2020-04-02  Andre Vieira  

       * config/arm/arm_mve.h (vsubq_n): Merge with...
       (vsubq): ... this.
       (vmulq_n): Merge with...
       (vmulq): ... this.
       (__ARM_mve_typeid): Simplify scalar and constant detection.


Re: [PATCH 3/5] testsuite: [arm/mve] Fix mve_move_gpr_to_gpr.c

2020-04-14 Thread Andre Vieira (lists)

On 10/04/2020 14:55, Christophe Lyon via Gcc-patches wrote:

This test can pass with a hard-float toolchain, provided we don't
force -mfloat-abi=softfp.

This patch removes this useless option, as well as -save-temps which
is implied by arm_v8_1m_mve_fp.

Hi Christophe,

LGTM, but you need to wait for maintainer approval.

Cheers,
Andre


2020-04-10  Christophe Lyon  

gcc/testsuite/
* gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c: Remove
useless options.
---
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c
index 374bc4d..53300e5 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c
@@ -1,6 +1,6 @@
  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
  /* { dg-add-options arm_v8_1m_mve_fp } */
-/* { dg-additional-options "-O2 -mfloat-abi=softfp --save-temps" } */
+/* { dg-additional-options "-O2" } */
  
  #include "arm_mve.h"
  


Re: [PATCH 5/5] testsuite: [arm/mve] Include arm_mve.h in arm_v8_1m_mve_ok

2020-04-14 Thread Andre Vieira (lists)



On 10/04/2020 14:55, Christophe Lyon via Gcc-patches wrote:

Since arm_mve.h includes stdint.h, its use requires the presence of
the right gnu/stub-*.h, so make sure to include it when checking the
arm_v8_1m_mve_ok_nocache effective target, otherwise we can decide MVE
is supported when it really isn't. This makes several tests
unsupported rather than failing.

Hi Christophe,

LGTM, but you need to wait for maintainer approval.

Cheers,
Andre


2020-04-10  Christophe Lyon  

gcc/testsuite/
* lib/target-supports.exp
(check_effective_target_arm_v8_1m_mve_ok_nocache): Include
arm_mve.h.
---
  gcc/testsuite/lib/target-supports.exp | 1 +
  1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 6c8dd01..d16498d 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4965,6 +4965,7 @@ proc check_effective_target_arm_v8_1m_mve_ok_nocache { } {
#if __ARM_BIG_ENDIAN
#error "MVE intrinsics are not supported in Big-Endian mode."
#endif
+   #include <arm_mve.h>
  } "$flags -mthumb"] } {
  set et_arm_v8_1m_mve_flags "$flags -mthumb --save-temps"
  return 1


Re: [PATCH 4/5] testsuite: [arm/mve] Use dg-add-options arm_v8_1m_mve in MVE tests

2020-04-14 Thread Andre Vieira (lists)

On 10/04/2020 14:55, Christophe Lyon via Gcc-patches wrote:

Several ARM/MVE tests can be compiled even if the toolchain does not
support -mfloat-abi=hard (softfp is OK).

Use dg-add-options arm_v8_1m_mve or arm_v8_1m_mve_fp instead of using
dg-additional-options.

Hi Christophe,

I think a bunch of these tests were initially meant to test hard float 
abi with vectors, especially in the MVE integer cases, this is what the 
scan-assemblers are meant to test. However, they seem to pass for 
float-abi=softfp too ... which means these tests don't really mean 
anything. I think it would be good to remove the scan-assembler tests 
for now and improve these tests later with run tests, or some smarter 
function body testing.


I suggest we apply your changes and I can follow up with a patch to 
remove the scan-assemblers for now, if a maintainer agrees with me that is.


Cheers,
Andre

2020-04-10  Christophe Lyon  

gcc/testsuite/
* gcc.target/arm/mve/intrinsics/mve_vector_float.c: Use
arm_v8_1m_mve_fp.
* gcc.target/arm/mve/intrinsics/mve_vector_float1.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vector_float2.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vector_int.c: Use
arm_v8_1m_mve.
* gcc.target/arm/mve/intrinsics/mve_vector_int1.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vector_int2.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vector_uint.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vector_uint1.c: Likewise.
* gcc.target/arm/mve/intrinsics/mve_vector_uint2.c: Likewise.
---
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c| 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int1.c   | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int2.c   | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_uint.c   | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_uint1.c  | 2 +-
  gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_uint2.c  | 2 +-
  9 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c
index 881157f..6519b81 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c
@@ -1,6 +1,6 @@
  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
  /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } 
{""} } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
  
  #include "arm_mve.h"
  
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c

index 9515ed6..855e3b8 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c
@@ -1,6 +1,6 @@
  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
  /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } 
{""} } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
  
  #include "arm_mve.h"
  
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c

index 3ce8ea3..e3cf8f8 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c
@@ -1,6 +1,6 @@
  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
  /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } 
{""} } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
  
  #include "arm_mve.h"
  
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c

index dab0705..e70cbc1 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c
@@ -1,6 +1,6 @@
  /* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
  /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } 
{""} } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfpu=auto -mfloat-abi=hard 
-mthumb --save-temps" } */
  
  #include "arm_mve.h"
  
diff --git 

Re: [PATCH 1/5] testsuite: [arm] Add arm_softfp_ok and arm_hard_ok effective targets.

2020-04-14 Thread Andre Vieira (lists)

On 10/04/2020 14:55, Christophe Lyon via Gcc-patches wrote:

For arm-linux-gnueabi* targets, a toolchain cannot support the
float-abi opposite to the one it has been configured for: since glibc
does not support such multilibs, we end up lacking gnu/stubs-*.h when
including stdint.h for instance.

This patch introduces two new effective targets to detect whether we
can compile tests with -mfloat-abi=softfp or -mfloat-abi=hard.

This makes it possible to mark such tests unsupported rather than failing.

Hi Christophe,

LGTM, but you need to wait for maintainer approval.

Cheers,
Andre

2020-04-10  Christophe Lyon  

gcc/testsuite/
* lib/target-supports.exp (arm_softfp_ok): New effective target.
(arm_hard_ok): Likewise.
---
  gcc/testsuite/lib/target-supports.exp | 20 
  1 file changed, 20 insertions(+)

diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 3758bb3..6c8dd01 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4739,6 +4739,26 @@ proc check_effective_target_default_branch_protection { 
} {
  return [check_configured_with "enable-standard-branch-protection"]
  }
  
+# Return 1 if this is an ARM target supporting -mfloat-abi=softfp.
+
+proc check_effective_target_arm_softfp_ok { } {
+return [check_no_compiler_messages arm_softfp_ok object {
+   #include <stdint.h>
+   int dummy;
+   int main (void) { return 0; }
+   } "-mfloat-abi=softfp"]
+}
+
+# Return 1 if this is an ARM target supporting -mfloat-abi=hard.
+
+proc check_effective_target_arm_hard_ok { } {
+return [check_no_compiler_messages arm_hard_ok object {
+   #include <stdint.h>
+   int dummy;
+   int main (void) { return 0; }
+   } "-mfloat-abi=hard"]
+}
+
  # Return 1 if the target supports ARMv8.1-M MVE with floating point
  # instructions, 0 otherwise.  The test is valid for ARM.
  # Record the command line options needed.


Re: [PATCH 2/5] testsuite: [arm/mve] Use arm_softfp and arm_hard as needed in MVE tests

2020-04-14 Thread Andre Vieira (lists)

On 10/04/2020 14:55, Christophe Lyon via Gcc-patches wrote:

Some MVE tests explicitly test a -mfloat-abi=hard option, but we need
to check that the toolchain actually supports it (which may not be the
case for arm-linux-gnueabi* targets).

We also make use of dg-add-options arm_v8_1m_mve_fp and arm_v8_1m_mve
instead of duplicating the corresponding options in
dg-additional-options where we keep only -mfloat-abi to override the
option selected by arm_v8_1m_mve_fp.

Hi Christophe,

This sounds good!! Thank you for doing this.

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
index 1462dd4..0fa3afd 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
@@ -1,6 +1,8 @@
  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-require-effective-target arm_hard_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
  /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } 
{""} } */

I was wondering, do we still need the skip-if with the arm_hard_ok?

-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=hard -mthumb 
-mfpu=auto --save-temps" } */
+/* { dg-additional-options "-mfloat-abi=hard" } */
  
  #include "arm_mve.h"
  
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c

index d528133..1fca110 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
@@ -1,5 +1,7 @@
  /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=softfp -mthumb 
-mfpu=auto --save-temps" } */
+/* { dg-require-effective-target arm_softfp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-mfloat-abi=softfp" } */
  
  #include "arm_mve.h"
  
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c

index 59ca724..726f9ec 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c
@@ -1,6 +1,8 @@
  /* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_hard_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
  /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } 
{""} } */

Same here.

-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=hard -mthumb 
-mfpu=auto --save-temps" } */
+/* { dg-additional-options "-mfloat-abi=hard" } */
  
  #include "arm_mve.h"
  
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c

index ce297ea..7f39905 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c
@@ -1,6 +1,8 @@
  /* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_softfp_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
  /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } 
{""} } */

And here.

-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=softfp -mthumb 
-mfpu=auto --save-temps" } */
+/* { dg-additional-options "-mfloat-abi=softfp" } */
  
  #include "arm_mve.h"
  

LGTM.


[committed][GCC][Arm]: MVE: Fix unintended change to tests

2020-04-03 Thread Andre Vieira (lists)
When committing my last patch I accidentally removed -mfpu=auto from the 
following tests. This puts it back.


2020-04-03  Andre Vieira  

    * gcc.target/arm/mve/intrinsics/mve_vector_float.c: Put 
-mfpu=auto back.

    * gcc.target/arm/mve/intrinsics/mve_vector_float1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_float2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_uint.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_uint1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_uint2.c: Likewise.

diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c
index 
a6f95a63859c8b130f2c63f788cbea766dd8c5b2..9de47e6a1e0214fef26630b7959f11e58809d2c0
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=hard 
-mthumb --save-temps" } */
+/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c
index 
7745eecacca5faa082d3440c50585f62fd34c0af..ba8fb6dd5da4464dd8e58e837d84540acd1d
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float1.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=hard 
-mthumb --save-temps" } */
+/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c
index 
02653f08359176a245234448e1273fe106e324e3..3ce8ea3b303509df1ecd8096b990ea9b02846c79
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_float2.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
 /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=hard 
-mthumb --save-temps" } */
+/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c
index 
5c009ba746278e2bc742d5a89ca510f0899b5db2..dab07051bda3b823b2643d8d0c6aa266515a84c2
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
 /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=hard -mthumb 
--save-temps" } */
+/* { dg-additional-options "-march=armv8.1-m.main+mve -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int1.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int1.c
index 
50f0bd1efa52b4b2aaf505664b3e571309a26bd3..2d2fd116dfcfcdb04220e92090706c362350e8d2
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int1.c
@@ -1,6 +1,6 @@
 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
 /* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=hard -mthumb 
--save-temps" } */
+/* { dg-additional-options "-march=armv8.1-m.main+mve -mfpu=auto 
-mfloat-abi=hard -mthumb --save-temps" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int2.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int2.c
index 
11540966944614142bf46ea6953a27dff45187d4..7ec85866993f9919fc6945d056a9cf8fcf361300
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_vector_int2.c
+++ 

[PATCH][GCC][Arm]: Do not process rest of MVE header file after unsupported error

2020-04-02 Thread Andre Vieira (lists)

Hi,

This patch makes sure the rest of the header file is not parsed if MVE 
is not supported.  The user should not be including this file if MVE is 
not supported; nevertheless, making sure the rest of the header file is 
not parsed will save the user from a huge and rather useless error 
output.


Is this OK for trunk?

gcc/ChangeLog:
2020-04-02  Andre Vieira  

    * config/arm/arm_mve.h: Condition the header file on 
__ARM_FEATURE_MVE.


diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 
f1dcdc2153217e796c58526ba0e5be11be642234..1ce55bd2fc4f5c6a171ffe116d7fd9029e11a619
 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -24,11 +24,9 @@
 
 #if __ARM_BIG_ENDIAN
 #error "MVE intrinsics are not supported in Big-Endian mode."
-#endif
-
-#if !__ARM_FEATURE_MVE
+#elif !__ARM_FEATURE_MVE
 #error "MVE feature not supported"
-#endif
+#else
 
 #include 
 #ifndef  __cplusplus
@@ -27554,4 +27552,5 @@ extern void *__ARM_undef;
 }
 #endif
 
+#endif /* __ARM_FEATURE_MVE  */
 #endif /* _GCC_ARM_MVE_H.  */


[PATCH][GCC-8][Aarch64]: Fix for PR target/94814

2020-04-28 Thread Andre Vieira (lists)

Hi,

Backport of PR target/94518: Fix memmodel index in 
aarch64_store_exclusive_pair


This fixes bootstrap with --enable-checking=yes,rtl for aarch64.

OK for gcc-8?

Cheers,
Andre

gcc/ChangeLog:
2020-04-28  Andre Vieira  

    PR target/94814
    Backport from gcc-9.
    2020-04-07  Kyrylo Tkachov  

    PR target/94518
    2019-09-23  Richard Sandiford 

    * config/aarch64/atomics.md (aarch64_store_exclusive_pair): Fix
    memmodel index.

diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
index 
1005462ae23aa13dbc3013a255aa189096e33366..0e0b03731922d8e50e8468de94e0ff345d10c32f
 100644
--- a/gcc/config/aarch64/atomics.md
+++ b/gcc/config/aarch64/atomics.md
@@ -752,7 +752,7 @@
  UNSPECV_SX))]
   ""
   {
-enum memmodel model = memmodel_from_int (INTVAL (operands[3]));
+enum memmodel model = memmodel_from_int (INTVAL (operands[4]));
 if (is_mm_relaxed (model) || is_mm_consume (model) || is_mm_acquire 
(model))
   return "stxp\t%w0, %x2, %x3, %1";
 else


[PATCH][GCC][Arm]: Fix bootstrap failure with rtl-checking

2020-04-27 Thread Andre Vieira (lists)

Hi,

The code change that caused this regression was not meant to affect neon 
code-gen; however, I missed the REG fall-through.  This patch makes sure 
we only take the left-hand operand of the PLUS if it is indeed a PLUS expr.


I suggest that in gcc-11 this code is cleaned up, as I do not think we 
even need the overlap checks; NEON only loads from or stores to FP 
registers, and these can't be used in its addressing modes.


Bootstrapped arm-linux-gnueabihf with '--enable-checking=yes,rtl' for 
armv7-a and armv8-a.


Is this OK for trunk?

gcc/ChangeLog:
2020-04-27  Andre Vieira  

    * config/arm/arm.c (output_move_neon): Only get the first operand,
    if addr is PLUS.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
0151bda90d961ae1a001c61cd5e94d6ec67e3aea..74454dddbb948a5d37f502e8e2146a81cb83d58b
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -20145,7 +20145,8 @@ output_move_neon (rtx *operands)
}
   /* Fall through.  */
 case PLUS:
-  addr = XEXP (addr, 0);
+  if (GET_CODE (addr) == PLUS)
+   addr = XEXP (addr, 0);
   /* Fall through.  */
 case LABEL_REF:
   {


[PATCH][wwwdocs] Add -moutline-atomics for AArch64 on gcc-9 and gcc-8

2020-04-24 Thread Andre Vieira (lists)
Add the backported functionality of -moutline-atomics for AArch64 to the 
gcc-9 and gcc-8 changes.html


Validates. Is this OK?
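
To make the documented behaviour concrete, a hedged sketch (not part of the
wwwdocs change itself); the helper name follows the libgcc outline-atomics
convention:

/* Sketch: built with -O2 -moutline-atomics on AArch64, the atomic below
   compiles to a call to a libgcc helper (here __aarch64_ldadd4_acq_rel)
   that tests __aarch64_have_lse_atomics at runtime and uses LSE LDADD
   when available, falling back to an LL/SC loop otherwise.  */
#include <stdatomic.h>

int
counter_next (atomic_int *counter)
{
  return atomic_fetch_add (counter, 1);
}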
diff --git a/htdocs/gcc-8/changes.html b/htdocs/gcc-8/changes.html
index 
83dd1bc010a6e4debb76790b3fe62275bf0e0657..83e57db181294110f71a5d59960fb4d3fed7be98
 100644
--- a/htdocs/gcc-8/changes.html
+++ b/htdocs/gcc-8/changes.html
@@ -1394,5 +1394,22 @@ known to be fixed in the 8.4 release. This list might 
not be
 complete (that is, it is possible that some PRs that have been fixed
 are not listed here).
 
+
+<h2>GCC 8.5</h2>
+
+<h3>Target Specific Changes</h3>
+
+<h4>AArch64</h4>
+  <ul>
+    <li>
+      The option <code>-moutline-atomics</code> has been added to aid
+      deployment of the Large System Extensions (LSE) on GNU/Linux systems built
+      with a baseline architecture targeting Armv8-A.  When the option is
+      specified code is emitted to detect the presence of LSE instructions at
+      runtime and use them for standard atomic operations.
+      For more information please refer to the documentation.
+    </li>
+  </ul>
+
 
 
diff --git a/htdocs/gcc-9/changes.html b/htdocs/gcc-9/changes.html
index 
74c7cde72ef5ab8ec059e20a8da3e46907ecd9a3..a2a28a9aeb851cae298e828d2c4b57c6fa414cf4
 100644
--- a/htdocs/gcc-9/changes.html
+++ b/htdocs/gcc-9/changes.html
@@ -1132,5 +1132,21 @@ complete (that is, it is possible that some PRs that 
have been fixed
 are not listed here).
 
 
+<h2>GCC 9.4</h2>
+
+<h3>Target Specific Changes</h3>
+
+<h4>AArch64</h4>
+  <ul>
+    <li>
+      The option <code>-moutline-atomics</code> has been added to aid
+      deployment of the Large System Extensions (LSE) on GNU/Linux systems built
+      with a baseline architecture targeting Armv8-A.  When the option is
+      specified code is emitted to detect the presence of LSE instructions at
+      runtime and use them for standard atomic operations.
+      For more information please refer to the documentation.
+    </li>
+  </ul>
+
 
 


Re: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

2020-04-22 Thread Andre Vieira (lists)



On 20/04/2020 09:42, Kyrylo Tkachov wrote:

Hi Andre,


-Original Message-
From: Andre Vieira (lists) 
Sent: 16 April 2020 13:24
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Sandiford
; s...@amazon.com
Subject: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

Hi,

This series backports all the patches and fixes regarding outline
atomics to the gcc-8 branch.

Bootstrapped the series for aarch64-linux-gnu and regression tested.
Is this OK for gcc-8?

Andre Vieira (19):
aarch64: Add early clobber for aarch64_store_exclusive
aarch64: Simplify LSE cas generation
aarch64: Improve cas generation
aarch64: Improve swp generation
aarch64: Improve atomic-op lse generation
aarch64: Remove early clobber from ATOMIC_LDOP scratch
aarch64: Extend %R for integer registers
aarch64: Implement TImode compare-and-swap
aarch64: Tidy aarch64_split_compare_and_swap
aarch64: Add out-of-line functions for LSE atomics
Add visibility to libfunc constructors
aarch64: Implement -moutline-atomics
Aarch64: Fix shrinkwrapping interactions with atomics (PR92692)
aarch64: Fix store-exclusive in load-operate LSE helpers
aarch64: Configure for sys/auxv.h in libgcc for lse-init.c
aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]
aarch64: Fix bootstrap with old binutils [PR93053]

Thanks for putting these together.
Before they can go in we need to get this fix for PR93053 into GCC 9.
Can you please test it on that branch to help Jakub out?
Thanks,
Kyrill
Bootstrapped and regression tested the PR93053 fix from Jakub on gcc-9 
branch and it looks good.

aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]
re PR target/90724 (ICE with __sync_bool_compare_and_swap with
-march=armv8.2-a+sve)




[committed][gcc-9] aarch64: Fix bootstrap with old binutils [PR93053]

2020-04-22 Thread Andre Vieira (lists)

Went ahead and committed the backport to gcc-9.

As reported in the PR, GCC 10 (and also 9.3.1 but not 9.3.0) fails to build
when using older binutils which lack LSE support, because those instructions
are used in libgcc.
Thanks to Kyrylo's hint, the following patches (hopefully) allow it to build
even with older binutils by using .inst directive if LSE support isn't
available in the assembler.
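
The .inst trick itself is simple enough to show in isolation. A sketch,
not the lse.S code; 0xd503201f is just the NOP encoding, chosen as a safe
example:

/* Sketch: .inst emits a raw 32-bit encoding, so an instruction can be
   produced even when the assembler does not know its mnemonic.  lse.S
   does this with the LSE opcodes when HAVE_AS_LSE is not defined.  */
static inline void
inst_demo (void)
{
  __asm__ volatile (".inst 0xd503201f");  /* same encoding as "nop" */
}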

2020-04-22  Andre Vieira  

    Backport from mainline.
    2020-04-15  Jakub Jelinek  

    PR target/93053
    * configure.ac (LIBGCC_CHECK_AS_LSE): Add HAVE_AS_LSE checking.
    * config/aarch64/lse.S: Include auto-target.h, if HAVE_AS_LSE
    is not defined, use just .arch armv8-a.
    (B, M, N, OPN): Define.
    (COMMENT): New .macro.
    (CAS, CASP, SWP, LDOP): Use .inst directive if HAVE_AS_LSE is not
    defined.  Otherwise, move the operands right after the glue? and
    comment out operands where the macros are used.
    * configure: Regenerated.
    * config.in: Regenerated.

On 22/04/2020 10:59, Kyrylo Tkachov wrote:

Hi Andre,


-Original Message-
From: Andre Vieira (lists) 
Sent: 22 April 2020 09:26
To: Kyrylo Tkachov ; gcc-patches@gcc.gnu.org
Cc: Richard Sandiford ; s...@amazon.com
Subject: Re: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics


On 20/04/2020 09:42, Kyrylo Tkachov wrote:

Hi Andre,


-Original Message-
From: Andre Vieira (lists) 
Sent: 16 April 2020 13:24
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov ; Richard Sandiford
; s...@amazon.com
Subject: [PATCH 0/19][GCC-8] aarch64: Backport outline atomics

Hi,

This series backports all the patches and fixes regarding outline
atomics to the gcc-8 branch.

Bootstrapped the series for aarch64-linux-gnu and regression tested.
Is this OK for gcc-8?

Andre Vieira (19):
aarch64: Add early clobber for aarch64_store_exclusive
aarch64: Simplify LSE cas generation
aarch64: Improve cas generation
aarch64: Improve swp generation
aarch64: Improve atomic-op lse generation
aarch64: Remove early clobber from ATOMIC_LDOP scratch
aarch64: Extend %R for integer registers
aarch64: Implement TImode compare-and-swap
aarch64: Tidy aarch64_split_compare_and_swap
aarch64: Add out-of-line functions for LSE atomics
Add visibility to libfunc constructors
aarch64: Implement -moutline-atomics
Aarch64: Fix shrinkwrapping interactions with atomics (PR92692)
aarch64: Fix store-exclusive in load-operate LSE helpers
aarch64: Configure for sys/auxv.h in libgcc for lse-init.c
aarch64: Fix up aarch64_compare_and_swaphi pattern [PR94368]
aarch64: Fix bootstrap with old binutils [PR93053]

Thanks for putting these together.
Before they can go in we need to get this fix for PR93053 into GCC 9.
Can you please test it on that branch to help Jakub out?
Thanks,
Kyrill

Bootstrapped and regression tested the PR93053 fix from Jakub on gcc-9
branch and it looks good.

Thanks, can you please apply the patch to the gcc-9 branch then? (making sure 
the PR markers are there in the commit message so that Bugzilla is updated).
We can then proceed with the GCC 8 backports.

Kyrill


aarch64: Fix ICE due to aarch64_gen_compare_reg_maybe_ze [PR94435]
re PR target/90724 (ICE with __sync_bool_compare_and_swap with
-march=armv8.2-a+sve)
diff --git a/libgcc/config.in b/libgcc/config.in
index 
59a3d8daf52e72e548d3d9425d6043d5e0c663ad..5be5321d2584392bac1ec3af779cd96823212902
 100644
--- a/libgcc/config.in
+++ b/libgcc/config.in
@@ -10,6 +10,9 @@
*/
 #undef HAVE_AS_CFI_SECTIONS
 
+/* Define to 1 if the assembler supports LSE. */
+#undef HAVE_AS_LSE
+
 /* Define to 1 if the target assembler supports thread-local storage. */
 #undef HAVE_CC_TLS
 
diff --git a/libgcc/config/aarch64/lse.S b/libgcc/config/aarch64/lse.S
index 
c7979382ad7770b61bb1c64d32ba2395963a9d7a..f7f1c19587beaec2ccb6371378d54d50139ba1c9
 100644
--- a/libgcc/config/aarch64/lse.S
+++ b/libgcc/config/aarch64/lse.S
@@ -48,8 +48,14 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If 
not, see
  * separately to minimize code size.
  */
 
+#include "auto-target.h"
+
 /* Tell the assembler to accept LSE instructions.  */
+#ifdef HAVE_AS_LSE
.arch armv8-a+lse
+#else
+   .arch armv8-a
+#endif
 
 /* Declare the symbol gating the LSE implementations.  */
.hidden __aarch64_have_lse_atomics
@@ -58,12 +64,19 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 #if SIZE == 1
 # define S b
 # define UXT   uxtb
+# define B 0x00000000
 #elif SIZE == 2
 # define S h
 # define UXT   uxth
+# define B 0x40000000
 #elif SIZE == 4 || SIZE == 8 || SIZE == 16
 # define S
 # define UXT   mov
+# if SIZE == 4
+#  define B    0x80000000
+# elif SIZE == 8
+#  define B    0xc0000000
+# endif
 #else
 # error
 #endif
@@ -72,18 +85,26 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  
If not, see
 # define SUFF  _relax
 # define A
 # define L
+# define M 0x000000
+# define N 0x000000
 #elif MODEL == 2
 # define SUFF  _acq
 # define A a
 

[PATCH][GCC-8][Aarch64]: Backport Force TImode values into even registers

2020-04-29 Thread Andre Vieira (lists)

Hi,

This is a backport from trunk/gcc-9 that I think we need now that we 
have backported the casp LSE instructions.


Bootstrapped and regression tested on aarch64.

Is this OK for gcc-8?

Cheers,
Andre

The LSE CASP instruction requires values to be placed in even
register pairs.  A solution involving two additional register
classes was rejected in favor of the much simpler solution of
simply requiring all TImode values to be aligned.
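
For context, this is the kind of source that ends up in a CASP. A sketch,
assuming LSE is available (inline via -march=armv8-a+lse or through the
outline helpers):

/* Sketch: a 16-byte compare-and-swap; with LSE this maps to CASP,
   whose compare and new-value operands must each sit in an even/odd
   register pair, hence the requirement that TImode values be
   even-register aligned.  */
#include <stdbool.h>

bool
cas16 (__int128 *p, __int128 *expected, __int128 desired)
{
  return __atomic_compare_exchange_n (p, expected, desired, false,
				      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}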

gcc/ChangeLog:
2020-04-29  Andre Vieira  

    Backport from mainline.
    2018-10-31  Richard Henderson 

    * config/aarch64/aarch64.c (aarch64_hard_regno_mode_ok): Force
    16-byte modes held in GP registers to use an even regno.

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
5eec1aae54abe04b8320deaf8202621c8e193c01..525deba56ea363a621cccec1a923da241908dd06
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1369,10 +1369,14 @@ aarch64_hard_regno_mode_ok (unsigned regno, 
machine_mode mode)
   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
 return mode == Pmode;
 
-  if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
-return true;
-
-  if (FP_REGNUM_P (regno))
+  if (GP_REGNUM_P (regno))
+{
+  if (known_le (GET_MODE_SIZE (mode), 8))
+   return true;
+  else if (known_le (GET_MODE_SIZE (mode), 16))
+   return (regno & 1) == 0;
+}
+  else if (FP_REGNUM_P (regno))
 {
   if (vec_flags & VEC_STRUCT)
return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;


[PATCH, GCC, Arm]: Fix no_cond issue introduced by MVE

2020-03-19 Thread Andre Vieira (lists)

Hi,

This was a matter of mistaken logic in (define_attr "conds" ..), which 
was setting the conds attribute for any neon instruction to no_cond and 
messing up code generation.


Bootstrapped and regression tested on arm-linux-gnueabihf.

Is this OK for trunk?

2020-03-19  Andre Vieira  

    * config/arm/arm.md (define_attr "conds"): Fix logic for neon and mve.

On 17/03/2020 11:55, Christophe Lyon via Gcc-patches wrote:

On Mon, 16 Mar 2020 at 18:41, Kyrylo Tkachov  wrote:

Hi Srinath,

I've pushed the first three of the patches for you to master:
[ARM][GCC][3/x]: MVE ACLE intrinsics framework patch.
[ARM][GCC][2/x]: MVE ACLE intrinsics framework patch.
[ARM][GCC][1/x]: MVE ACLE intrinsics framework patch.

For this first one I adjusted the add_options directives for mve to actually 
add armv8.1-m.main+mve or armv8.1-m.main+mve.fp to the march line so that the 
tests don't appear as UNSUPPORTED when running the testsuite without any 
explicit options specified.
Please keep an eye out for any fallout in the coming days (though I've smoked 
tested the patches myself before committing)


Hi,

I've noticed regressions on arm-none-linux-gnueabihf
--with-mode arm
--with-cpu cortex-a9
--with-fpu neon-fp16
(--with-mode arm is important, or force -marm when running the tests)
FAIL: gcc.target/arm/neon-cond-1.c execution test
FAIL: gfortran.dg/array_constructor_7.f90   -O3 -g  execution test
FAIL: gfortran.dg/complex_intrinsic_5.f90   -O0  execution test
FAIL: gfortran.dg/complex_intrinsic_5.f90   -O1  execution test
FAIL: gfortran.dg/complex_intrinsic_5.f90   -O2  execution test
FAIL: gfortran.dg/complex_intrinsic_5.f90   -O3 -fomit-frame-pointer
-funroll-loops -fpeel-loops -ftracer -finline-functions  execution
test
FAIL: gfortran.dg/complex_intrinsic_5.f90   -O3 -g  execution test
FAIL: gfortran.dg/complex_intrinsic_5.f90   -Os  execution test

Christophe


Thanks,
Kyrill

-Original Message-
From: Gcc-patches  On Behalf Of Srinath 
Parvathaneni
Sent: 16 March 2020 10:53
To: Kyrill Tkachov ; gcc-patches@gcc.gnu.org
Subject: Re: [PATCH v3][ARM][GCC][1/x]: MVE ACLE intrinsics framework patch.

Hi Kyrill,


This is ok but please bootstrap it on arm-none-linux-gnueabihf as well.

I have bootstrapped this patch on arm-none-linux-gnueabihf and found no issues.
There is problem with git commit rights, could you commit this patch on my 
behalf.

Regards
SRI

From: Kyrill Tkachov 
Sent: 12 March 2020 11:15
To: Srinath Parvathaneni ; gcc-patches@gcc.gnu.org 

Subject: Re: [PATCH v3][ARM][GCC][1/x]: MVE ACLE intrinsics framework patch.

Hi Srinath,

On 3/10/20 6:19 PM, Srinath Parvathaneni wrote:

Hello Kyrill,

This patch addresses all the comments in patch version v2.
(version v2)
https://gcc.gnu.org/pipermail/gcc-patches/2020-February/540415.html



Hello,

This patch creates the required framework for MVE ACLE intrinsics.

The following changes are done in this patch to support MVE ACLE
intrinsics.

Header file arm_mve.h is added to source code, which contains the
definitions of MVE ACLE intrinsics and different data types used in
MVE. Machine description file mve.md is also added which contains the
RTL patterns defined for MVE.

A new register "p0" is added, which is used by MVE predicated
patterns. A new register class "VPR_REG"
is added and its contents are defined in REG_CLASS_CONTENTS.
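
(To make the role of p0 concrete, a hedged usage sketch, not part of the
patch; vaddq_m_s32 and mve_pred16_t are ACLE names, and the build assumes
-march=armv8.1-m.main+mve -mfloat-abi=softfp.)

/* Sketch: the predicate argument of a *_m intrinsic is what lives in
   the new p0 register (class VPR_REG); inactive lanes keep the value
   of the first, "inactive", argument.  */
#include <arm_mve.h>

int32x4_t
cond_add (int32x4_t a, int32x4_t b, mve_pred16_t p)
{
  return vaddq_m_s32 (a, a, b, p);
}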

The vec-common.md file is modified to support the standard move
patterns. The prefix of neon functions which are also used by MVE is
changed from "neon_" to "simd_".
eg: neon_immediate_valid_for_move changed to
simd_immediate_valid_for_move.

In the patch standard patterns mve_move, mve_store and move_load for
MVE are added and neon.md and vfp.md files are modified to support
this common patterns.

Please refer to Arm reference manual [1] for more details.

[1] https://developer.arm.com/docs/ddi0553/latest

Regression tested on target arm-none-eabi and armeb-none-eabi and
found no regressions.

Ok for trunk?


This is ok but please bootstrap it on arm-none-linux-gnueabihf as well.

Thanks,

Kyrill



Thanks,
Srinath

gcc/ChangeLog:

2020-03-06  Andre Vieira 
 Mihail Ionescu  
 Srinath Parvathaneni 

 * config.gcc (arm_mve.h): Include mve intrinsics header file.
 * config/arm/aout.h (p0): Add new register name for MVE predicated
 cases.
 * config/arm-builtins.c (ARM_BUILTIN_SIMD_LANE_CHECK): Define
macro
 common to Neon and MVE.
 (ARM_BUILTIN_NEON_LANE_CHECK): Renamed to
ARM_BUILTIN_SIMD_LANE_CHECK.
 (arm_init_simd_builtin_types): Disable poly types for MVE.
 (arm_init_neon_builtins): Move a check to arm_init_builtins
function.
 (arm_init_builtins): Use ARM_BUILTIN_SIMD_LANE_CHECK instead of
 ARM_BUILTIN_NEON_LANE_CHECK.
 (mve_dereference_pointer): Add function.
 (arm_expand_builtin_args): Call to mve_dereference_pointer
when MVE is
 enabled.
 

[PATCH 2/2][GCC][Arm]: Fix testisms for MVE testsuite

2020-03-20 Thread Andre Vieira (lists)

Hi,

This patch fixes some testisms where -mfpu=auto was missing or where we 
could end up with -mfloat-abi=hard and soft on the same command line.


Tested on arm-none-eabi.

Is this OK for trunk?

gcc/testsuite/ChangeLog:
2020-03-20  Andre Vieira  

    * gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c: Fix testisms.
    * gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_fpu1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_fpu2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_fpu3.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_libcall1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_libcall2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_float.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_float1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_float2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_int2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_uint.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_uint1.c: Likewise.
    * gcc.target/arm/mve/intrinsics/mve_vector_uint2.c: Likewise.
    * gcc.target/arm/mve/intrinsics/vshrntq_m_n_u32.c
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
index 
17ba616c041378b88463cb7ef150b70b2e7b95ad..d552fbdffee2ac57092e20ce6a02ab3a4b4bd367
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu1.c
@@ -1,6 +1,7 @@
 /* { dg-do compile  } */
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=hard 
-mthumb" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
+/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=hard 
-mthumb -mfpu=auto" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
index 
7b877c4a90c506343d6b4edb750ba06ce3d7a68d..e40b82ec161ac4155c3e38fad1438c296db76486
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fp_fpu2.c
@@ -1,6 +1,6 @@
 /* { dg-do compile  } */
 /* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=softfp 
-mthumb" } */
+/* { dg-additional-options "-march=armv8.1-m.main+mve.fp -mfloat-abi=softfp 
-mthumb -mfpu=auto" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c
index 
85fbb5767edc3c25ceb4d6da780d47afa1ee416c..e04cb61043859b2c5484f97d1450ef97076d6a2d
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu1.c
@@ -1,6 +1,7 @@
 /* { dg-do compile  } */
 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=hard 
-mthumb" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
+/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=hard -mthumb 
-mfpu=auto" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c
index 
23b3683ae559b3f7bf6c3ad11c4070ad2ddb9387..f52c3627551750cbaaf54f0ac94020db7a92c6ca
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu2.c
@@ -1,6 +1,7 @@
 /* { dg-do compile  } */
 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=softfp 
-mthumb" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=soft" } {""} 
} */
+/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=softfp 
-mthumb -mfpu=auto" } */
 
 #include "arm_mve.h"
 
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu3.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu3.c
index 
8f7fa348d130e8456d5300ac25821fd96f9d5a97..1f249ca885faeea58951a702d090d49af525eeed
 100644
--- a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu3.c
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_fpu3.c
@@ -1,6 +1,7 @@
 /* { dg-do compile  } */
 /* { dg-require-effective-target arm_v8_1m_mve_ok } */
-/* { dg-additional-options "-march=armv8.1-m.main+mve -mfloat-abi=soft 
-mthumb" } */
+/* { dg-skip-if "Incompatible float ABI" { *-*-* } { "-mfloat-abi=hard" } { "" 
} } */
+/* { dg-additional-options "-march=armv8.1-m.main+mve 

[PATCH 1/2][GCC][Arm]: Fix MVE move from GPR -> GPR

2020-03-20 Thread Andre Vieira (lists)

Hi,

This patch fixes the pattern mve_mov for the case where both MVE vectors 
are in R registers and the move does not get optimized away.  I use the 
same approach as we do for NEON, where we use four register moves.


Bootstrapped on arm-linux-gnueabihf and ran mve testsuite on arm-none-eabi.

Is this OK for trunk?

gcc/ChangeLog:
2020-03-20  Andre Vieira  

    * config/arm/mve.md (mve_mov): Fix R->R case.

gcc/testsuite/ChangeLog:
2020-03-**  Andre Vieira  

    * gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c: New test.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 
3cdb2e71cf04d45d220f6667646d226c8015659a..3015df7a6af0ab50e0ae47894f63597ada8566c5
 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -295,7 +295,7 @@ (define_insn "*mve_mov"
   else
return "vldrb.8 %q0, %E1";
 case 5:
-  return output_move_neon (operands);
+  return output_move_quad (operands);
 case 7:
   return "vstrb.8 %q1, %E0";
 default:
@@ -303,7 +303,7 @@ (define_insn "*mve_mov"
   return "";
 }
 }
-  [(set_attr "type" 
"mve_move,mve_move,mve_move,mve_move,mve_load,mve_move,mve_move,mve_store")
+  [(set_attr "type" 
"mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_move,mve_store")
(set_attr "length" "4,8,8,4,8,8,4,4")
(set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*")
(set_attr "neg_pool_range" "*,*,*,*,996,*,*,*")])
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c 
b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c
new file mode 100644
index 
..791b8529a052dfca42e12648b6967eca3a2a985e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/mve_move_gpr_to_gpr.c
@@ -0,0 +1,18 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2 -mfloat-abi=softfp" } */
+
+#include "arm_mve.h"
+
+extern int bar (float16x8_t, float16_t);
+
+extern void foobar (float16_t);
+
+int
+foo (float16x8_t a, float16_t b)
+{
+  foobar (b);
+  return bar (a, b);
+}
+


[PATCH 0/2][GCC][Arm]: MVE fixes for codegen and testsuite

2020-03-20 Thread Andre Vieira (lists)

Hi,

I noticed some issues with the patches that landed on trunk and this 
patch series fixes them.  The first issue was revealed after I fixed the 
testisms (in patch 2/2).

Andre Vieira (2):
Fix MVE move from GPR -> GPR
Fix testisms for MVE testsuite



[PATCH][GCC][Arm]: Revert changes to {get, set}_fpscr

2020-03-20 Thread Andre Vieira (lists)

Hi,

MVE made changes to {get,set}_fpscr to enable the compiler to optimize
away unnecessary gets and sets when using these for intrinsics that use
and/or write the carry bit.  However, these actually get and set the
full FPSCR register and are used by fp env intrinsics to modify the fp
context.  So MVE should not be using these.

This fixes regressions for gcc.dg/atomic/c11-atomic-exec-5.c
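
The failure mode is the classic fenv pattern. A sketch of the kind of code
affected, not the actual testcase:

/* Sketch: code like this relies on every FPSCR write taking effect;
   if set_fpscr/get_fpscr were modelled as optimizable accesses to a
   plain register, the rounding-mode switch below could be dropped or
   reordered.  */
#include <fenv.h>
#pragma STDC FENV_ACCESS ON

double
add_rounding_down (double x, double y)
{
  int old = fegetround ();
  fesetround (FE_DOWNWARD);
  double r = x + y;
  fesetround (old);
  return r;
}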

Bootstrapped and tested arm-linux-gnueabihf.

Is this OK for trunk?

gcc/ChangeLog:
2020-03-20  Andre Vieira  

    * config/arm/unspecs.md (UNSPEC_GET_FPSCR): Rename this to ...
    (VUNSPEC_GET_FPSCR): ... this, and move it to vunspec.
    * config/arm/vfp.md: (get_fpscr, set_fpscr): Revert to old 
patterns.


diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 
e76609f79418af38b70746336dd43592a1dc8713..f0b1f465de4b63d624510783576700519044717d
 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -170,7 +170,6 @@ (define_c_enum "unspec" [
   UNSPEC_TORC  ; Used by the intrinsic form of the iWMMXt TORC 
instruction.
   UNSPEC_TORVSC; Used by the intrinsic form of the iWMMXt 
TORVSC instruction.
   UNSPEC_TEXTRC; Used by the intrinsic form of the iWMMXt 
TEXTRC instruction.
-  UNSPEC_GET_FPSCR ; Represent fetch of FPSCR content.
 ])
 
 
@@ -217,6 +216,7 @@ (define_c_enum "unspecv" [
   VUNSPEC_SLX  ; Represent a store-register-release-exclusive.
   VUNSPEC_LDA  ; Represent a store-register-acquire.
   VUNSPEC_STL  ; Represent a store-register-release.
+  VUNSPEC_GET_FPSCR; Represent fetch of FPSCR content.
   VUNSPEC_SET_FPSCR; Represent assign of FPSCR content.
   VUNSPEC_PROBE_STACK_RANGE ; Represent stack range probing.
   VUNSPEC_CDP  ; Represent the coprocessor cdp instruction.
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 
eb6ae7bea7927c666f36219797d54c0127001bc1..dfb1031431af3ec87d9cccdee35db04e0adffe04
 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -2096,9 +2096,8 @@ (define_insn "3"
 
 ;; Write Floating-point Status and Control Register.
 (define_insn "set_fpscr"
-  [(set (reg:SI VFPCC_REGNUM)
-   (unspec_volatile:SI
-[(match_operand:SI 0 "register_operand" "r")] VUNSPEC_SET_FPSCR))]
+  [(unspec_volatile [(match_operand:SI 0 "register_operand" "r")]
+VUNSPEC_SET_FPSCR)]
   "TARGET_VFP_BASE"
   "mcr\\tp10, 7, %0, cr1, cr0, 0\\t @SET_FPSCR"
   [(set_attr "type" "mrs")])
@@ -2106,7 +2105,7 @@ (define_insn "set_fpscr"
 ;; Read Floating-point Status and Control Register.
 (define_insn "get_fpscr"
   [(set (match_operand:SI 0 "register_operand" "=r")
-   (unspec:SI [(reg:SI VFPCC_REGNUM)] UNSPEC_GET_FPSCR))]
+(unspec_volatile:SI [(const_int 0)] VUNSPEC_GET_FPSCR))]
   "TARGET_VFP_BASE"
   "mrc\\tp10, 7, %0, cr1, cr0, 0\\t @GET_FPSCR"
   [(set_attr "type" "mrs")])


[PATCH 1/2] arm: Add earlyclobber to MVE instructions that require it

2020-03-23 Thread Andre Vieira (lists)


Hi,

This patch adds an earlyclobber to the MVE instructions that require it 
and were missing it. These are vrev64 and the 32-bit element variants of 
vcadd, vhcadd, vcmul, vmull[bt] and vqdmull[bt].
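
At the source level the hazard looks like this. A sketch; vrev64q_u32 is
the real ACLE intrinsic, the rest is illustration:

/* Sketch: MVE VREV64 does not allow the destination Q register to
   equal the source, so without an earlyclobber ("=&w") the register
   allocator could give the result a's register and emit an encoding
   the assembler rejects.  */
#include <arm_mve.h>

uint32x4_t
rev_lanes (uint32x4_t a)
{
  return vrev64q_u32 (a);
}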


Regression tested on arm-none-eabi.

Is this OK for trunk?

Cheers,
Andre

2020-03-23  Andre Vieira  

    * config/arm/mve.md (earlyclobber_32): New mode attribute.
    (mve_vrev64q_*, mve_vcaddq*, mve_vhcaddq_*, mve_vcmulq_*,
 mve_vmull[bt]q_*, mve_vqdmull[bt]q_*): Add appropriate early 
clobbers.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 
2e28d9d8408127dd52b9d16c772e7f27a47d390a..0cd67962a2641a3be46fe67819e093c0a712751b
 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -411,6 +411,8 @@ (define_mode_attr MVE_B_ELEM [ (V16QI "V16QI") (V8HI 
"V8QI") (V4SI "V4QI")])
 (define_mode_attr MVE_H_ELEM [ (V8HI "V8HI") (V4SI "V4HI")])
 (define_mode_attr V_sz_elem1 [(V16QI "b") (V8HI  "h") (V4SI "w") (V8HF "h")
  (V4SF "w")])
+(define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w")
+   (V8HF "=w") (V4SF "=&w")])
 
 (define_int_iterator VCVTQ_TO_F [VCVTQ_TO_F_S VCVTQ_TO_F_U])
 (define_int_iterator VMVNQ_N [VMVNQ_N_U VMVNQ_N_S])
@@ -856,7 +858,7 @@ (define_insn "mve_vrndaq_f"
 ;;
 (define_insn "mve_vrev64q_f"
   [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_0 0 "s_register_operand" "=")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")]
 VREV64Q_F))
   ]
@@ -967,7 +969,7 @@ (define_insn "mve_vcvtq_to_f_"
 ;;
 (define_insn "mve_vrev64q_"
   [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_2 0 "s_register_operand" "=")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")]
 VREV64Q))
   ]
@@ -1541,7 +1543,7 @@ (define_insn "mve_vbrsrq_n_"
 ;;
 (define_insn "mve_vcaddq_rot270_"
   [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_2 0 "s_register_operand" "")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
   (match_operand:MVE_2 2 "s_register_operand" "w")]
 VCADDQ_ROT270))
@@ -1556,7 +1558,7 @@ (define_insn "mve_vcaddq_rot270_"
 ;;
 (define_insn "mve_vcaddq_rot90_"
   [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_2 0 "s_register_operand" "")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
   (match_operand:MVE_2 2 "s_register_operand" "w")]
 VCADDQ_ROT90))
@@ -1841,7 +1843,7 @@ (define_insn "mve_vhaddq_"
 ;;
 (define_insn "mve_vhcaddq_rot270_s"
   [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_2 0 "s_register_operand" "")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
   (match_operand:MVE_2 2 "s_register_operand" "w")]
 VHCADDQ_ROT270_S))
@@ -1856,7 +1858,7 @@ (define_insn "mve_vhcaddq_rot270_s"
 ;;
 (define_insn "mve_vhcaddq_rot90_s"
   [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_2 0 "s_register_operand" "")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
   (match_operand:MVE_2 2 "s_register_operand" "w")]
 VHCADDQ_ROT90_S))
@@ -2096,7 +2098,7 @@ (define_insn "mve_vmulhq_"
 ;;
 (define_insn "mve_vmullbq_int_"
   [
-   (set (match_operand: 0 "s_register_operand" "=w")
+   (set (match_operand: 0 "s_register_operand" 
"")
(unspec: [(match_operand:MVE_2 1 "s_register_operand" 
"w")
  (match_operand:MVE_2 2 "s_register_operand" 
"w")]
 VMULLBQ_INT))
@@ -2111,7 +2113,7 @@ (define_insn "mve_vmullbq_int_"
 ;;
 (define_insn "mve_vmulltq_int_"
   [
-   (set (match_operand: 0 "s_register_operand" "=w")
+   (set (match_operand: 0 "s_register_operand" 
"")
(unspec: [(match_operand:MVE_2 1 "s_register_operand" 
"w")
  (match_operand:MVE_2 2 "s_register_operand" 
"w")]
 VMULLTQ_INT))
@@ -2621,7 +2623,7 @@ (define_insn "mve_vbicq_n_"
 ;;
 (define_insn "mve_vcaddq_rot270_f"
   [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_0 0 "s_register_operand" "")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
   (match_operand:MVE_0 2 "s_register_operand" "w")]
 VCADDQ_ROT270_F))
@@ -2636,7 +2638,7 @@ (define_insn "mve_vcaddq_rot270_f"
 ;;
 (define_insn "mve_vcaddq_rot90_f"
   [
-   (set (match_operand:MVE_0 0 "s_register_operand" "=w")
+   (set (match_operand:MVE_0 0 "s_register_operand" "")
(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
   (match_operand:MVE_0 2 "s_register_operand" "w")]
 VCADDQ_ROT90_F))
@@ -2831,7 +2833,7 @@ (define_insn "mve_vcmpneq_n_f"
 ;;

[PATCH 0/2] arm: Enable assembling when testing MVE

2020-03-23 Thread Andre Vieira (lists)

Hi,

This patch series changes all MVE tests into assembly tests so we check whether 
the generated assembly is syntactically correct.  The first patch of the series 
fixes an issue this caught where the instructions don't allow destination and 
source registers to be the same.

Andre Vieira (2):
arm: Add earlyclobber to MVE instructions that require it
testsuite, arm: Change tests to assemble



[AArch64][GCC-8][GCC-9] Use __getauxval instead of getauxval in LSE detection code in libgcc

2020-05-28 Thread Andre Vieira (lists)

The patch applies cleanly on gcc-9 and gcc-8.
I bootstrapped this on aarch64-none-linux-gnu and tested 
aarch64-none-elf for both.


Is this OK for those backports?

libgcc/ChangeLog:
2020-05-28  Andre Vieira  

    Backport from mainline.
    2020-05-06  Kyrylo Tkachov  

    * config/aarch64/lse-init.c (init_have_lse_atomics): Use __getauxval
    instead of getauxval.
    (AT_HWCAP): Define.
    (HWCAP_ATOMICS): Define.
    Guard detection on __gnu_linux__.

On 06/05/2020 16:24, Kyrylo Tkachov wrote:



-Original Message-
From: Joseph Myers 
Sent: 06 May 2020 15:46
To: Richard Biener 
Cc: Kyrylo Tkachov ; Florian Weimer
; Szabolcs Nagy ; gcc-
patc...@gcc.gnu.org; Jakub Jelinek 
Subject: Re: [PATCH][AArch64] Use __getauxval instead of getauxval in LSE
detection code in libgcc

On Wed, 6 May 2020, Richard Biener wrote:


Here is the updated patch for the record.
Jakub, richi, is this ok for the GCC 10 branch?

I'll defer to Joseph who is release manager as well.

This version is OK with me.

Thank you Joseph,
I've committed this version to trunk and the gcc-10 branch.
Kyrill


--
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-30 Thread Andre Vieira (lists)



On 29/06/2020 11:15, Christophe Lyon wrote:

On Mon, 29 Jun 2020 at 10:56, Andre Vieira (lists)
 wrote:


On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-Original Message-
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while compiling
for size (-Os) with a thumb-1 target the generated code will clear the
registers r7-r10. These however are callee saved and should be preserved
across ABI boundaries. This happens because these
registers are made "fixed" when optimising for size with Thumb-1 in a
way to make sure they are not used, as pushing and popping hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, so it
is rather a misnomer; it works in our advantage here though as it does
exactly what we need.
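
For reference, the affected shape of code is any entry function built with
-mcmse -Os for a Thumb-1 target. A sketch, not the new testcase:

/* Sketch: on return from a cmse_nonsecure_entry function the compiler
   clears registers so no secure state leaks to the caller; r7-r10 are
   callee-saved, though, and must be preserved rather than cleared.  */
int __attribute__ ((cmse_nonsecure_entry))
entry_fn (int x)
{
  return x + 1;
}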

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill

As I was getting ready to push this I noticed I didn't add any skip-ifs
to prevent this failing with specific target options. So here's a new
version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built with
non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with selected FPU
cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os

Resending as I don't think my earlier one made it to the lists (sorry if
you are receiving this double!)

I'm not following this, before I go off and try to reproduce it, what do
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os'?
These are the options you are seeing in the log file? Surely they should
override the default options? Only thing I can think of is this might
need an extra -mfloat-abi=soft to make sure it overrides the default
float-abi.  Could you give that a try?

No it doesn't make a difference alone.

I also had to add:
-mfpu=auto (that clears the above warning)
-mthumb otherwise we now get cc1: error: target CPU does not support ARM mode

Looks like some effective-target machinery is needed
So I had a look at this. I was pretty sure that -mfloat-abi=soft 
overrode -mfpu=<>, which by and large it does, in that no FP instructions 
will be generated; but the error you see only checks for the right number 
of FP registers, without checking whether 'TARGET_HARD_FLOAT' is set. 
I'll fix this too and use the check-effective-target for armv8-m.base 
for this test, as it is indeed a better approach than my bag of 
skip-ifs. I'm testing it locally to make sure my changes don't break 
anything.


Cheers,
Andre


Christophe



Cheers,
Andre

Christophe


Cheers,
Andre

Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

PR target/95646
* config/arm/arm.c (cmse_nonsecure_entry_clear_before_return):
Use 'callee_saved_reg_p' instead of
'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

PR target/95646
* gcc.target/arm/pr95646.c: New test.


Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-07-06 Thread Andre Vieira (lists)


On 30/06/2020 14:50, Andre Vieira (lists) wrote:


On 29/06/2020 11:15, Christophe Lyon wrote:

On Mon, 29 Jun 2020 at 10:56, Andre Vieira (lists)
 wrote:


On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-----Original Message-----
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber 
callee saved

registers with CMSE

Hi,

As reported in bugzilla, when the -mcmse option is used while compiling
for size (-Os) with a Thumb-1 target, the generated code will clear the
registers r7-r10. These however are callee-saved and should be preserved
across ABI boundaries. This happens because these registers are made
"fixed" when optimising for size with Thumb-1, to make sure they are not
used, as pushing and popping hi-registers requires extra moves to and
from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call-used
registers into account, which aren't callee-saved in my opinion, so it
is rather a misnomer; that works to our advantage here though, as it does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill
As I was getting ready to push this I noticed I didn't add any 
skip-ifs

to prevent this failing with specific target options. So here's a new
version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built with
non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with selected FPU
cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os
Resending as I don't think my earlier one made it to the lists (sorry if
you are receiving this double!)

I'm not following this; before I go off and try to reproduce it, what do
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os'?
Are these the options you are seeing in the log file? Surely they should
override the default options? The only thing I can think of is that this
might need an extra -mfloat-abi=soft to make sure it overrides the
default float-abi.  Could you give that a try?

No it doesn't make a difference alone.

I also had to add:
-mfpu=auto (that clears the above warning)
-mthumb otherwise we now get cc1: error: target CPU does not support 
ARM mode


Looks like some effective-target machinery is needed.

So I had a look at this. I was pretty sure that -mfloat-abi=soft
overrode -mfpu=<>, which by and large it does, in that no FP
instructions will be generated; but the error you see only checks for
the right number of FP registers, without checking whether
'TARGET_HARD_FLOAT' is set. I'll fix this too and use the
check-effective-target for armv8-m.base for this test as it is indeed
a better approach than my bag of skip-ifs. I'm testing it locally to
make sure my changes don't break anything.


Cheers,
Andre

Hi,

Sorry for the delay. So I changed the test to use the effective-target 
machinery as you suggested and I also made sure that you don't get the 
"ARMv8-M Security Extensions incompatible with selected FPU" when 
-mfloat-abi=soft.

Further changed 'asm' to '__asm__' to avoid failures with '-std=' options.

Regression tested on arm-none-eabi.
@Christophe: could you test this for your configuration, shouldn't fail 
anymore!


Is this OK for trunk?

Cheers,
Andre

gcc/ChangeLog:
2020-07-06  Andre Vieira  

    * config/arm/arm.c (arm_options_perform_arch_sanity_checks): Do not
    check +D32 for CMSE if -mfloat-abi=soft.

gcc/testsuite/ChangeLog:
2020-07-06  Andre Vieira  

    * gcc.target/arm/pr95646.c: Fix testism.


Christophe



Cheers,
Andre

Christophe


Cheers,
Andre

Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira 

    PR target/95646
    * config/arm/arm.c (cmse_nonsecure_entry_clear_before_return):
    Use 'callee_saved_reg_p' instead of 'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira 

    PR target/95646
    * gcc.target/arm/pr95646.c: New test.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
dac9a6fb5c41ce42cd7a278b417eab25239a043c..38500220bfb2a1ddbbff15eb552451701f7256d5
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -3834,7 +3834,7 @@ arm_options_perform_arch_sanity_checks (void)
 
   /* We don't clear D16-D31 VFP registers for cmse_nonsecure_call functions
  and ARMv8-

Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-23 Thread Andre Vieira (lists)

On 23/06/2020 13:10, Kyrylo Tkachov wrote:



-----Original Message-----
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while compiling
for size (-Os) with a thumb-1 target the generated code will clear the
registers r7-r10. These however are callee saved and should be preserved
across ABI boundaries. This happens because these
registers are made "fixed" when optimising for size with Thumb-1 in a
way to make sure they are not used, as pushing and popping hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, so it
is rather a misnomer; that works to our advantage here though as it does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill
As I was getting ready to push this I noticed I didn't add any skip-ifs 
to prevent this failing with specific target options. So here's a new 
version with those.


Still OK?

Cheers,
Andre



Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

      PR target/95646
      * config/arm/arm.c (cmse_nonsecure_entry_clear_before_return):
      Use 'callee_saved_reg_p' instead of 'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

      PR target/95646
      * gcc.target/arm/pr95646.c: New test.
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
6b7ca829f1c8cbe3d427da474b079882dc522e1a..dac9a6fb5c41ce42cd7a278b417eab25239a043c
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -26960,7 +26960,7 @@ cmse_nonsecure_entry_clear_before_return (void)
continue;
   if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM))
continue;
-  if (call_used_or_fixed_reg_p (regno)
+  if (!callee_saved_reg_p (regno)
  && (!IN_RANGE (regno, FIRST_VFP_REGNUM, LAST_VFP_REGNUM)
  || TARGET_HARD_FLOAT))
bitmap_set_bit (to_clear_bitmap, regno);
diff --git a/gcc/testsuite/gcc.target/arm/pr95646.c 
b/gcc/testsuite/gcc.target/arm/pr95646.c
new file mode 100644
index 
..12d06a0c8c1ed7de1f8d4d15130432259e613a32
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr95646.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-march=*" } 
{ "-march=armv8-m.base" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-mcpu=*" } { 
"-mcpu=cortex-m23" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-mfpu=*" } { 
} } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { 
"-mfloat-abi=*" } { "-mfloat-abi=soft" } } */
+/* { dg-options "-mcpu=cortex-m23 -mcmse" } */
+/* { dg-additional-options "-Os" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+int __attribute__ ((cmse_nonsecure_entry))
+foo (void)
+{
+  return 1;
+}
+/* { dg-final { scan-assembler-not "mov\tr9, r0" } } */
+
+/*
+** __acle_se_bar:
+** mov (r[0-3]), r9
+** push {\1}
+** ...
+** pop {(r[0-3])}
+** mov r9, \2
+** ...
+** bxns lr
+*/
+int __attribute__ ((cmse_nonsecure_entry))
+bar (void)
+{
+  asm ("": : : "r9");
+  return 1;
+}


Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-29 Thread Andre Vieira (lists)



On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-----Original Message-----
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while compiling
for size (-Os) with a thumb-1 target the generated code will clear the
registers r7-r10. These however are callee saved and should be preserved
across ABI boundaries. This happens because these
registers are made "fixed" when optimising for size with Thumb-1 in a
way to make sure they are not used, as pushing and popping hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of
'callee_saved_reg_p''s definition, as it does still take call used
registers into account, which aren't callee_saved in my opinion, so it
is rather a misnomer; that works to our advantage here though as it does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if
stable.)

Ok.
Thanks,
Kyrill

As I was getting ready to push this I noticed I didn't add any skip-ifs
to prevent this failing with specific target options. So here's a new
version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built with
non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with selected FPU
cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os
Resending as I don't think my earlier one made it to the lists (sorry if 
you are receiving this double!)


I'm not following this; before I go off and try to reproduce it, what do
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os'?
Are these the options you are seeing in the log file? Surely they should
override the default options? The only thing I can think of is that this
might need an extra -mfloat-abi=soft to make sure it overrides the
default float-abi.  Could you give that a try?


Cheers,
Andre


Christophe


Cheers,
Andre

Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

   PR target/95646
   * config/arm/arm.c (cmse_nonsecure_entry_clear_before_return):
   Use 'callee_saved_reg_p' instead of 'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

   PR target/95646
   * gcc.target/arm/pr95646.c: New test.


[PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-06-22 Thread Andre Vieira (lists)

Hi,

As reported in bugzilla when the -mcmse option is used while compiling 
for size (-Os) with a thumb-1 target the generated code will clear the 
registers r7-r10. These however are callee saved and should be preserved 
across ABI boundaries. This happens because these 
registers are made "fixed" when optimising for size with Thumb-1 in a 
way to make sure they are not used, as pushing and popping hi-registers 
requires extra moves to and from LO_REGS.


To fix this, this patch uses 'callee_saved_reg_p', which accounts for 
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be aware of 
'callee_saved_reg_p''s definition, as it does still take call used 
registers into account, which aren't callee_saved in my opinion, so it 
is rather a misnomer; that works to our advantage here though as it does 
exactly what we need.


Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous versions if 
stable.)


Cheers,
Andre

gcc/ChangeLog:
2020-06-22  Andre Vieira  

    PR target/95646
    * config/arm/arm.c (cmse_nonsecure_entry_clear_before_return):
    Use 'callee_saved_reg_p' instead of 'call_used_or_fixed_reg_p'.

gcc/testsuite/ChangeLog:
2020-06-22  Andre Vieira  

    PR target/95646
    * gcc.target/arm/pr95646.c: New test.

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
6b7ca829f1c8cbe3d427da474b079882dc522e1a..dac9a6fb5c41ce42cd7a278b417eab25239a043c
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -26960,7 +26960,7 @@ cmse_nonsecure_entry_clear_before_return (void)
continue;
   if (IN_RANGE (regno, IP_REGNUM, PC_REGNUM))
continue;
-  if (call_used_or_fixed_reg_p (regno)
+  if (!callee_saved_reg_p (regno)
  && (!IN_RANGE (regno, FIRST_VFP_REGNUM, LAST_VFP_REGNUM)
  || TARGET_HARD_FLOAT))
bitmap_set_bit (to_clear_bitmap, regno);
diff --git a/gcc/testsuite/gcc.target/arm/pr95646.c 
b/gcc/testsuite/gcc.target/arm/pr95646.c
new file mode 100644
index 
..c9fdc37618ccaddcdb597647c7076054af17789a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr95646.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-mcmse -Os -mcpu=cortex-m23" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+int __attribute__ ((cmse_nonsecure_entry))
+foo (void)
+{
+  return 1;
+}
+/* { dg-final { scan-assembler-not "mov\tr9, r0" } } */
+
+/*
+** __acle_se_bar:
+** mov (r[0-3]), r9
+** push {\1}
+** ...
+** pop {(r[0-3])}
+** mov r9, \2
+** ...
+** bxns lr
+*/
+int __attribute__ ((cmse_nonsecure_entry))
+bar (void)
+{
+  asm ("": : : "r9");
+  return 1;
+}


[RFC][vect] BB SLP reduction prototype

2020-06-09 Thread Andre Vieira (lists)

Hi,

So this is my rework of the code you sent me; I have not included the
'permute' code you included, as I can't figure out what it is meant to
be doing. Maybe something to look at later.


I have also included three tests that show it working for some simple 
cases and even a nested one.


Unfortunately it will not handle other simple cases where reassoc 
doesn't put the reduction in the form of:

sum0 = a + b;
sum1 = c + sum0;
...

For instance a testcase I have been looking at is:
unsigned int u32_single_abs_sum (unsigned int * a, unsigned int * b)
{
  unsigned int sub0 = a[0] - b[0];
  unsigned int sub1 = a[1] - b[1];
  unsigned int sub2 = a[2] - b[2];
  unsigned int sub3 = a[3] - b[3];
  unsigned int sum = sub0 + sub1;
  sum += sub2;
  sum += sub3;
  return sum;
}

Unfortunately, the code that reaches slp will look like:
  _1 = *a_10(D);
  _2 = *b_11(D);
  _3 = MEM[(unsigned int *)a_10(D) + 4B];
  _4 = MEM[(unsigned int *)b_11(D) + 4B];
  _5 = MEM[(unsigned int *)a_10(D) + 8B];
  _6 = MEM[(unsigned int *)b_11(D) + 8B];
  _7 = MEM[(unsigned int *)a_10(D) + 12B];
  _8 = MEM[(unsigned int *)b_11(D) + 12B];
  _28 = _1 - _2;
  _29 = _3 + _28;
  _30 = _29 - _4;
  _31 = _5 + _30;
  _32 = _31 - _6;
  _33 = _7 + _32;
  sum_18 = _33 - _8;
  return sum_18;

Which doesn't have the format expected as I described above... I am
wondering how to teach it to support this. Maybe starting with your
suggestion of making plus_expr and minus_expr have the same hash, so it
groups all these statements together, might be a start, but you'd still
need to 'rebalance' the tree somehow. I need to give this a bit more
thought, but I wanted to share what I have so far.
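
For illustration, here is a hand-reassociated variant of the same
testcase in the linear-chain form the prototype does recognise; this is
not what reassoc currently produces, it just shows the expected shape:

unsigned int
u32_single_abs_sum_chain (unsigned int * a, unsigned int * b)
{
  /* Each statement folds exactly one new leaf into the running sum.  */
  unsigned int sum = (a[0] - b[0]) + (a[1] - b[1]);
  sum += a[2] - b[2];
  sum += a[3] - b[3];
  return sum;
}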


The code is severely lacking in comments for now btw...

Cheers,
Andre

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-1.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-1.c
new file mode 100644
index 
..66b53ff9bb1e77414e7493c07ab87d46f4d33651
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-1.c
@@ -0,0 +1,66 @@
+/* { dg-require-effective-target vect_int } */
+#include 
+#include "tree-vect.h"
+extern int abs (int);
+
+#define ABS4(N)\
+  sum += abs (a[N]);   \
+  sum += abs (a[N+1]); \
+  sum += abs (a[N+2]); \
+  sum += abs (a[N+3]);
+
+#define ABS8(N)  \
+  ABS4(N)\
+  ABS4(N+4)
+
+#define ABS16(N)  \
+  ABS8(N)\
+  ABS8(N+8)
+
+__attribute__ ((noipa)) unsigned char
+u8_single_abs_sum (signed char * a)
+{
+  unsigned char sum = 0;
+  ABS16(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned short
+u16_single_abs_sum (signed short * a)
+{
+  unsigned short sum = 0;
+  ABS8(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned int
+u32_single_abs_sum (signed int * a)
+{
+  unsigned int sum = 0;
+  ABS4(0)
+  return sum;
+}
+
+signed char u8[16] = {0, 1, 2, 3, 4, 5, 6, -7, -8, -9, -10, -11, -12, -13,
+   -14, -15};
+signed short u16[8] = {0, 1, 2, 3, 4, -5, -6, -7};
+signed int u32[4] = {-10, -20, 30, 40};
+
+
+int main (void)
+{
+  check_vect ();
+
+  if (u8_single_abs_sum (&(u8[0])) != 120)
+abort ();
+
+  if (u16_single_abs_sum (&(u16[0])) != 28)
+abort ();
+
+  if (u32_single_abs_sum (&(u32[0])) != 100)
+abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "basic block vectorized" 3 "slp2" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-2.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-2.c
new file mode 100644
index 
..298a22cfef687f6634d61bf808a41942c3ce4a85
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-reduc-2.c
@@ -0,0 +1,82 @@
+/* { dg-require-effective-target vect_int } */
+#include 
+#include "tree-vect.h"
+extern int abs (int);
+
+#define ABS4(N)\
+  sum += abs (a[N]);   \
+  sum += abs (a[N+1]); \
+  sum += abs (a[N+2]); \
+  sum += abs (a[N+3]);
+
+#define ABS8(N)  \
+  ABS4(N)\
+  ABS4(N+4)
+
+#define ABS16(N)  \
+  ABS8(N)\
+  ABS8(N+8)
+
+__attribute__ ((noipa)) unsigned char
+u8_double_abs_sum (signed char * a)
+{
+  unsigned char sum = 0;
+  ABS16(0)
+  ABS16(16)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned short
+u16_double_abs_sum (signed short * a)
+{
+  unsigned short sum = 0;
+  ABS16(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned int
+u32_double_abs_sum (signed int * a)
+{
+  unsigned int sum = 0;
+  ABS8(0)
+  return sum;
+}
+
+__attribute__ ((noipa)) unsigned int
+u32_triple_abs_sum (signed int * a)
+{
+  unsigned int sum = 0;
+  ABS8(0)
+  ABS4(8)
+  return sum;
+}
+
+signed char u8[32] = {0, 1, 2, 3, 4, 5, 6, -7, -8, -9, -10, -11, -12, -13,
+ -14, -15, 0, 1, 2, 3, 4, 5, 6, -7, -8, -9, -10, -11, -12, 
-13,
+ -14, -30};
+
+signed short u16[16] = {0, 1, 2, 3, 4, -5, -6, -7, 10, 20, 30, 40, -10, -20,
+  -30, -40};
+signed int u32[16] = {-10, -20, 30, 40, 100, 200, -300, -500, -600, -700, 1000,
+2000};
+

Re: [RFC][vect] BB SLP reduction prototype

2020-06-09 Thread Andre Vieira (lists)
The 'you' here is Richi, which Richi is probably aware but maybe not the 
rest of the list :')


On 09/06/2020 15:29, Andre Vieira (lists) wrote:

Hi,

So this is my rework of the code you sent me; I have not included the
'permute' code you included, as I can't figure out what it is meant to
be doing. Maybe something to look at later.


I have also included three tests that show it working for some simple 
cases and even a nested one.


Unfortunately it will not handle other simple cases where reassoc 
doesn't put the reduction in the form of:

sum0 = a + b;
sum1 = c + sum0;
...

For instance a testcase I have been looking at is:
unsigned int u32_single_abs_sum (unsigned int * a, unsigned int * b)
{
  unsigned int sub0 = a[0] - b[0];
  unsigned int sub1 = a[1] - b[1];
  unsigned int sub2 = a[2] - b[2];
  unsigned int sub3 = a[3] - b[3];
  unsigned int sum = sub0 + sub1;
  sum += sub2;
  sum += sub3;
  return sum;
}

Unfortunately, the code that reaches slp will look like:
  _1 = *a_10(D);
  _2 = *b_11(D);
  _3 = MEM[(unsigned int *)a_10(D) + 4B];
  _4 = MEM[(unsigned int *)b_11(D) + 4B];
  _5 = MEM[(unsigned int *)a_10(D) + 8B];
  _6 = MEM[(unsigned int *)b_11(D) + 8B];
  _7 = MEM[(unsigned int *)a_10(D) + 12B];
  _8 = MEM[(unsigned int *)b_11(D) + 12B];
  _28 = _1 - _2;
  _29 = _3 + _28;
  _30 = _29 - _4;
  _31 = _5 + _30;
  _32 = _31 - _6;
  _33 = _7 + _32;
  sum_18 = _33 - _8;
  return sum_18;

Which doesn't have the format expected as I described above... I am 
wondering how to teach it to support this. Maybe starting with your 
suggestion of making plus_expr and minus_expr have the same hash, so 
it groups all these statements together might be a start, but you'd 
still need to 'rebalance' the tree somehow I need to give this a 
bit more thought but I wanted to share what I have so far.


The code is severely lacking in comments for now btw...

Cheers,
Andre



Re: [PATCH][GCC][Arm] PR target/95646: Do not clobber callee saved registers with CMSE

2020-07-20 Thread Andre Vieira (lists)



On 08/07/2020 09:04, Andre Simoes Dias Vieira wrote:


On 07/07/2020 13:43, Christophe Lyon wrote:

Hi,


On Mon, 6 Jul 2020 at 16:31, Andre Vieira (lists)
 wrote:


On 30/06/2020 14:50, Andre Vieira (lists) wrote:

On 29/06/2020 11:15, Christophe Lyon wrote:

On Mon, 29 Jun 2020 at 10:56, Andre Vieira (lists)
 wrote:

On 23/06/2020 21:52, Christophe Lyon wrote:

On Tue, 23 Jun 2020 at 15:28, Andre Vieira (lists)
 wrote:

On 23/06/2020 13:10, Kyrylo Tkachov wrote:

-----Original Message-----
From: Andre Vieira (lists) 
Sent: 22 June 2020 09:52
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov 
Subject: [PATCH][GCC][Arm] PR target/95646: Do not clobber
callee saved
registers with CMSE

Hi,

As reported in bugzilla when the -mcmse option is used while
compiling
for size (-Os) with a thumb-1 target the generated code will
clear the
registers r7-r10. These however are callee saved and should be
preserved
across ABI boundaries. This happens because these
registers are made "fixed" when optimising for size with Thumb-1
in a
way to make sure they are not used, as pushing and popping
hi-registers
requires extra moves to and from LO_REGS.

To fix this, this patch uses 'callee_saved_reg_p', which
accounts for
this optimisation, instead of 'call_used_or_fixed_reg_p'. Be
aware of
'callee_saved_reg_p''s definition, as it does still take call 
used

registers into account, which aren't callee_saved in my opinion,
so it
is rather a misnomer; that works to our advantage here though as it
does
exactly what we need.

Regression tested on arm-none-eabi.

Is this OK for trunk? (Will eventually backport to previous
versions if
stable.)

Ok.
Thanks,
Kyrill

As I was getting ready to push this I noticed I didn't add any
skip-ifs
to prevent this failing with specific target options. So here's 
a new

version with those.

Still OK?


Hi,

This is not sufficient to skip arm-linux-gnueabi* configs built 
with

non-default cpu/fpu.

For instance, with arm-linux-gnueabihf --with-cpu=cortex-a9
--with-fpu=neon-fp16 --with-float=hard
I see:
FAIL: gcc.target/arm/pr95646.c (test for excess errors)
Excess errors:
cc1: error: ARMv8-M Security Extensions incompatible with 
selected FPU

cc1: error: target CPU does not support ARM mode

and the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os

Resending as I don't think my earlier one made it to the lists
(sorry if
you are receiving this double!)

I'm not following this; before I go off and try to reproduce it, what do
you mean by 'the testcase is compiled with -mcpu=cortex-m23 -mcmse -Os'?
Are these the options you are seeing in the log file? Surely they should
override the default options? The only thing I can think of is that this
might need an extra -mfloat-abi=soft to make sure it overrides the
default float-abi.  Could you give that a try?

No it doesn't make a difference alone.

I also had to add:
-mfpu=auto (that clears the above warning)
-mthumb otherwise we now get cc1: error: target CPU does not support
ARM mode

Looks like some effective-target machinery is needed

So I had a look at this. I was pretty sure that -mfloat-abi=soft
overrode -mfpu=<>, which by and large it does, in that no FP
instructions will be generated; but the error you see only checks for
the right number of FP registers, without checking whether
'TARGET_HARD_FLOAT' is set. I'll fix this too and use the
check-effective-target for armv8-m.base for this test as it is indeed
a better approach than my bag of skip-ifs. I'm testing it locally to
make sure my changes don't break anything.

Cheers,
Andre

Hi,

Sorry for the delay. So I changed the test to use the effective-target
machinery as you suggested and I also made sure that you don't get the
"ARMv8-M Security Extensions incompatible with selected FPU" when
-mfloat-abi=soft.
Further changed 'asm' to '__asm__' to avoid failures with '-std=' 
options.


Regression tested on arm-none-eabi.
@Christophe: could you test this for your configuration, shouldn't fail
anymore!


Indeed with your patch I don't see any failure with pr95646.c

Note that it is still unsupported with arm-eabi when running the tests
with -mcpu=cortex-mXX
because the compiler complains that -mcpu=cortex-mXX conflicts with
-march=armv8-m.base,
thus the effective-target test fails.

BTW, is that warning useful/practical? Wouldn't it be more convenient
if the last -mcpu/-march
on the command line was the only one taken into account? (I had a
similar issue when
running tests (libstdc++) getting -march=armv8-m.main+fp from their
multilib environment
and forcing -mcpu=cortex-m33 because it also means '+dsp' and produces
a warning;
I had to use -mcpu=cortex-m33 -march=armv8-m.main+fp+dsp to
work around this)
Yeah I've been annoyed by that before, also in the context of testing 
multilibs.


I can see how it can be a useful warning though, if you 
are using these in build-systems and you accidentally introduce a new 
(incompatible) -mcpu/-march alo

Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-10-16 Thread Andre Vieira (lists)

Hey,

Just a minor update to the patch; I had missed the libgomp testsuite, so
had to make some adjustments there too.


gcc/ChangeLog:

* config/aarch64/aarch64.cc (lane_size): New function.
(aarch64_simd_clone_compute_vecsize_and_simdlen): Determine simdlen
according to the NDS rule and reject combinations of simdlen and
types that lead to vectors larger than 128 bits.


gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add aarch64 targets to vect_simd_clones.
* c-c++-common/gomp/declare-variant-14.c: Adapt test for aarch64.
* c-c++-common/gomp/pr60823-1.c: Likewise.
* c-c++-common/gomp/pr60823-2.c: Likewise.
* c-c++-common/gomp/pr60823-3.c: Likewise.
* g++.dg/gomp/attrs-10.C: Likewise.
* g++.dg/gomp/declare-simd-1.C: Likewise.
* g++.dg/gomp/declare-simd-3.C: Likewise.
* g++.dg/gomp/declare-simd-4.C: Likewise.
* g++.dg/gomp/declare-simd-7.C: Likewise.
* g++.dg/gomp/declare-simd-8.C: Likewise.
* g++.dg/gomp/pr88182.C: Likewise.
* gcc.dg/declare-simd.c: Likewise.
* gcc.dg/gomp/declare-simd-1.c: Likewise.
* gcc.dg/gomp/declare-simd-3.c: Likewise.
* gcc.dg/gomp/pr87887-1.c: Likewise.
* gcc.dg/gomp/pr87895-1.c: Likewise.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/pr99542.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.
* gcc.dg/vect/vect-simd-clone-1.c: Likewise.
* gcc.dg/vect/vect-simd-clone-2.c: Likewise.
* gcc.dg/vect/vect-simd-clone-4.c: Likewise.
* gcc.dg/vect/vect-simd-clone-5.c: Likewise.
* gcc.dg/vect/vect-simd-clone-8.c: Likewise.
* gfortran.dg/gomp/declare-simd-2.f90: Likewise.
* gfortran.dg/gomp/declare-simd-coarray-lib.f90: Likewise.
* gfortran.dg/gomp/declare-variant-14.f90: Likewise.
* gfortran.dg/gomp/pr79154-1.f90: Likewise.
* gfortran.dg/gomp/pr83977.f90: Likewise.

libgomp/testsuite/ChangeLog:

* libgomp.c/declare-variant-1.c: Adapt test for aarch64.
* libgomp.fortran/declare-simd-1.f90: Likewise.

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
9fbfc548a891f5d11940c6fd3c49a14bfbdec886..37507f091c2a6154fa944c3a9fad6a655ab5d5a1
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -27414,33 +27414,62 @@ supported_simd_type (tree t)
   return false;
 }
 
-/* Return true for types that currently are supported as SIMD return
-   or argument types.  */
+/* Determine the lane size for the clone argument/return type.  This follows
+   the LS(P) rule in the VFABIA64.  */
 
-static bool
-currently_supported_simd_type (tree t, tree b)
+static unsigned
+lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
 {
-  if (COMPLEX_FLOAT_TYPE_P (t))
-return false;
+  gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
 
-  if (TYPE_SIZE (t) != TYPE_SIZE (b))
-return false;
+  /* For non map-to-vector types that are pointers we use the element type it
+ points to.  */
+  if (POINTER_TYPE_P (type))
+switch (clone_arg_type)
+  {
+  default:
+   break;
+  case SIMD_CLONE_ARG_TYPE_UNIFORM:
+  case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
+  case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
+   type = TREE_TYPE (type);
+   break;
+  }
 
-  return supported_simd_type (t);
+  /* For types (or types pointers of non map-to-vector types point to) that are
+ integers or floating point, we use their size if they are 1, 2, 4 or 8.
+   */
+  if (INTEGRAL_TYPE_P (type)
+  || SCALAR_FLOAT_TYPE_P (type))
+  switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
+   {
+   default:
+ break;
+   case 1:
+   case 2:
+   case 4:
+   case 8:
+ return TYPE_PRECISION (type);
+   }
+  /* For any other we use the size of uintptr_t.  For map-to-vector types that
+ are pointers, using the size of uintptr_t is the same as using the size of
+ their type, seeing all pointers are the same size as uintptr_t.  */
+  return POINTER_SIZE;
 }
 
+
 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
 
 static int
 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
struct cgraph_simd_clone *clonei,
-   tree base_type, int num,
-   bool explicit_p)
+   tree base_type ATTRIBUTE_UNUSED,
+   int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int elt_bits, count;
+  unsigned int nds_elt_bits;
+  int count;
   unsigned HOST_WIDE_INT const_simdlen;
-  poly_uint64 vec_bits;
 
   if (!TARGET_SIMD)
 return 0;
@@ -27460,80 +27489,135 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
 }
 
   

Re: Check that passes do not forget to define profile

2023-10-17 Thread Andre Vieira (lists)

So OK to commit this?

This patch makes sure the profile_count information is initialized for
the new bb created in move_sese_region_to_fn.

gcc/ChangeLog:

* tree-cfg.cc (move_sese_region_to_fn): Initialize profile_count for
new basic block.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


On 04/10/2023 12:02, Jan Hubicka wrote:

Hi Honza,

My current patch set for AArch64 VLA omp codegen started failing on
gcc.dg/gomp/pr87898.c after this. I traced it back to
'move_sese_region_to_fn' in tree-cfg.cc not setting count for the bb
created.

I was able to 'fix' it locally by setting the count of the new bb to the
accumulation of e->count () of all the entry_edges (if initialized). I'm
however not even close to certain that's the right approach, attached patch
for illustration.

Kind regards,
Andre
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc



index 
ffab7518b1568b58e610e26feb9e3cab18ddb3c2..32fc47ae683164bf8fac477fbe6e2c998382e754
 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -8160,11 +8160,15 @@ move_sese_region_to_fn (struct function *dest_cfun, 
basic_block entry_bb,
bb = create_empty_bb (entry_pred[0]);
if (current_loops)
  add_bb_to_loop (bb, loop);
+  profile_count count = profile_count::zero ();
for (i = 0; i < num_entry_edges; i++)
  {
e = make_edge (entry_pred[i], bb, entry_flag[i]);
e->probability = entry_prob[i];
+  if (e->count ().initialized_p ())
+   count += e->count ();
  }
+  bb->count = count;


This looks generally right - if you create a BB you need to set its
count, and unless it has a self-loop, that is the sum of the counts of
incoming edges.

However the initialized_p check should be unnecessary: if one of entry
edges to BB is uninitialized, the + operation will make bb count
uninitialized too, which is OK.

Honza
  
for (i = 0; i < num_exit_edges; i++)

  {
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 
ffab7518b1568b58e610e26feb9e3cab18ddb3c2..ffeb20b717aead756844c5f48c2cc23f5e9f14a6
 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -8160,11 +8160,14 @@ move_sese_region_to_fn (struct function *dest_cfun, 
basic_block entry_bb,
   bb = create_empty_bb (entry_pred[0]);
   if (current_loops)
 add_bb_to_loop (bb, loop);
+  profile_count count = profile_count::zero ();
   for (i = 0; i < num_entry_edges; i++)
 {
   e = make_edge (entry_pred[i], bb, entry_flag[i]);
   e->probability = entry_prob[i];
+  count += e->count ();
 }
+  bb->count = count;
 
   for (i = 0; i < num_exit_edges; i++)
 {


[PATCH] vect: allow using inbranch simdclones for masked loops

2023-11-02 Thread Andre Vieira (lists)

Hi,

In a previous patch I did most of the work for this, but forgot to
change the check for the number of arguments matching between call and
simdclone.  This check should allow calls without a mask to be matched
against simdclones with mask arguments.  I also added tests to verify
this feature actually works.
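
A minimal sketch of the newly accepted case ('fn' is a made-up example):
the clone is only available in its inbranch (masked) form, but the call
site is unconditional, so the vectorizer can now use the masked clone
with an all-true mask:

/* Compile with -fopenmp-simd.  */
#pragma omp declare simd inbranch
int __attribute__ ((const)) fn (int x);

void
use_masked_clone (int * __restrict__ a, int * __restrict__ b, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = fn (b[i]);  /* Unmasked call, matched to the masked clone.  */
}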



For the simd-builtins tests I decided to remove the sin (double)
simdclone, which would now be used, because it was inbranch and we now
enable the use of inbranch clones for non-inbranch calls.  Given the
nature of the test, removing it made more sense, but that's not a
strong opinion; happy to change.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


OK for trunk?

PS: I'll be away for two weeks from tomorrow; it would be really nice if
this could go in for GCC 14, otherwise the previous work I did for this
won't have any actual visible effect :(



gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_simd_clone_call): Allow unmasked
calls to use masked simdclones.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-20.c: New file.
* gfortran.dg/simd-builtins-1.h: Adapt.
* gfortran.dg/simd-builtins-6.f90: Adapt.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c
new file mode 100644
index 
..9f51a68f3a0c8851af2cd26bd8235c771b851d7d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-20.c
@@ -0,0 +1,87 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-additional-options "-fopenmp-simd --param vect-epilogues-nomask=0" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+
+/* Test that simd inbranch clones work correctly.  */
+
+#ifndef TYPE
+#define TYPE int
+#endif
+
+/* A simple function that will be cloned.  */
+#pragma omp declare simd inbranch
+TYPE __attribute__((noinline))
+foo (TYPE a)
+{
+  return a + 1;
+}
+
+/* Check that "inbranch" clones are called correctly.  */
+
+void __attribute__((noipa))
+masked (TYPE * __restrict a, TYPE * __restrict b, int size)
+{
+  #pragma omp simd
+  for (int i = 0; i < size; i++)
+b[i] = foo(a[i]);
+}
+
+/* Check that "inbranch" works when there might be unrolling.  */
+
+void __attribute__((noipa))
+masked_fixed (TYPE * __restrict a, TYPE * __restrict b)
+{
+  #pragma omp simd
+  for (int i = 0; i < 128; i++)
+b[i] = foo(a[i]);
+}
+
+/* Validate the outputs.  */
+
+void
+check_masked (TYPE *b, int size)
+{
+  for (int i = 0; i < size; i++)
+if (b[i] != (TYPE)(i + 1))
+  {
+   __builtin_printf ("error at %d\n", i);
+   __builtin_exit (1);
+  }
+}
+
+int
+main ()
+{
+  TYPE a[1024];
+  TYPE b[1024];
+
+  for (int i = 0; i < 1024; i++)
+a[i] = i;
+
+  masked_fixed (a, b);
+  check_masked (b, 128);
+
+  /* Test various sizes to cover machines with different vectorization
+ factors.  */
+  for (int size = 8; size <= 1024; size *= 2)
+{
+  masked (a, b, size);
+  check_masked (b, size);
+}
+
+  /* Test sizes that might exercise the partial vector code-path.  */
+  for (int size = 8; size <= 1024; size *= 2)
+{
+  masked (a, b, size-4);
+  check_masked (b, size-4);
+}
+
+  return 0;
+}
+
+/* Ensure the in-branch simd clones are used on targets that support them. 
 */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { aarch64*-*-* } } } } */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { x86_64*-*-* } } } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gfortran.dg/simd-builtins-1.h 
b/gcc/testsuite/gfortran.dg/simd-builtins-1.h
index 
88d555cf41ad065ea525a63d7c05d15d3e5b54ed..08b73514a67d5791d35203530d039741946e9dcc
 100644
--- a/gcc/testsuite/gfortran.dg/simd-builtins-1.h
+++ b/gcc/testsuite/gfortran.dg/simd-builtins-1.h
@@ -1,4 +1,3 @@
-!GCC$ builtin (sin) attributes simd (inbranch)
 !GCC$ builtin (sinf) attributes simd (notinbranch)
 !GCC$ builtin (cosf) attributes simd
 !GCC$ builtin (cosf) attributes simd (notinbranch)
diff --git a/gcc/testsuite/gfortran.dg/simd-builtins-6.f90 
b/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
index 
60bcac78f3e0cc492930f3eb73cf97065312dc1c..2c68f9f1818a35674a0aef15793aa312a48199a8
 100644
--- a/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
+++ b/gcc/testsuite/gfortran.dg/simd-builtins-6.f90
@@ -2,7 +2,6 @@
 ! { dg-additional-options "-nostdinc -Ofast -fdump-tree-optimized" }
 ! { dg-additional-options "-msse2 -mno-avx" { target i?86-*-linux* 
x86_64-*-linux* } }
 
-!GCC$ builtin (sin) attributes simd (inbranch)
 !GCC$ builtin (sinf) attributes simd (notinbranch)
 !GCC$ builtin (cosf) attributes simd
 !GCC$ builtin (cosf) attributes simd (notinbranch)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 

Re: [PATCH] vect: allow using inbranch simdclones for masked loops

2023-11-03 Thread Andre Vieira (lists)




On 03/11/2023 07:31, Richard Biener wrote:



OK.

I do wonder about the gfortran testsuite adjustments though.

!GCC$ builtin (sin) attributes simd (inbranch)

   ! this should not be using simd clone
   y4 = sin(x8)

previously we wouldn't vectorize this as no notinbranch simd function
is available but now we do since we use the inbranch function for the
notinbranch call.  If that's desired then a better modification of
the test would be to expect vectorization, no?



I was in two minds about this. I interpreted the test to be about the
fact that sin is overloaded in Fortran, given the name of the program
'program test_overloaded_intrinsic', and thus I thought it was testing
that it calls sinf when a real(4) is passed and sin for a real(8), and
that simdclones aren't used for the wrong overload. That doesn't quite
explain why the pragma for sin(double) was added in the first place, as
it wouldn't have been necessary, but then again neither are the cos and
cosf ones.


Happy to put it back in and test that the 'masked' simdclone is used 
using some regexp too.


[RFC] vect: disable multiple calls of poly simdclones

2023-11-03 Thread Andre Vieira (lists)

Hi,

The current codegen code to support VFs that are multiples of a
simdclone's simdlen relies on BIT_FIELD_REF to create multiple input
vectors.  This does not work for simdclones with a non-constant simdlen,
so we should disable using such clones when the VF is a multiple of
their simdlen, until we change the codegen to support those.

Enabling SVE simdclone support will cause ICEs if the vectorizer decides
to use an SVE simdclone with a VF that is larger than the simdlen. I'll
be away for the next two weeks, so can't really discuss this further.
I initially tried to solve the problem, but the way
vectorizable_simd_clone_call is structured doesn't make it easy to
replace BIT_FIELD_REF with the poly-suitable solution of using
unpack_{hi,lo} right now. Unfortunately I only found this now, as I was
adding further tests for SVE :(


gcc/ChangeLog:

* tree-vect-stmts.cc (vectorizable_simd_clone_call): Reject simdclones
with non-constant simdlen when VF is not exactly the same.

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
5f262cae2aae784e3ef4fd07455b7aa742797b51..dc3e0716161838aef66cf37342499006673336d6
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4165,7 +4165,10 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
if (!constant_multiple_p (vf * group_size, n->simdclone->simdlen,
  &num_calls)
|| (!n->simdclone->inbranch && (masked_call_offset > 0))
-   || (nargs != simd_nargs))
+   || (nargs != simd_nargs)
+   /* Currently we do not support multiple calls of non-constant
+  simdlen as poly vectors can not be accessed by BIT_FIELD_REF.  */
+   || (!n->simdclone->simdlen.is_constant () && num_calls != 1))
  continue;
if (num_calls != 1)
  this_badness += exact_log2 (num_calls) * 4096;


Re: [PATCH6/8] omp: Reorder call for TARGET_SIMD_CLONE_ADJUST (was Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM)

2023-10-30 Thread Andre Vieira (lists)

Hi Richi,

Friendly ping on this. I'm going away for two weeks at the end of this
week, so I won't be here for the end of stage 1, but I'd still very much
like to get this done for GCC 14.


I don't know if you had a chance to look at this yet when you reviewed
the other patches, or if you maybe just missed it? A quick tl;dr: this
moves the TARGET_SIMD_CLONE_ADJUST call to after we've vectorized the
types in simdclones, to avoid having to add extra target hooks to
change the types.  This required some moving around of the code that
constructed the adjustments and the code that constructed the array for
the return value.


Kind regards,
Andre

On 18/10/2023 15:41, Andre Vieira (lists) wrote:
This patch moves the call to TARGET_SIMD_CLONE_ADJUST until after the 
arguments and return types have been transformed into vector types.  It 
also constructs the adjustments and retval modifications after this call, 
allowing targets to alter the types of the arguments and return of the 
clone prior to the modifications to the function definition.


Is this OK?

gcc/ChangeLog:

     * omp-simd-clone.cc (simd_clone_adjust_return_type): Hoist out
     code to create return array and don't return new type.
     (simd_clone_adjust_argument_types): Hoist out code that creates
     ipa_param_body_adjustments and don't return them.
     (simd_clone_adjust): Call TARGET_SIMD_CLONE_ADJUST after return
     and argument types have been vectorized, create adjustments and
     return array after the hook.
     (expand_simd_clones): Call TARGET_SIMD_CLONE_ADJUST after return
     and argument types have been vectorized.

On 04/10/2023 13:40, Andre Vieira (lists) wrote:



On 04/10/2023 11:41, Richard Biener wrote:

On Wed, 4 Oct 2023, Andre Vieira (lists) wrote:




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:

This patch adds a new target hook to enable us to adapt the types of
return and parameters of simd clones.  We use this in two ways, the
first one is to make sure we can create valid SVE types, including the
SVE type attribute, when creating a SVE simd clone, even when the
target options do not support SVE.  We are following the same behaviour
seen with x86 that creates simd clones according to the ABI rules when
no simdlen is provided, even if that simdlen is not supported by the
current target options.  Note that this doesn't mean the simd clone
will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the
time we call TARGET_SIMD_CLONE_ADJUST the types are still scalar, not
vector.  The same is true for the return type one.

Also the changes to the types need to be taken into consideration in
'adjustments' I think.


Nothing in the three existing implementations of TARGET_SIMD_CLONE_ADJUST
relies on this ordering I think; how about moving the hook invocation
after simd_clone_adjust_argument_types?



But that wouldn't change the 'ipa_param_body_adjustments' for when we 
have a function definition and we need to redo the body.

Richard.

PS: I hope the subject line survived; my email client is having a bit of
a wobble this morning... it's what you get for updating software :(


Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-09-28 Thread Andre Vieira (lists)

Hi,

On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote:

Hi Stam,





The arm parts look sensible but we'd need review for the df-core.h and 
df-core.cc changes.
Maybe Jeff can help or can recommend someone to take a look?
Thanks,
Kyrill



FWIW the changes LGTM; if we don't want these in df-core we can always
implement the extra utility locally. It's really just a helper function
to check whether df_bb_regno_first_def_find and df_bb_regno_last_def_find
yield the same result, meaning we only have a single definition.

Kind regards,
Andre


Re: [PATCH 6/8] vect: Add vector_mode paramater to simd_clone_usable

2023-09-28 Thread Andre Vieira (lists)




On 31/08/2023 07:39, Richard Biener wrote:

On Wed, Aug 30, 2023 at 5:02 PM Andre Vieira (lists)
 wrote:




On 30/08/2023 14:01, Richard Biener wrote:

On Wed, Aug 30, 2023 at 11:15 AM Andre Vieira (lists) via Gcc-patches
 wrote:


This patch adds a machine_mode parameter to the TARGET_SIMD_CLONE_USABLE
hook to enable rejecting SVE modes when the target architecture does not
support SVE.


How does the graph node of the SIMD clone lack this information?  That is, it
should have information on the types (and thus modes) for all formal arguments
and return values already, no?  At least the target would know how to
instantiate
it if it's not readily available at the point of use.



Yes it does, but those are the modes the simd clone itself uses; it does
not know what vector_mode we are currently vectorizing for. Which is
exactly why we need the vinfo's vector_mode to make sure the simd clone
and its types are compatible with the vector mode.

In practice, this is to make sure that SVE simd clones are only used in
loops being vectorized for SVE modes. Having said that... I just realized
that the simdlen check already takes care of that currently...

By simdlen check I mean the one that writes off simdclones that match:
  if (!constant_multiple_p (vf, n->simdclone->simdlen, &num_calls)

However, when using -msve-vector-bits this will become an issue, as the
VF will be constant and we will match NEON simdclones.  This requires
some further attention though given that we now also reject the use of
SVE simdclones when using -msve-vector-bits, and I'm not entirely sure
we should...


Hmm, but vectorizable_simdclone should check for compatible types here
and if they are compatible why should we reject them?  Are -msve-vector-bits
"SVE" modes different from "NEON" modes?  I suppose not, because otherwise
the type compatibility check would say incompatible.

Prior to transformation we do all checks on the original scalar values,
not the vector types. But I do believe you are right in that we don't
need to pass the vector_mode. The simdlen check should be enough, and if
the length is the same or a multiple of it, the rest of the code should
be able to deal with that and any conversions when dealing with things
like SVE types that require the attribute.


I'll update the patch series soon and after that I'll look at how this 
reacts to -msve-vector-bits in more detail.


Thanks,
Andre


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)




On 26/09/2023 17:48, Jakub Jelinek wrote:

On Tue, Sep 26, 2023 at 05:24:26PM +0100, Andre Vieira (lists) wrote:

@@ -5816,6 +5817,18 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl)
+   {
+ clobbers_memory = true;
+ break;
+   }
+ if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }


Should IFN_MASK_LOAD/STORE really go through this?  I thought those have
first argument address of the memory being conditionally loaded/stored, not
function address.


No it shouldn't, my bad...
Surprisingly, testing didn't catch it though; I'm guessing
gimple_call_addr_fndecl just returned null every time for those. I'll
clean it up.


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-27 Thread Andre Vieira (lists)



On 26/09/2023 17:37, Andrew Stubbs wrote:

I don't have authority to approve anything, but here's a review anyway.

Thanks for working on this.


Thank you for reviewing, and apologies for the mess of a patch; I may
have rushed it ;)


diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c

new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273

--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+


Do you need -fopenmp-simd for this?

Nope, I keep forgetting that you only need it for pragmas.

Dealt with the other comments too.

Any thoughts on changing gimple_call_internal_fn instead? My main
argument against is that IFN_MASK_CALL should not appear outside of
if-conversion and the vectorizer. On the other hand, we may inspect the
flags elsewhere in the vectorizer (now or in the future), and changing
gimple_call_internal_fn would avoid the need to handle the IFN
separately elsewhere.
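
For context, a sketch of the shape involved (illustrative only, modelled
on the vect-simd-clone-19.c test below): if-conversion rewrites a
guarded call into an IFN_MASK_CALL whose first argument is the function
address, which is why the flags have to be fetched via
gimple_call_arg (stmt, 0) rather than gimple_call_fndecl:

int __attribute__ ((__simd__, const)) fn (int);

void
guarded (int * __restrict__ a, int * __restrict__ b, int n)
{
  for (int i = 0; i < n; i++)
    if (b[i] > 0)
      a[i] = fn (b[i]);  /* If-converted to roughly
			    a[i] = .MASK_CALL (fn, b[i], mask);  */
}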


Kind Regards,
Andre

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
new file mode 100644
index 
..e7ed56ca75470464307d0d266dacfa0d8d6e43c1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,22 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+
+int __attribute__ ((__simd__, const)) fn (int);
+
+void test (int * __restrict__ a, int * __restrict__ b, int n)
+{
+  for (int i = 0; i < n; ++i)
+{
+  int a_;
+  if (b[i] > 0)
+a_ = fn (b[i]);
+  else
+a_ = b[i] + 5;
+  a[i] = a_;
+}
+}
+
+/* { dg-final { scan-tree-dump-not {loop contains function calls or data 
references} "vect" } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 
6d3b7c2290e4db9c1168a4c763facb481157c97c..689aaeed72282bb0da2a17e19fb923a06e8d62fa
 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "vr-values.h"
 #include "range-op.h"
 #include "tree-ssa-loop-ivopts.h"
+#include "calls.h"
 
 static struct datadep_stats
 {
@@ -5816,6 +5817,15 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ break;
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl
+ || (flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }
break;
  default:
clobbers_memory = true;
@@ -5852,7 +5862,7 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
 }
   else if (stmt_code == GIMPLE_CALL)
 {
-  unsigned i, n;
+  unsigned i = 0, n;
   tree ptr, type;
   unsigned int align;
 
@@ -5879,13 +5889,16 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
   ptr);
references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
+   gcc_fallthrough ();
  default:
break;
  }
 
   op0 = gimple_call_lhs (stmt);
   n = gimple_call_num_args (stmt);
-  for (i = 0; i < n; i++)
+  for (; i < n; i++)
{
  op1 = gimple_call_arg (stmt, i);
 


Re: [PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)




On 26/09/2023 21:26, Bernhard Reutner-Fischer wrote:

On 26 September 2023 18:46:11 CEST, Tobias Burnus  
wrote:

On 26.09.23 18:37, Andrew Stubbs wrote:

If the fall-through is deliberate please add a /* FALLTHROUGH */
comment (or whatever spelling disables the warning).


It's: gcc_fallthrough ();

Which gets converted to "__attribute__((fallthrough))"; it could also
expand to "[[fallthrough]]" but that's C++17 (and, also, a C23 feature
- albeit so far unimplemented in gcc).
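
For reference, a minimal sketch of what that expansion looks like at a
use site (plain C, nothing GCC-internal assumed beyond the attribute):

int
classify (int c)
{
  switch (c)
    {
    case 0:
      c++;
      __attribute__ ((fallthrough));  /* what gcc_fallthrough () expands to */
    case 1:
      return c + 1;
    default:
      return 0;
    }
}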


OT
IIRC we do parse comments for a number of spellings of the hint by the user 
that the fallthrough is deliberate:

https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html

See the numerous levels of -Wimplicit-fallthrough=n, the default being 3.

---8<---
-Wimplicit-fallthrough=3 case sensitively matches one of the following regular 
expressions:
-fallthrough
@fallthrough@
lint -fallthrough[ \t]*
[ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
[ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
---8<---

Just FWIW.
thanks,


I was surprised my bootstrap didn't catch this; I thought we generated
warnings in such cases, and bootstrap builds with -Werror, does it not?


Re: Check that passes do not forget to define profile

2023-10-03 Thread Andre Vieira (lists)

Hi Honza,

My current patch set for AArch64 VLA omp codegen started failing on 
gcc.dg/gomp/pr87898.c after this. I traced it back to 
'move_sese_region_to_fn' in tree-cfg.cc not setting count for the bb 
created.


I was able to 'fix' it locally by setting the count of the new bb to the 
accumulation of e->count () of all the entry_edges (if initialized). 
I'm however not even close to certain that's the right approach, 
attached patch for illustration.

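The idea was roughly the following (a sketch under the assumption that
the new bb's incoming edges are the entry edges; not the actual attached
patch):

  /* Initialize the count of the bb created by move_sese_region_to_fn
     from the sum of its incoming edge counts, when initialized.  */
  profile_count new_count = profile_count::zero ();
  bool initialized = true;
  edge e;
  edge_iterator ei;
  FOR_EACH_EDGE (e, ei, bb->preds)
    if (e->count ().initialized_p ())
      new_count += e->count ();
    else
      initialized = false;
  if (initialized)
    bb->count = new_count;
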

Kind regards,
Andre

On 24/08/2023 14:14, Jan Hubicka via Gcc-patches wrote:

Hi,
this patch extends the verifier to check that all probabilities and counts are
initialized if the profile is supposed to be present.  This is a bit complicated
by the possibility that we inline a !flag_guess_branch_probability function
into a function with profile defined, and in this case we need to stop
verification.  For this reason I added a flag to the cfg structure tracking this.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* cfg.h (struct control_flow_graph): New field full_profile.
* auto-profile.cc (afdo_annotate_cfg): Set full_profile to true.
* cfg.cc (init_flow): Set full_profile to false.
* graphite.cc (graphite_transform_loops): Set full_profile to false.
* lto-streamer-in.cc (input_cfg): Initialize full_profile flag.
* predict.cc (pass_profile::execute): Set full_profile to true.
* symtab-thunks.cc (expand_thunk): Set full_profile to true.
* tree-cfg.cc (gimple_verify_flow_info): Verify that profile is full
if full_profile is set.
* tree-inline.cc (initialize_cfun): Initialize full_profile.
(expand_call_inline): Combine full_profile.


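(The tree-cfg.cc hunk is cut off from the quote below; the new check in
gimple_verify_flow_info is roughly of the following shape, a sketch
rather than the committed hunk, with 'err' and the edge iteration
following the verifier's existing conventions:)

  if (cfun->cfg->full_profile)
    {
      if (!bb->count.initialized_p ())
	{
	  error ("count of bb %d is uninitialized", bb->index);
	  err = true;
	}
      FOR_EACH_EDGE (e, ei, bb->succs)
	if (!e->probability.initialized_p ())
	  {
	    error ("probability of edge %d->%d is uninitialized",
		   bb->index, e->dest->index);
	    err = true;
	  }
    }
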
diff --git a/gcc/auto-profile.cc b/gcc/auto-profile.cc
index e3af3555e75..ff3b763945c 100644
--- a/gcc/auto-profile.cc
+++ b/gcc/auto-profile.cc
@@ -1578,6 +1578,7 @@ afdo_annotate_cfg (const stmt_set _stmts)
  }
update_max_bb_count ();
profile_status_for_fn (cfun) = PROFILE_READ;
+  cfun->cfg->full_profile = true;
if (flag_value_profile_transformations)
  {
gimple_value_profile_transformations ();
diff --git a/gcc/cfg.cc b/gcc/cfg.cc
index 9eb9916f61a..b7865f14e7f 100644
--- a/gcc/cfg.cc
+++ b/gcc/cfg.cc
@@ -81,6 +81,7 @@ init_flow (struct function *the_fun)
  = ENTRY_BLOCK_PTR_FOR_FN (the_fun);
the_fun->cfg->edge_flags_allocated = EDGE_ALL_FLAGS;
the_fun->cfg->bb_flags_allocated = BB_ALL_FLAGS;
+  the_fun->cfg->full_profile = false;
  }
  
  /* Helper function for remove_edge and free_cfg.  Frees edge structure
diff --git a/gcc/cfg.h b/gcc/cfg.h
index a0e944979c8..53e2553012c 100644
--- a/gcc/cfg.h
+++ b/gcc/cfg.h
@@ -78,6 +78,9 @@ struct GTY(()) control_flow_graph {
/* Dynamically allocated edge/bb flags.  */
int edge_flags_allocated;
int bb_flags_allocated;
+
+  /* Set if the profile is computed on every edge and basic block.  */
+  bool full_profile;
  };
  
  
diff --git a/gcc/graphite.cc b/gcc/graphite.cc

index 19f8975ffa2..2b387d5b016 100644
--- a/gcc/graphite.cc
+++ b/gcc/graphite.cc
@@ -512,6 +512,8 @@ graphite_transform_loops (void)
  
if (changed)

  {
+  /* FIXME: Graphite does not update profile meaningfully currently.  */
+  cfun->cfg->full_profile = false;
cleanup_tree_cfg ();
profile_status_for_fn (cfun) = PROFILE_ABSENT;
release_recorded_exits (cfun);
diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc
index 0cce14414ca..d3128fcebe4 100644
--- a/gcc/lto-streamer-in.cc
+++ b/gcc/lto-streamer-in.cc
@@ -1030,6 +1030,7 @@ input_cfg (class lto_input_block *ib, class data_in 
*data_in,
basic_block p_bb;
unsigned int i;
int index;
+  bool full_profile = false;
  
init_empty_tree_cfg_for_function (fn);
  
@@ -1071,6 +1072,8 @@ input_cfg (class lto_input_block *ib, class data_in *data_in,

  data_in->location_cache.input_location_and_block (&e->goto_locus,
						     &bp, ib, data_in);
  e->probability = profile_probability::stream_in (ib);
+ if (!e->probability.initialized_p ())
+   full_profile = false;
  
  	}
  
@@ -1145,6 +1148,7 @@ input_cfg (class lto_input_block *ib, class data_in *data_in,
  
/* Rebuild the loop tree.  */

flow_loops_find (loops);
+  cfun->cfg->full_profile = full_profile;
  }
  
  
diff --git a/gcc/predict.cc b/gcc/predict.cc

index 5a1a561cc24..396746cbfd1 100644
--- a/gcc/predict.cc
+++ b/gcc/predict.cc
@@ -4131,6 +4131,7 @@ pass_profile::execute (function *fun)
  scev_initialize ();
  
tree_estimate_probability (false);

+  cfun->cfg->full_profile = true;
  
if (nb_loops > 1)

  scev_finalize ();
diff --git a/gcc/symtab-thunks.cc b/gcc/symtab-thunks.cc
index 4c04235c41b..23ead0d2138 100644
--- a/gcc/symtab-thunks.cc
+++ b/gcc/symtab-thunks.cc
@@ -648,6 +648,7 @@ expand_thunk (cgraph_node *node, bool output_asm_thunks,
  ? PROFILE_READ : PROFILE_GUESSED;
/* FIXME: C++ FE 

Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM

2023-10-04 Thread Andre Vieira (lists)




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


This patch adds a new target hook to enable us to adapt the types of return
and parameters of simd clones.  We use this in two ways, the first one is to
make sure we can create valid SVE types, including the SVE type attribute,
when creating a SVE simd clone, even when the target options do not support
SVE.  We are following the same behaviour seen with x86 that creates simd
clones according to the ABI rules when no simdlen is provided, even if that
simdlen is not supported by the current target options.  Note that this
doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the 
time we call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not 
vector.  The same is true for the return type one.


Also the changes to the types need to be taken into consideration in 
'adjustments' I think.


PS: I hope the subject line survived, my email client is having a bit of 
a wobble this morning... it's what you get for updating software :(


Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM

2023-10-04 Thread Andre Vieira (lists)




On 04/10/2023 11:41, Richard Biener wrote:

On Wed, 4 Oct 2023, Andre Vieira (lists) wrote:




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


This patch adds a new target hook to enable us to adapt the types of return
and parameters of simd clones.  We use this in two ways, the first one is
to
make sure we can create valid SVE types, including the SVE type attribute,
when creating a SVE simd clone, even when the target options do not support
SVE.  We are following the same behaviour seen with x86 that creates simd
clones according to the ABI rules when no simdlen is provided, even if that
simdlen is not supported by the current target options.  Note that this
doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the time we
call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not vector.  The
same is true for the return type one.

Also the changes to the types need to be taken into consideration in
'adjustments' I think.


Nothing in the three existing implementations of TARGET_SIMD_CLONE_ADJUST
relies on this ordering I think, how about moving the hook invocation
after simd_clone_adjust_argument_types?



But that wouldn't change the 'ipa_param_body_adjustments' for when we 
have a function definition and we need to redo the body.

Richard.


PS: I hope the subject line survived, my email client is having a bit of a
wobble this morning... it's what you get for updating software :(


[PATCH] vect, omp: inbranch simdclone dropping const

2023-09-26 Thread Andre Vieira (lists)
The const attribute is ignored when simd clones are used inbranch. This 
is due to the fact that when analyzing a MASK_CALL we were not looking 
at the targeted function for flags, but instead only at the internal 
function call itself.
This patch adds code to make sure we look at the target function to 
check for the const attribute, and enables the autovectorization of 
inbranch const simd clones without needing the loop to be adorned with 
the 'openmp simd' pragma.


Not sure how to describe adding new includes in the ChangeLog. Which 
brings me to another point: I contemplated changing gimple_call_flags to 
do the checking of the flags of the first argument of IFN_MASK_CALL 
itself, rather than only calling internal_fn_flags on 
gimple_call_internal_fn (stmt), but that might be a bit too intrusive; 
opinions welcome :)

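For reference, that more intrusive variant would be a fragment along
these lines inside gimple_call_flags (a sketch of the idea only, not
part of this patch):

  /* Hypothetical: let gimple_call_flags look through IFN_MASK_CALL to
     the real callee and pick up its const/pure bits.  */
  if (gimple_call_internal_p (stmt)
      && gimple_call_internal_fn (stmt) == IFN_MASK_CALL)
    {
      tree fndecl = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
      if (fndecl)
	flags |= (flags_from_decl_or_type (fndecl)
		  & (ECF_CONST | ECF_PURE));
    }
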

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


Is this OK for trunk?

gcc/ChangeLog:

* tree-data-ref.cc: Include calls.h.
(get_references_in_stmt): Correctly handle IFN_MASK_CALL.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-19.c: New test.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
new file mode 100644
index 
..09127b8cb6f2e3699b6073591f58be7047330273
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-19.c
@@ -0,0 +1,23 @@
+/* { dg-require-effective-target vect_simd_clones } */
+/* { dg-do compile } */
+/* { dg-additional-options "-fopenmp-simd" } */
+
+int __attribute__ ((__simd__, const)) fn (int);
+
+void test (int * __restrict__ a, int * __restrict__ b, int n)
+{
+  for (int i = 0; i < n; ++i)
+{
+  int a_;
+  if (b[i] > 0)
+a_ = fn (b[i]);
+  else
+a_ = b[i] + 5;
+  a[i] = a_;
+}
+}
+
+/* { dg-final { scan-tree-dump-not {loop contains function calls or data 
references} "vect" } } */
+
+/* The LTO test produces two dump files and we scan the wrong one.  */
+/* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/tree-data-ref.cc b/gcc/tree-data-ref.cc
index 
6d3b7c2290e4db9c1168a4c763facb481157c97c..2926c3925ee7897fef53c16cfd1d19d23dbf05f3
 100644
--- a/gcc/tree-data-ref.cc
+++ b/gcc/tree-data-ref.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "vr-values.h"
 #include "range-op.h"
 #include "tree-ssa-loop-ivopts.h"
+#include "calls.h"
 
 static struct datadep_stats
 {
@@ -5816,6 +5817,18 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
}
  case IFN_MASK_LOAD:
  case IFN_MASK_STORE:
+ case IFN_MASK_CALL:
+   {
+ tree orig_fndecl
+   = gimple_call_addr_fndecl (gimple_call_arg (stmt, 0));
+ if (!orig_fndecl)
+   {
+ clobbers_memory = true;
+ break;
+   }
+ if ((flags_from_decl_or_type (orig_fndecl) & ECF_CONST) == 0)
+   clobbers_memory = true;
+   }
break;
  default:
clobbers_memory = true;
@@ -5852,7 +5865,7 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
 }
   else if (stmt_code == GIMPLE_CALL)
 {
-  unsigned i, n;
+  unsigned i = 0, n;
   tree ptr, type;
   unsigned int align;
 
@@ -5879,13 +5892,15 @@ get_references_in_stmt (gimple *stmt, vec<data_ref_loc, va_heap> *references)
   ptr);
references->safe_push (ref);
return false;
+ case IFN_MASK_CALL:
+   i = 1;
  default:
break;
  }
 
   op0 = gimple_call_lhs (stmt);
   n = gimple_call_num_args (stmt);
-  for (i = 0; i < n; i++)
+  for (; i < n; i++)
{
  op1 = gimple_call_arg (stmt, i);
 


Re: aarch64, vect, omp: Add SVE support for simd clones [PR 96342]

2023-10-18 Thread Andre Vieira (lists)

Hi,

I noticed I had missed one of the preparatory patches at the start of 
this series (the first one); it is added now. I also removed 'vect: Add 
vector_mode parameter to simd_clone_usable', since after review we no 
longer deemed it necessary, and replaced the old 'vect: Add 
TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM' with 'omp: Reorder call for 
TARGET_SIMD_CLONE_ADJUST' after comments.


Bootstrapped and regression tested the series on 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu.



Andre Vieira (8):

omp: Replace simd_clone_subparts with TYPE_VECTOR_SUBPARTS [NEW]
parloops: Copy target and optimizations when creating a function clone [Reviewed]
parloops: Allow poly nit and bound [Cond reviewed, made the requested changes]
vect: Fix vect_get_smallest_scalar_type for simd clones [First review, made the requested changes, OK?]
vect: don't allow fully masked loops with non-masked simd clones [PR 110485] [Reviewed]
vect: Use inbranch simdclones in masked loops [Needs review]
vect: omp: Reorder call for TARGET_SIMD_CLONE_ADJUST [NEW]
aarch64: Add SVE support for simd clones [PR 96342] [Needs review]

PS: apologies for the inconsistent numbering of the emails, things got a 
bit confusing with removing and adding patches to the series.


On 30/08/2023 09:49, Andre Vieira (lists) via Gcc-patches wrote:

Hi,

This patch series aims to implement support for SVE simd clones when not 
specifying a 'simdlen' clause for AArch64. This patch depends on my 
earlier patch: '[PATCH] aarch64: enable mixed-types for aarch64 
simdclones'.


Bootstrapped and regression tested the series on 
aarch64-unknown-linux-gnu and x86_64-pc-linux-gnu. I also tried building 
the patches separately, but that was before some further clean-up 
restructuring, so will do that again prior to pushing.


Andre Vieira (8):

parloops: Copy target and optimizations when creating a function clone
parloops: Allow poly nit and bound
vect: Fix vect_get_smallest_scalar_type for simd clones
vect: don't allow fully masked loops with non-masked simd clones [PR 
110485]

vect: Use inbranch simdclones in masked loops
vect: Add vector_mode parameter to simd_clone_usable
vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM
aarch64: Add SVE support for simd clones [PR 96342]


Re: [PATCH 1/8] parloops: Copy target and optimizations when creating a function clone

2023-10-18 Thread Andre Vieira (lists)

Just posting a rebase for completion.

On 30/08/2023 13:31, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:



SVE simd clones require to be compiled with a SVE target enabled or the
argument types will not be created properly. To achieve this we need to copy
DECL_FUNCTION_SPECIFIC_TARGET from the original function declaration to the
clones.  I decided it was probably also a good idea to copy
DECL_FUNCTION_SPECIFIC_OPTIMIZATION in case the original function is meant to
be compiled with specific optimization options.


OK.


gcc/ChangeLog:

* tree-parloops.cc (create_loop_fn): Copy specific target and
optimization options to clone.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
e495bbd65270bdf90bae2c4a2b52777522352a77..a35f3d5023b06e5ef96eb4222488fcb34dd7bd45
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2203,6 +2203,11 @@ create_loop_fn (location_t loc)
   DECL_CONTEXT (t) = decl;
   TREE_USED (t) = 1;
   DECL_ARGUMENTS (decl) = t;
+  DECL_FUNCTION_SPECIFIC_TARGET (decl)
+= DECL_FUNCTION_SPECIFIC_TARGET (act_cfun->decl);
+  DECL_FUNCTION_SPECIFIC_OPTIMIZATION (decl)
+= DECL_FUNCTION_SPECIFIC_OPTIMIZATION (act_cfun->decl);
+
 
   allocate_struct_function (decl, false);
 


Re: [Patch 2/8] parloops: Allow poly nit and bound

2023-10-18 Thread Andre Vieira (lists)

Posting the changed patch for completion, already reviewed.

On 30/08/2023 13:32, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


Teach parloops how to handle a poly nit and bound ahead of the changes to
enable non-constant simdlen.


Can you use poly_int_tree_p to combine INTEGER_CST || POLY_INT_CST please?

OK with that change.


gcc/ChangeLog:

* tree-parloops.cc (try_to_transform_to_exit_first_loop_alt): Accept
poly NIT and ALT_BOUND.

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index 
a35f3d5023b06e5ef96eb4222488fcb34dd7bd45..80f3dd6dce281e1eb1d76d38bd09e6638a875142
 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -2531,14 +2531,15 @@ try_transform_to_exit_first_loop_alt (class loop *loop,
   tree nit_type = TREE_TYPE (nit);
 
   /* Figure out whether nit + 1 overflows.  */
-  if (TREE_CODE (nit) == INTEGER_CST)
+  if (poly_int_tree_p (nit))
 {
   if (!tree_int_cst_equal (nit, TYPE_MAX_VALUE (nit_type)))
{
  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
   nit, build_one_cst (nit_type));
 
- gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
+ gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST
+ || TREE_CODE (alt_bound) == POLY_INT_CST);
  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
  return true;
}


Re: [PATCH 8/8] aarch64: Add SVE support for simd clones [PR 96342]

2023-10-18 Thread Andre Vieira (lists)

Rebased, no major changes, still needs review.

On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:
This patch finalizes adding support for the generation of SVE simd 
clones when no simdlen is provided, following the ABI rules where the 
widest data type determines the minimum number of elements in a 
length-agnostic vector.

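As an illustration of that rule (hypothetical example, not taken from
the patch): given

  #pragma omp declare simd
  double f (double x, float y);

the widest type is the 64-bit double, so the length-agnostic clone
processes a multiple of 128 / 64 = 2 lanes per call, and its mangled
name uses 'x' in place of a constant simdlen (see the simd_clone_mangle
change below).
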

gcc/ChangeLog:

     * config/aarch64/aarch64-protos.h (add_sve_type_attribute): 
Declare.
 * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute): 
Make

 visibility global.
 * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
 chosen over SIMD ABI if an SVE type is used in return or arguments.
 (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd 
clone

 when no simdlen is provided, according to ABI rules.
 (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
 (aarch64_simd_clone_adjust_ret_or_param): New.
 (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
 * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
 (simd_clone_adjust): Adapt safelen check to be compatible with VLA
 simdlen.

gcc/testsuite/ChangeLog:

 * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
 * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
 * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
 longer necessary.
 * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1005,6 +1005,8 @@ namespace aarch64_sve {
 #ifdef GCC_TARGET_H
   bool verify_type_context (location_t, type_context_kind, const_tree, bool);
 #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+ const char *, const char *);
 }
 
 extern void aarch64_split_combinev16qi (rtx operands[3]);
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 
161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c
 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
 /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE vectors
and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
mangling of the type.  ACLE_NAME is the  name of the type.  */
-static void
+void
 add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
const char *mangled_name, const char *acle_name)
 {
   tree mangled_name_tree
 = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+= (acle_name ? get_identifier (acle_name) : NULL_TREE);
 
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);
+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
   value = tree_cons (NULL_TREE, mangled_name_tree, value);
   value = tree_cons (NULL_TREE, size_int (num_pr), value);
   value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree 
fntype)
 static const predefined_function_abi &
 aarch64_fntype_abi (const_tree fntype)
 {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-return aarch64_simd_abi ();
-
   if (aarch64_returns_value_in_sve_regs_p (fntype)
   || aarch64_takes_arguments_in_sve_regs_p (fntype))
 return aarch64_sve_abi ();
 
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
+return aarch64_simd_abi ();
+
   return default_function_abi;
 }
 
@@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
int num, bool explicit_p)
 {
   tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
   int count;
   unsigned HOST_WIDE_INT const_simdlen;
 
@@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and_simdlen 
(struct cgraph_node *node,
   if (TREE_CODE (ret_type) != VOID_TYPE)
 {
   nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
+  wds_elt_bits = nds_elt_bits;
   vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
 }
   else
-nds_elt_bits = POINTER_SIZE;
+{
+  nds_elt_bits = POINTER_SIZE;
+  wds_elt_bits = 0;
+}
 
   int i;
   tree type_arg_types = T

[PATCH6/8] omp: Reorder call for TARGET_SIMD_CLONE_ADJUST (was Re: [PATCH7/8] vect: Add TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM)

2023-10-18 Thread Andre Vieira (lists)
This patch moves the call to TARGET_SIMD_CLONE_ADJUST until after the 
arguments and return types have been transformed into vector types.  It 
also constructs the adjustments and retval modifications after this 
call, allowing targets to alter the types of the arguments and return of 
the clone prior to the modifications to the function definition.


Is this OK?

gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_adjust_return_type): Hoist out
code to create return array and don't return new type.
(simd_clone_adjust_argument_types): Hoist out code that creates
ipa_param_body_adjustments and don't return them.
(simd_clone_adjust): Call TARGET_SIMD_CLONE_ADJUST after return
and argument types have been vectorized, create adjustments and
return array after the hook.
(expand_simd_clones): Call TARGET_SIMD_CLONE_ADJUST after return
and argument types have been vectorized.

On 04/10/2023 13:40, Andre Vieira (lists) wrote:



On 04/10/2023 11:41, Richard Biener wrote:

On Wed, 4 Oct 2023, Andre Vieira (lists) wrote:




On 30/08/2023 14:04, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:

This patch adds a new target hook to enable us to adapt the types of return
and parameters of simd clones.  We use this in two ways, the first one is to
make sure we can create valid SVE types, including the SVE type attribute,
when creating a SVE simd clone, even when the target options do not support
SVE.  We are following the same behaviour seen with x86 that creates simd
clones according to the ABI rules when no simdlen is provided, even if that
simdlen is not supported by the current target options.  Note that this
doesn't mean the simd clone will be used in auto-vectorization.


You are not documenting the bool parameter of the new hook.

What's wrong with doing the adjustment in TARGET_SIMD_CLONE_ADJUST?


simd_clone_adjust_argument_types is called after that hook, so by the time we
call TARGET_SIMD_CLONE_ADJUST the types are still in scalar, not vector.  The
same is true for the return type one.

Also the changes to the types need to be taken into consideration in
'adjustments' I think.


Nothing in the three existing implementations of TARGET_SIMD_CLONE_ADJUST
relies on this ordering I think, how about moving the hook invocation
after simd_clone_adjust_argument_types?



But that wouldn't change the 'ipa_param_body_adjustments' for when we 
have a function definition and we need to redo the body.

Richard.

PS: I hope the subject line survived, my email client is having a bit of a
wobble this morning... it's what you get for updating software :(

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
ef0b9b48c7212900023bc0eaebca5e1f9389db77..fb80888190c88e29895ecfbbe1b17d390c9a9dfe
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -701,10 +701,9 @@ simd_clone_create (struct cgraph_node *old_node, bool 
force_local)
 }
 
 /* Adjust the return type of the given function to its appropriate
-   vector counterpart.  Returns a simd array to be used throughout the
-   function as a return value.  */
+   vector counterpart.  */
 
-static tree
+static void
 simd_clone_adjust_return_type (struct cgraph_node *node)
 {
   tree fndecl = node->decl;
@@ -714,7 +713,7 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
 
   /* Adjust the function return type.  */
   if (orig_rettype == void_type_node)
-return NULL_TREE;
+return;
   t = TREE_TYPE (TREE_TYPE (fndecl));
   if (INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
 veclen = node->simdclone->vecsize_int;
@@ -737,24 +736,6 @@ simd_clone_adjust_return_type (struct cgraph_node *node)
veclen));
 }
   TREE_TYPE (TREE_TYPE (fndecl)) = t;
-  if (!node->definition)
-return NULL_TREE;
-
-  t = DECL_RESULT (fndecl);
-  /* Adjust the DECL_RESULT.  */
-  gcc_assert (TREE_TYPE (t) != void_type_node);
-  TREE_TYPE (t) = TREE_TYPE (TREE_TYPE (fndecl));
-  relayout_decl (t);
-
-  tree atype = build_array_type_nelts (orig_rettype,
-  node->simdclone->simdlen);
-  if (maybe_ne (veclen, node->simdclone->simdlen))
-return build1 (VIEW_CONVERT_EXPR, atype, t);
-
-  /* Set up a SIMD array to use as the return value.  */
-  tree retval = create_tmp_var_raw (atype, "retval");
-  gimple_add_tmp_var (retval);
-  return retval;
 }
 
 /* Each vector argument has a corresponding array to be used locally
@@ -788,7 +769,7 @@ create_tmp_simd_array (const char *prefix, tree type, 
poly_uint64 simdlen)
declarations will be remapped.  New arguments which are not to be remapped
are marked with USER_FLAG.  */
 
-static ipa_param_body_adjustments *
+static void
 simd_clone_adjust_argument_types (struct cgraph_node *node)
 {
   auto_vec<tree> args;
@@ -798,15 +779,11 @@ simd_clone_adjust_argument_types (

Re: [PATCH 5/8] vect: Use inbranch simdclones in masked loops

2023-10-18 Thread Andre Vieira (lists)

Rebased, needs review.

On 30/08/2023 10:13, Andre Vieira (lists) via Gcc-patches wrote:
This patch enables the compiler to use inbranch simdclones when 
generating masked loops in autovectorization.


gcc/ChangeLog:

 * omp-simd-clone.cc (simd_clone_adjust_argument_types): Make function
 compatible with mask parameters in clone.
 * tree-vect-stmts.cc (vect_convert): New helper function.
 (vect_build_all_ones_mask): Allow vector boolean typed masks.
 (vectorizable_simd_clone_call): Enable the use of masked clones in
 fully masked loops.

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
a42643400ddcf10961633448b49d4caafb999f12..ef0b9b48c7212900023bc0eaebca5e1f9389db77
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -807,8 +807,14 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
 {
   ipa_adjusted_param adj;
   memset (&adj, 0, sizeof (adj));
-  tree parm = args[i];
-  tree parm_type = node->definition ? TREE_TYPE (parm) : parm;
+  tree parm = NULL_TREE;
+  tree parm_type = NULL_TREE;
+  if(i < args.length())
+   {
+ parm = args[i];
+ parm_type = node->definition ? TREE_TYPE (parm) : parm;
+   }
+
   adj.base_index = i;
   adj.prev_clone_index = i;
 
@@ -1547,7 +1553,7 @@ simd_clone_adjust (struct cgraph_node *node)
  mask = gimple_assign_lhs (g);
  g = gimple_build_assign (make_ssa_name (TREE_TYPE (mask)),
   BIT_AND_EXPR, mask,
-  build_int_cst (TREE_TYPE (mask), 1));
+  build_one_cst (TREE_TYPE (mask)));
  gsi_insert_after (, g, GSI_CONTINUE_LINKING);
  mask = gimple_assign_lhs (g);
}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
731acc76350cae39c899a866584068cff247183a..6e2c70c1d3970af652c1e50e41b144162884bf24
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1594,6 +1594,20 @@ check_load_store_for_partial_vectors (loop_vec_info 
loop_vinfo, tree vectype,
 }
 }
 
+/* Return SSA name of the result of the conversion of OPERAND into type TYPE.
+   The conversion statement is inserted at GSI.  */
+
+static tree
+vect_convert (vec_info *vinfo, stmt_vec_info stmt_info, tree type, tree 
operand,
+ gimple_stmt_iterator *gsi)
+{
+  operand = build1 (VIEW_CONVERT_EXPR, type, operand);
+  gassign *new_stmt = gimple_build_assign (make_ssa_name (type),
+  operand);
+  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+  return gimple_get_lhs (new_stmt);
+}
+
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
that needs to be applied to all loads and stores in a vectorized loop.
@@ -2547,7 +2561,8 @@ vect_build_all_ones_mask (vec_info *vinfo,
 {
   if (TREE_CODE (masktype) == INTEGER_TYPE)
 return build_int_cst (masktype, -1);
-  else if (TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
+  else if (VECTOR_BOOLEAN_TYPE_P (masktype)
+  || TREE_CODE (TREE_TYPE (masktype)) == INTEGER_TYPE)
 {
   tree mask = build_int_cst (TREE_TYPE (masktype), -1);
   mask = build_vector_from_val (masktype, mask);
@@ -4156,7 +4171,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   size_t i, nargs;
   tree lhs, rtype, ratype;
   vec<constructor_elt, va_gc> *ret_ctor_elts = NULL;
-  int arg_offset = 0;
+  int masked_call_offset = 0;
 
   /* Is STMT a vectorizable call?   */
   gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
@@ -4171,7 +4186,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
   fndecl = TREE_OPERAND (fndecl, 0);
   gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
-  arg_offset = 1;
+  masked_call_offset = 1;
 }
   if (fndecl == NULL_TREE)
 return false;
@@ -4199,7 +4214,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 return false;
 
   /* Process function arguments.  */
-  nargs = gimple_call_num_args (stmt) - arg_offset;
+  nargs = gimple_call_num_args (stmt) - masked_call_offset;
 
   /* Bail out if the function has zero arguments.  */
   if (nargs == 0)
@@ -4221,7 +4236,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   thisarginfo.op = NULL_TREE;
   thisarginfo.simd_lane_linear = false;
 
-  int op_no = i + arg_offset;
+  int op_no = i + masked_call_offset;
   if (slp_node)
op_no = vect_slp_child_index_for_operand (stmt, op_no);
   if (!vect_is_simple_use (vinfo, stmt_info, slp_node,
@@ -4303,16 +4318,6 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   arginfo.quick_push (thisarginfo);
 }
 
-  if (loop_vinfo
-  && !LOOP_VINFO_VEC

Re: [Patch 3/8] vect: Fix vect_get_smallest_scalar_type for simd clones

2023-10-18 Thread Andre Vieira (lists)

Made it a local function and changed prototype according to comments.

Is this OK?

 gcc/ChangeLog:
* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Special case
simd clone calls and only use types that are mapped to vectors.
(simd_clone_call_p): New helper function.

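Per the review comment below, the helper now returns the cgraph node
directly, roughly along these lines (a sketch of the new shape, not the
exact committed code):

  /* If STMT is a call to a function with simd clones, return its cgraph
     node, looking through IFN_MASK_CALL; otherwise return NULL.  */
  static cgraph_node *
  simd_clone_call_p (gimple *stmt)
  {
    gcall *call = dyn_cast <gcall *> (stmt);
    if (!call)
      return NULL;
    tree fndecl = NULL_TREE;
    if (gimple_call_internal_p (call, IFN_MASK_CALL))
      fndecl = gimple_call_addr_fndecl (gimple_call_arg (call, 0));
    else
      fndecl = gimple_call_fndecl (call);
    if (fndecl == NULL_TREE)
      return NULL;
    cgraph_node *node = cgraph_node::get (fndecl);
    if (node && node->simd_clones != NULL)
      return node;
    return NULL;
  }
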
On 30/08/2023 13:54, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


The vect_get_smallest_scalar_type helper function was using any argument to a
simd clone call when trying to determine the smallest scalar type that would
be vectorized.  This included the function pointer type in a MASK_CALL for
instance, and would result in the wrong type being selected.  Instead this
patch special cases simd_clone_call's and uses only scalar types of the
original function that get transformed into vector types.


Looks sensible.

+bool
+simd_clone_call_p (gimple *stmt, cgraph_node **out_node)

you could return the cgraph_node * or NULL here.  Are you going to
use the function elsewhere?  Otherwise put it in the same TU as
the only use please and avoid exporting it.

Richard.


gcc/ChangeLog:

* tree-vect-data-refs.cc (vect_get_smallest_scalar_type): Special case
simd clone calls and only use types that are mapped to vectors.
* tree-vect-stmts.cc (simd_clone_call_p): New helper function.
* tree-vectorizer.h (simd_clone_call_p): Declare new function.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-simd-clone-16f.c: Remove unnecessary differentiation
between targets with different pointer sizes.
* gcc.dg/vect/vect-simd-clone-17f.c: Likewise.
* gcc.dg/vect/vect-simd-clone-18f.c: Likewise.

diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
index 
574698d3e133ecb8700e698fa42a6b05dd6b8a18..7cd29e894d0502a59fadfe67db2db383133022d3
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-16f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-16.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
index 
8bb6d19301a67a3eebce522daaf7d54d88f708d7..177521dc44531479fca1f1a1a0f2010f30fa3fb5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-17f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-17.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } { "-flto" } { "" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c 
b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
index 
d34f23f4db8e9c237558cc22fe66b7e02b9e6c20..4dd51381d73c0c7c8ec812f24e5054df038059c5
 100644
--- a/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
+++ b/gcc/testsuite/gcc.dg/vect/vect-simd-clone-18f.c
@@ -7,9 +7,8 @@
 #include "vect-simd-clone-18.c"
 
 /* Ensure the the in-branch simd clones are used on targets that support them.
-   Some targets use pairs of vectors and do twice the calls.  */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
{ target { ! { { i?86-*-* x86_64-*-* } && { ! lp64 } } } } } } */
-/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 4 "vect" 
{ target { { i?86*-*-* x86_64-*-* } && { ! lp64 } } } } } */
+ */
+/* { dg-final { scan-tree-dump-times {[\n\r] [^\n]* = foo\.simdclone} 2 "vect" 
} } */
 
 /* The LTO test produces two dump files and we scan the wrong one.  */
 /* { dg-skip-if "" { *-*-* } 

[PATCH 0/8] omp: Replace simd_clone_subparts with TYPE_VECTOR_SUBPARTS

2023-10-18 Thread Andre Vieira (lists)


Refactor simd clone handling code ahead of support for poly simdlen.

gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_subparts): Remove.
(simd_clone_init_simd_arrays): Replace simd_clone_subparts with
TYPE_VECTOR_SUBPARTS.
(ipa_simd_modify_function_body): Likewise.
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Likewise.
(simd_clone_subparts): Remove.
diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
c1cb7cc8a5c770940bc2032f824e084b37e96dbe..a42643400ddcf10961633448b49d4caafb999f12
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -255,16 +255,6 @@ ok_for_auto_simd_clone (struct cgraph_node *node)
   return true;
 }
 
-
-/* Return the number of elements in vector type VECTYPE, which is associated
-   with a SIMD clone.  At present these always have a constant length.  */
-
-static unsigned HOST_WIDE_INT
-simd_clone_subparts (tree vectype)
-{
-  return TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
-}
-
 /* Allocate a fresh `simd_clone' and return it.  NARGS is the number
of arguments to reserve space for.  */
 
@@ -1028,7 +1018,7 @@ simd_clone_init_simd_arrays (struct cgraph_node *node,
}
  continue;
}
-  if (known_eq (simd_clone_subparts (TREE_TYPE (arg)),
+  if (known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg)),
node->simdclone->simdlen))
{
  tree ptype = build_pointer_type (TREE_TYPE (TREE_TYPE (array)));
@@ -1040,7 +1030,7 @@ simd_clone_init_simd_arrays (struct cgraph_node *node,
}
   else
{
- unsigned int simdlen = simd_clone_subparts (TREE_TYPE (arg));
+ poly_uint64 simdlen = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg));
  unsigned int times = vector_unroll_factor (node->simdclone->simdlen,
 simdlen);
  tree ptype = build_pointer_type (TREE_TYPE (TREE_TYPE (array)));
@@ -1226,9 +1216,9 @@ ipa_simd_modify_function_body (struct cgraph_node *node,
  iter, NULL_TREE, NULL_TREE);
   adjustments->register_replacement (&(*adjustments->m_adj_params)[j], r);
 
-  if (multiple_p (node->simdclone->simdlen, simd_clone_subparts (vectype)))
+  if (multiple_p (node->simdclone->simdlen, TYPE_VECTOR_SUBPARTS 
(vectype)))
j += vector_unroll_factor (node->simdclone->simdlen,
-  simd_clone_subparts (vectype)) - 1;
+  TYPE_VECTOR_SUBPARTS (vectype)) - 1;
 }
   adjustments->sort_replacements ();
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
9bb43e98f56d18929c9c02227954fdf38eafefd8..a9156975d64c7a335ffd27614e87f9d11b23d1ba
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4126,16 +4126,6 @@ vect_simd_lane_linear (tree op, class loop *loop,
 }
 }
 
-/* Return the number of elements in vector type VECTYPE, which is associated
-   with a SIMD clone.  At present these vectors always have a constant
-   length.  */
-
-static unsigned HOST_WIDE_INT
-simd_clone_subparts (tree vectype)
-{
-  return TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
-}
-
 /* Function vectorizable_simd_clone_call.
 
Check if STMT_INFO performs a function call that can be vectorized
@@ -4429,7 +4419,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
slp_node);
  if (arginfo[i].vectype == NULL
  || !constant_multiple_p (bestn->simdclone->simdlen,
-  simd_clone_subparts 
(arginfo[i].vectype)))
+  TYPE_VECTOR_SUBPARTS 
(arginfo[i].vectype)))
return false;
}
 
@@ -4444,10 +4434,11 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 
   if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK)
{
+ tree clone_arg_vectype = bestn->simdclone->args[i].vector_type;
  if (bestn->simdclone->mask_mode == VOIDmode)
{
- if (simd_clone_subparts (bestn->simdclone->args[i].vector_type)
- != simd_clone_subparts (arginfo[i].vectype))
+ if (maybe_ne (TYPE_VECTOR_SUBPARTS (clone_arg_vectype),
+   TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))
{
  /* FORNOW we only have partial support for vector-type masks
 that can't hold all of simdlen. */
@@ -4464,7 +4455,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
  if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))
  || maybe_ne (exact_div (bestn->simdclone->simdlen,
  num_mask_args),
-  simd_clone_subparts (arginfo[i].vectype)))
+  TYPE_VECTOR_SUBPARTS (arginfo[i].vectype)))

Re: [PATCH 4/8] vect: don't allow fully masked loops with non-masked simd clones [PR 110485]

2023-10-18 Thread Andre Vieira (lists)
Rebased on top of trunk; minor change to check that loop_vinfo is 
non-null, since we now do some SLP vectorization for simd clones.


I assume the previous OK still holds.

On 30/08/2023 13:54, Richard Biener wrote:

On Wed, 30 Aug 2023, Andre Vieira (lists) wrote:


When analyzing a loop and choosing a simdclone to use it is possible to choose
a simdclone that cannot be used 'inbranch' for a loop that can use partial
vectors.  This may lead to the vectorizer deciding to use partial vectors
which are not supported for notinbranch simd clones. This patch fixes that by
disabling the use of partial vectors once a notinbranch simd clone has been
selected.


OK.


gcc/ChangeLog:

PR tree-optimization/110485
* tree-vect-stmts.cc (vectorizable_simd_clone_call): Disable partial
vectors usage if a notinbranch simdclone has been selected.

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr110485.c: New test.

diff --git a/gcc/testsuite/gcc.dg/gomp/pr110485.c 
b/gcc/testsuite/gcc.dg/gomp/pr110485.c
new file mode 100644
index 
..ba6817a127f40246071e32ccebf692cc4d121d15
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/pr110485.c
@@ -0,0 +1,19 @@
+/* PR 110485 */
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast -fdump-tree-vect-details" } */
+/* { dg-additional-options "-march=znver4 --param=vect-partial-vector-usage=1" 
{ target x86_64-*-* } } */
+#pragma omp declare simd notinbranch uniform(p)
+extern double __attribute__ ((const)) bar (double a, double p);
+
+double a[1024];
+double b[1024];
+
+void foo (int n)
+{
+  #pragma omp simd
+  for (int i = 0; i < n; ++i)
+a[i] = bar (b[i], 71.2);
+}
+
+/* { dg-final { scan-tree-dump-not "MASK_LOAD" "vect" } } */
+/* { dg-final { scan-tree-dump "can't use a fully-masked loop because a 
non-masked simd clone was selected." "vect" { target x86_64-*-* } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
a9156975d64c7a335ffd27614e87f9d11b23d1ba..731acc76350cae39c899a866584068cff247183a
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -4539,6 +4539,17 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
   ? boolean_true_node : boolean_false_node;
simd_clone_info.safe_push (sll);
  }
+
+  if (!bestn->simdclone->inbranch && loop_vinfo)
+   {
+ if (dump_enabled_p ()
+ && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+   dump_printf_loc (MSG_NOTE, vect_location,
+"can't use a fully-masked loop because a"
+" non-masked simd clone was selected.\n");
+ LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+   }
+
   STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
 /*  vect_model_simple_cost (vinfo, stmt_info, ncopies,


[PATCH] ifcvt: Don't lower bitfields with non-constant offsets [PR 111882]

2023-10-20 Thread Andre Vieira (lists)

Hi,

This patch stops lowering of bitfields by ifcvt when they have
non-constant offsets, as we are not likely to be able to do anything
useful with those during vectorization.  That also fixes the issue
reported in PR 111882, which was caused by an offset with a side-effect
being lowered; constants have no side-effects, so we will no longer run
into that problem.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

OK for trunk?

gcc/ChangeLog:

PR tree-optimization/111882
* tree-if-conv.cc (get_bitfield_rep): Return NULL_TREE for bitfields
with non-constant offsets.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr111882.c: New test.

diff --git a/gcc/testsuite/gcc.dg/vect/pr111882.c 
b/gcc/testsuite/gcc.dg/vect/pr111882.c
new file mode 100644
index 
..024ad57b6930dd1f516e9c8127e2b440016360c6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr111882.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options { -fdump-tree-ifcvt-all } } */
+
+static void __attribute__((noipa)) f(int n) {
+  int i, j;
+  struct S { char d[n]; int a; int b : 17; int c : 12; };
+  struct S A[100][];
+  for (i = 0; i < 100; i++) {
+asm volatile("" : : "g"(&A[0][0]) : "memory");
+for (j = 0; j < ; j++) A[i][j].b = 2;
+  }
+}
+void g(void) { f(1); }
+
+/* { dg-final { scan-tree-dump-not "Bitfield OK to lower" "ifcvt" } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 
dab7eeb7707ae8f1f342a571f8e5c99e0ef39309..66f3c882cb688049224b344b81637e7b1ae7e36c
 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -3495,6 +3495,7 @@ get_bitfield_rep (gassign *stmt, bool write, tree *bitpos,
: gimple_assign_rhs1 (stmt);
 
   tree field_decl = TREE_OPERAND (comp_ref, 1);
+  tree ref_offset = component_ref_field_offset (comp_ref);
   tree rep_decl = DECL_BIT_FIELD_REPRESENTATIVE (field_decl);
 
   /* Bail out if the representative is not a suitable type for a scalar
@@ -3509,6 +3510,12 @@ get_bitfield_rep (gassign *stmt, bool write, tree 
*bitpos,
   if (compare_tree_int (DECL_SIZE (field_decl), bf_prec) != 0)
 return NULL_TREE;
 
+  if (!TREE_CONSTANT (DECL_FIELD_OFFSET (rep_decl))
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (rep_decl))
+  || !TREE_CONSTANT (ref_offset)
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (field_decl)))
+return NULL_TREE;
+
   if (struct_expr)
 *struct_expr = TREE_OPERAND (comp_ref, 0);
 
@@ -3529,7 +3536,7 @@ get_bitfield_rep (gassign *stmt, bool write, tree *bitpos,
 the structure and the container from the number of bits from the start
 of the structure and the actual bitfield member. */
   tree bf_pos = fold_build2 (MULT_EXPR, bitsizetype,
-DECL_FIELD_OFFSET (field_decl),
+ref_offset,
 build_int_cst (bitsizetype, BITS_PER_UNIT));
   bf_pos = fold_build2 (PLUS_EXPR, bitsizetype, bf_pos,
DECL_FIELD_BIT_OFFSET (field_decl));


Re: [PATCH] ifcvt: Don't lower bitfields with non-constant offsets [PR 111882]

2023-10-20 Thread Andre Vieira (lists)




On 20/10/2023 14:41, Richard Biener wrote:

On Fri, 20 Oct 2023, Andre Vieira (lists) wrote:


Hi,

This patch stops lowering of bitfields by ifcvt when they have non-constant
offsets as we are not likely to be able to do anything useful with those
during
vectorization.  That also fixes the issue reported in PR 111882, which was
being caused by an offset with a side-effect being lowered, but constants have
no side-effects so we will no longer run into that problem.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

OK for trunk?


+  if (!TREE_CONSTANT (DECL_FIELD_OFFSET (rep_decl))
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (rep_decl))
+  || !TREE_CONSTANT (ref_offset)
+  || !TREE_CONSTANT (DECL_FIELD_BIT_OFFSET (field_decl)))
+return NULL_TREE;

DECL_FIELD_BIT_OFFSET is always constant.  Please test
TREE_CODE (..) == INTEGER_CST instead of TREE_CONSTANT.


OK with those changes.
After I sent it I realized it would've been nicer to add a diagnostic; 
are you OK with:

+  if (dump_file && (dump_flags & TDF_DETAILS))
+   fprintf (dump_file, "\t Bitfield NOT OK to lower,"
+   " offset is non-constant.\n");


Richard.



gcc/ChangeLog:

PR tree-optimization/111882
* tree-if-conv.cc (get_bitfield_rep): Return NULL_TREE for bitfields
with
non-constant offsets.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr111882.c: New test.





Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-10-23 Thread Andre Vieira (lists)
Ping for Jeff or another global maintainer to review the target-agnostic 
bits of this, that is:

loop-doloop.cc
df-core.{c,h}

I do have a nitpick myself that I missed last time around:
  /* We expect the condition to be of the form (reg != 0)  */
  cond = XEXP (SET_SRC (cmp), 0);
- if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+ if ((GET_CODE (cond) != NE && GET_CODE (cond) != GE)
+ || XEXP (cond, 1) != const0_rtx)
return 0;
}
Could do with updating the comment to reflect allowing >= now. But happy 
for you to change this once approved by a maintainer.

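E.g. something like this would do for the comment (suggestion only):

  /* We expect the condition to be of the form (reg != 0)
     or (reg >= 0).  */
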

Kind regards,
Andre

On 11/10/2023 12:34, Stamatis Markianos-Wright wrote:

Hi all,

On 28/09/2023 13:51, Andre Vieira (lists) wrote:

Hi,

On 14/09/2023 13:10, Kyrylo Tkachov via Gcc-patches wrote:

Hi Stam,





The arm parts look sensible but we'd need review for the df-core.h 
and df-core.cc changes.

Maybe Jeff can help or can recommend someone to take a look?


Just thought I'd do a follow-up "ping" on this :)



Thanks,
Kyrill



FWIW the changes LGTM, if we don't want these in df-core we can always 
implement the extra utility locally. It's really just a helper 
function to check if df_bb_regno_first_def_find and 
df_bb_regno_last_def_find yield the same result, meaning we only have 
a single definition.


Kind regards,
Andre


Thanks,

Stam



Re: veclower: improve selection of vector mode when lowering [PR 112787]

2024-02-19 Thread Andre Vieira (lists)

Hi all,

OK to backport this to gcc-12 and gcc-13? Patch applies cleanly, 
bootstrapped and regression tested on aarch64-unknown-linux-gnu. Only 
change is in the testcase as I had to use -march=armv9-a because 
-march=armv8-a+sve conflicts with -mcpu=neoverse-n2 in previous gcc 
versions.


Kind Regards,
Andre

On 20/12/2023 14:30, Richard Biener wrote:

On Wed, 20 Dec 2023, Andre Vieira (lists) wrote:


Thanks, fully agree with all comments.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic (type_for_widest_vector_mode): Change function
 to use original vector type and check widest vector mode has at most
 the same number of elements.
(get_compute_type): Pass original vector type rather than the element
 type to type_for_widest_vector_mode and remove now obsolete check
 for the number of elements.


OK.

Richard.


On 07/12/2023 07:45, Richard Biener wrote:

On Wed, 6 Dec 2023, Andre Vieira (lists) wrote:


Hi,

This patch addresses the issue reported in PR target/112787 by improving
the
compute type selection.  We do this by not considering types with more
elements
than the type we are lowering since we'd reject such types anyway.

gcc/ChangeLog:

  PR target/112787
  * tree-vect-generic (type_for_widest_vector_mode): Add a parameter to
  control maximum amount of elements in resulting vector mode.
  (get_compute_type): Restrict vector_compute_type to a mode no wider
  than the original compute type.

gcc/testsuite/ChangeLog:

  * gcc.target/aarch64/pr112787.c: New test.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
x86_64-pc-linux-gnu.

Is this OK for trunk?


@@ -1347,7 +1347,7 @@ optimize_vector_constructor (gimple_stmt_iterator
*gsi)
  TYPE, or NULL_TREE if none is found.  */

Can you improve the function comment?  It also doesn't mention OP ...

   static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree type, optab op, poly_int64 max_nunits =
0)
   {
 machine_mode inner_mode = TYPE_MODE (type);
 machine_mode best_mode = VOIDmode, mode;
@@ -1371,7 +1371,9 @@ type_for_widest_vector_mode (tree type, optab op)
 FOR_EACH_MODE_FROM (mode, mode)
   if (GET_MODE_INNER (mode) == inner_mode
  && maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op, mode) != CODE_FOR_nothing)
+   && optab_handler (op, mode) != CODE_FOR_nothing
+   && (known_eq (max_nunits, 0)
+   || known_lt (GET_MODE_NUNITS (mode), max_nunits)))

max_nunits suggests that known_le would be appropriate instead.

I see the only other caller with similar "problems":

  }
/* Can't use get_compute_type here, as supportable_convert_operation
   doesn't necessarily use an optab and needs two arguments.  */
tree vec_compute_type
  = type_for_widest_vector_mode (TREE_TYPE (arg_type), mov_optab);
if (vec_compute_type
&& VECTOR_MODE_P (TYPE_MODE (vec_compute_type))
&& subparts_gt (arg_type, vec_compute_type))

so please do not default to 0 but adjust this one as well.  It also
seems you then can remove the subparts_gt guards on both
vec_compute_type uses.

I think the API would be cleaner if we'd pass the original vector type
we can then extract TYPE_VECTOR_SUBPARTS from, avoiding the extra arg.

No?

Thanks,
Richard.






[comitted] bitint: Fix testism where __seg_gs was being used for all targets

2024-02-19 Thread Andre Vieira (lists)
Replaced uses of __seg_gs with the macro SEG, defined in the testcase to 
pick the right __seg_{gs,fs} keyword (if any) based on the target.

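For context, SEG resolves along these lines in the testcase (a sketch;
the actual definition is in the part of bitint-86.c not quoted below):

  #if defined (__x86_64__)
  #define SEG __seg_gs
  #elif defined (__i386__)
  #define SEG __seg_fs
  #endif
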

gcc/testsuite/ChangeLog:

* gcc.dg/bitint-86.c (__seg_gs): Replace with SEG MACRO.

diff --git a/gcc/testsuite/gcc.dg/bitint-86.c b/gcc/testsuite/gcc.dg/bitint-86.c
index 
4e5761a203bc39150540326df9c0d88544bb02ef..10a2392b6f530ae165252bdac750061e92d53131
 100644
--- a/gcc/testsuite/gcc.dg/bitint-86.c
+++ b/gcc/testsuite/gcc.dg/bitint-86.c
@@ -15,14 +15,14 @@ struct T { struct S b[4]; };
 #endif
 
 void
-foo (__seg_gs struct T *p)
+foo (SEG struct T *p)
 {
   struct S s;
   p->b[0] = s;
 }
 
 void
-bar (__seg_gs struct T *p, _BitInt(710) x, int y, double z)
+bar (SEG struct T *p, _BitInt(710) x, int y, double z)
 {
   p->b[0].a = x + 42;
   p->b[1].a = x << y;
@@ -31,7 +31,7 @@ bar (__seg_gs struct T *p, _BitInt(710) x, int y, double z)
 }
 
 int
-baz (__seg_gs struct T *p, _BitInt(710) x, _BitInt(710) y)
+baz (SEG struct T *p, _BitInt(710) x, _BitInt(710) y)
 {
   return __builtin_add_overflow (x, y, &p->b[1].a);
 }


Re: [comitted] bitint: Fix testism where __seg_gs was being used for all targets

2024-02-19 Thread Andre Vieira (lists)




On 19/02/2024 16:17, Jakub Jelinek wrote:

On Mon, Feb 19, 2024 at 04:13:29PM +, Andre Vieira (lists) wrote:

Replaced uses of __seg_gs with the MACRO SEG defined in the testcase to pick
(if any) the right __seg_{gs,fs} keyword based on target.

gcc/testsuite/ChangeLog:

* gcc.dg/bitint-86.c (__seg_gs): Replace with SEG MACRO.


ChangeLog should be
* gcc.dg/bitint-86.c (foo, bar, baz): Replace __seg_gs with SEG.
Otherwise, LGTM.
Sorry for forgetting to do that myself.


Jakub



That makes sense... but I already pushed it upstream; I thought it was 
obvious. Apologies for the ChangeLog mistake :(


[PATCH] Fix tests for gomp

2023-12-13 Thread Andre Vieira (lists)

Hi,

Apologies for the delay and this mixup. I need to do something different

This is to fix testisms initially introduced by:
commit f5fc001a84a7dbb942a6252b3162dd38b4aae311
Author: Andre Vieira 
Date:   Mon Dec 11 14:24:41 2023 +

aarch64: enable mixed-types for aarch64 simdclones

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr87887-1.c: Fixed test.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.

libgomp/ChangeLog:

* testsuite/libgomp.c/declare-variant-1.c: Fixed test.
* testsuite/libgomp.fortran/declare-simd-1.f90: Likewise.

OK for trunk? I was intending to commit as obvious, but Jakub had made a 
comment about declare-simd-1.f90, so I thought it might be worth just 
sending it up to the mailing list first.


Kind regards,
Andre

diff --git a/gcc/testsuite/gcc.dg/gomp/pr87887-1.c 
b/gcc/testsuite/gcc.dg/gomp/pr87887-1.c
index 
281898300c7794d862e62c70a83a33d5aaa8f89e..8b04ffd0809be4e6f5ab97c2e32e800edffbee4f
 100644
--- a/gcc/testsuite/gcc.dg/gomp/pr87887-1.c
+++ b/gcc/testsuite/gcc.dg/gomp/pr87887-1.c
@@ -10,7 +10,6 @@ foo (int x)
 {
   return (struct S) { x };
 }
-/* { dg-warning "unsupported return type ‘struct S’ for ‘simd’ functions" "" { 
target aarch64*-*-* } .-4 } */
 
 #pragma omp declare simd
 int
@@ -18,7 +17,6 @@ bar (struct S x)
 {
   return x.n;
 }
-/* { dg-warning "unsupported argument type ‘struct S’ for ‘simd’ functions" "" 
{ target aarch64*-*-* } .-4 } */
 
 #pragma omp declare simd uniform (x)
 int
diff --git a/gcc/testsuite/gcc.dg/gomp/pr89246-1.c 
b/gcc/testsuite/gcc.dg/gomp/pr89246-1.c
index 
4a0fd74f0639b2832dcb9101e006d127568fbcbd..dfe629c1c6a51624cd94878c638606220cfe94eb
 100644
--- a/gcc/testsuite/gcc.dg/gomp/pr89246-1.c
+++ b/gcc/testsuite/gcc.dg/gomp/pr89246-1.c
@@ -8,7 +8,6 @@ int foo (__int128 x)
 {
   return x;
 }
-/* { dg-warning "unsupported argument type ‘__int128’ for ‘simd’ functions" "" 
{ target aarch64*-*-* } .-4 } */
 
 #pragma omp declare simd
 extern int bar (int x);
diff --git a/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c 
b/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
index 
f12244054bd46fa10e51cc3a688c4cf683689994..354078acd9f3073b8400621a0e7149aee571594b
 100644
--- a/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
+++ b/gcc/testsuite/gcc.dg/gomp/simd-clones-2.c
@@ -19,7 +19,6 @@ float setArray(float *a, float x, int k)
 /* { dg-final { scan-tree-dump "_ZGVnN2ua32vl_setArray" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnN4ua32vl_setArray" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnN2vvva32_addit" "optimized" { target 
aarch64*-*-* } } } */
-/* { dg-final { scan-tree-dump "_ZGVnN4vvva32_addit" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnM2vl66u_addit" "optimized" { target 
aarch64*-*-* } } } */
 /* { dg-final { scan-tree-dump "_ZGVnM4vl66u_addit" "optimized" { target 
aarch64*-*-* } } } */
 
diff --git a/libgomp/testsuite/libgomp.c/declare-variant-1.c 
b/libgomp/testsuite/libgomp.c/declare-variant-1.c
index 
6129f23a0f80585246957022d63608dc3a68f1ff..790e9374054fe3e0ae609796640ff295b61e8389
 100644
--- a/libgomp/testsuite/libgomp.c/declare-variant-1.c
+++ b/libgomp/testsuite/libgomp.c/declare-variant-1.c
@@ -40,16 +40,17 @@ f04 (int a)
 int
 test1 (int x)
 {
-  /* At gimplification time, we can't decide yet which function to call.  */
-  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
+  /* At gimplification time, we can't decide yet which function to call for
+ x86_64 targets, given the f01 variant.  */
+  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target 
x86_64-*-* } } } */
   /* After simd clones are created, the original non-clone test1 shall
  call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
  shall call f01 with score 8.  */
   /* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } */
   int a = f04 (x);
   int b = f04 (x);
   return a + b;
diff --git a/libgomp/testsuite/libgomp.fortran/declare-simd-1.f90 

Re: [PATCH] Fix tests for gomp

2023-12-13 Thread Andre Vieira (lists)




On 13/12/2023 10:55, Jakub Jelinek wrote:

On Wed, Dec 13, 2023 at 10:43:16AM +, Andre Vieira (lists) wrote:

Hi,

Apologies for the delay and this mixup. I need to do something different

This is to fix testisms initially introduced by:
commit f5fc001a84a7dbb942a6252b3162dd38b4aae311
Author: Andre Vieira 
Date:   Mon Dec 11 14:24:41 2023 +

 aarch64: enable mixed-types for aarch64 simdclones

gcc/testsuite/ChangeLog:

* gcc.dg/gomp/pr87887-1.c: Fixed test.
* gcc.dg/gomp/pr89246-1.c: Likewise.
* gcc.dg/gomp/simd-clones-2.c: Likewise.

libgomp/ChangeLog:

* testsuite/libgomp.c/declare-variant-1.c: Fixed test.
* testsuite/libgomp.fortran/declare-simd-1.f90: Likewise.

OK for trunk? I was intending to commit this as obvious, but Jakub had made a
comment about declare-simd-1.f90, so I thought it might be worth sending it
to the mailing list first.



--- a/libgomp/testsuite/libgomp.c/declare-variant-1.c
+++ b/libgomp/testsuite/libgomp.c/declare-variant-1.c
@@ -40,16 +40,17 @@ f04 (int a)
  int
  test1 (int x)
  {
-  /* At gimplification time, we can't decide yet which function to call.  */
-  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */
+  /* At gimplification time, we can't decide yet which function to call for
+ x86_64 targets, given the f01 variant.  */
+  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" { target 
x86_64-*-* } } } */
/* After simd clones are created, the original non-clone test1 shall
   call f03 (score 6), the sse2/avx/avx2 clones too, but avx512f clones
   shall call f01 with score 8.  */
/* { dg-final { scan-ltrans-tree-dump-not "f04 \\\(x" "optimized" } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } } */
-  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f01 \\\(x" 4 "optimized" { 
target { !aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { 
target { aarch64*-*-* } } } } */
+  /* { dg-final { scan-ltrans-tree-dump-not "f01 \\\(x" "optimized" { target { 
aarch64*-*-* } } } } */


The changes in this test look all wrong.  The differences are
i?86-*-* x86_64-*-* (which can support avx512f isa) vs. other targets (which
can't).
So, there is nothing aarch64 specific in there and { target x86_64-*-* }
is also incorrect.  It should be simply
{ target i?86-*-* x86_64-*-* }
vs.
{ target { ! { i?86-*-* x86_64-*-* } } }
(never sure about the ! syntaxes).
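
For reference, a minimal sketch of the two selector spellings being 
suggested (the scanned strings and counts below are placeholders taken from 
the surrounding diff, only the target selectors matter):

/* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 14 "optimized" { target { i?86-*-* x86_64-*-* } } } } */
/* { dg-final { scan-ltrans-tree-dump-times "f03 \\\(x" 10 "optimized" { target { ! { i?86-*-* x86_64-*-* } } } } } */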



Hmm, I think I understand what you are saying, but I'm not sure I agree. 
Before I enabled simdclone testing for aarch64, this test had no target 
selectors, so it checked the same thing for all 'simd clone test targets', 
which seem to be x86 and amdgcn:


@@ -4321,7 +4321,8 @@ proc check_effective_target_vect_simd_clones { } {
     return [check_cached_effective_target_indexed vect_simd_clones {
 	expr { (([istarget i?86-*-*] || [istarget x86_64-*-*])
 		&& [check_effective_target_avx512f])
-	       || [istarget amdgcn-*-*] }}]
+	       || [istarget amdgcn-*-*]
+	       || [istarget aarch64*-*-*] }}]
 }

I haven't checked what amdgcn does with this test, but I'd have to 
assume they were passing before? Though I'm not sure how amdgcn would 
pass the original:
 -  /* At gimplification time, we can't decide yet which function to 
call.  */

 -  /* { dg-final { scan-tree-dump-times "f04 \\\(x" 2 "gimple" } } */

I've added Andrew to the mail to see if he can comment on that. Either 
way, I'd suggest we either add scans per target with the expected value, 
or stick with my original change of aarch64 vs non-aarch64, as I think 
that better reflects the change of enabling this for aarch64 where it 
wasn't run before.


Re: veclower: improve selection of vector mode when lowering [PR 112787]

2023-12-20 Thread Andre Vieira (lists)

Thanks, fully agree with all comments.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic.cc (type_for_widest_vector_mode): Change function
to use the original vector type and check that the widest vector mode
has at most the same number of elements.
(get_compute_type): Pass the original vector type rather than the element
type to type_for_widest_vector_mode and remove the now obsolete check
for the number of elements.

On 07/12/2023 07:45, Richard Biener wrote:

On Wed, 6 Dec 2023, Andre Vieira (lists) wrote:


Hi,

This patch addresses the issue reported in PR target/112787 by improving the
compute type selection.  We do this by not considering types with more
elements than the type we are lowering since we'd reject such types anyway.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic.cc (type_for_widest_vector_mode): Add a parameter to
control maximum amount of elements in resulting vector mode.
(get_compute_type): Restrict vector_compute_type to a mode no wider
than the original compute type.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr112787.c: New test.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and
x86_64-pc-linux-gnu.

Is this OK for trunk?


@@ -1347,7 +1347,7 @@ optimize_vector_constructor (gimple_stmt_iterator
*gsi)
 TYPE, or NULL_TREE if none is found.  */

Can you improve the function comment?  It also doesn't mention OP ...

  static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree type, optab op, poly_int64 max_nunits = 0)
  {
machine_mode inner_mode = TYPE_MODE (type);
machine_mode best_mode = VOIDmode, mode;
@@ -1371,7 +1371,9 @@ type_for_widest_vector_mode (tree type, optab op)
FOR_EACH_MODE_FROM (mode, mode)
  if (GET_MODE_INNER (mode) == inner_mode
 && maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op, mode) != CODE_FOR_nothing)
+   && optab_handler (op, mode) != CODE_FOR_nothing
+   && (known_eq (max_nunits, 0)
+   || known_lt (GET_MODE_NUNITS (mode), max_nunits)))

max_nunits suggests that known_le would be appropriate instead.

I see the only other caller with similar "problems":

 }
   /* Can't use get_compute_type here, as supportable_convert_operation
  doesn't necessarily use an optab and needs two arguments.  */
   tree vec_compute_type
 = type_for_widest_vector_mode (TREE_TYPE (arg_type), mov_optab);
   if (vec_compute_type
   && VECTOR_MODE_P (TYPE_MODE (vec_compute_type))
   && subparts_gt (arg_type, vec_compute_type))

so please do not default to 0 but adjust this one as well.  It also
seems you then can remove the subparts_gt guards on both
vec_compute_type uses.

I think the API would be cleaner if we'd pass the original vector type
we can then extract TYPE_VECTOR_SUBPARTS from, avoiding the extra arg.

No?

Thanks,
Richard.

diff --git a/gcc/testsuite/gcc.target/aarch64/pr112787.c 
b/gcc/testsuite/gcc.target/aarch64/pr112787.c
new file mode 100644
index 
..caca1bf7ef447e4489b2c134d7200a4afd16763f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr112787.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=armv8-a+sve -mcpu=neoverse-n2" } */
+
+typedef int __attribute__((__vector_size__ (64))) vec;
+
+vec fn (vec a, vec b)
+{
+  return a + b;
+}
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+} 4 } } */
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 
a7e6cb87a5e31e3dd2a893ea5652eeebf8d5d214..c906eb3521ea01fd2bdfc89c3476d02c555cf8cc
 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1343,12 +1343,16 @@ optimize_vector_constructor (gimple_stmt_iterator *gsi)
   gsi_replace (gsi, g, false);
 }
 
-/* Return a type for the widest vector mode whose components are of type
-   TYPE, or NULL_TREE if none is found.  */
+/* Return a type for the widest vector mode with the same element type as
+   type ORIGINAL_VECTOR_TYPE, with at most the same number of elements as type
+   ORIGINAL_VECTOR_TYPE and that is supported by the target for an operation
+   with optab OP, or return NULL_TREE if none is found.  */
 
 static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree original_vector_type, optab op)
 {
+  gcc_assert (VECTOR_TYPE_P (original_vector_type));
+  tree type = TREE_TYPE (original_vector_type);
   machine_mode inner_mode = TYPE_MODE (type);
   machine_mode best_mode = VOIDmode, mode;
   poly_int64 best_nunits = 0;
@@ -1371,7 +1375,9 @@ type_for_widest_vector_mode (tree type, optab op)
   FOR_EACH_MODE_FROM (mode, mode)
 if (GET_MODE_INNER (mode) == inner_mode
&& maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op

omp: Fix simdclone arguments with veclen lower than simdlen [PR113040]

2023-12-20 Thread Andre Vieira (lists)

This patch fixes an issue introduced by:
commit ea4a3d08f11a59319df7b750a955ac613a3f438a
Author: Andre Vieira 
Date:   Wed Nov 1 17:02:41 2023 +

omp: Reorder call for TARGET_SIMD_CLONE_ADJUST

The problem was that after this patch we no longer added multiple 
arguments for vector arguments where the veclen was lower than the simdlen.
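
To illustrate the shape of the problem, here is a hypothetical sketch (not 
the PR's testcase): on x86_64 with AVX2, a 256-bit vector holds four 
doubles, so a simdlen-8 clone must receive its one vector parameter as two 
256-bit vector arguments.

/* Hypothetical example: with AVX2 the veclen for double is 4, so the
   simdlen-8 clone of f must take (and return) two V4DF vectors; the bug
   meant only one vector argument was recorded in the clone's type.  */
#pragma omp declare simd simdlen(8) notinbranch
double f (double x);

void
g (double *restrict a, double *restrict b, int n)
{
  for (int i = 0; i < n; ++i)
    b[i] = f (a[i]);
}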


gcc/ChangeLog:

* omp-simd-clone.cc (simd_clone_adjust_argument_types): Add multiple
vector arguments where simdlen is larger than veclen.

Bootstrapped and regression tested on x86_64-pc-linux-gnu and 
aarch64-unknown-linux-gnu.


OK for trunk?

PS: I'm struggling to add a testcase for this; the dumps don't show the 
simdclone prototype, and I can't easily create a run-test for this as it 
requires glibc.  The only option is a very flaky assembly scan test to 
check whether it writes to ymm4 (i.e. that it is passing enough 
parameters), but I haven't added one because I don't think that's a good 
idea.
PPS: maybe we ought to print the simdclone prototype when passing 
-fdump-ipa-simdclone?

diff --git a/gcc/omp-simd-clone.cc b/gcc/omp-simd-clone.cc
index 
3fbe428125243bc02bd58f6e50ac773e8df8..5151fef3bcdaa76802184df43ba13b8709645fd4
 100644
--- a/gcc/omp-simd-clone.cc
+++ b/gcc/omp-simd-clone.cc
@@ -781,6 +781,7 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
   struct cgraph_simd_clone *sc = node->simdclone;
   unsigned i, k;
   poly_uint64 veclen;
+  auto_vec<tree> new_params;
 
   for (i = 0; i < sc->nargs; ++i)
 {
@@ -798,9 +799,11 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
   switch (sc->args[i].arg_type)
{
default:
+ new_params.safe_push (parm_type);
  break;
case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_CONSTANT_STEP:
case SIMD_CLONE_ARG_TYPE_LINEAR_UVAL_VARIABLE_STEP:
+ new_params.safe_push (parm_type);
  if (node->definition)
sc->args[i].simd_array
  = create_tmp_simd_array (IDENTIFIER_POINTER (DECL_NAME (parm)),
@@ -828,6 +831,9 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
  else
vtype = build_vector_type (parm_type, veclen);
  sc->args[i].vector_type = vtype;
+ k = vector_unroll_factor (sc->simdlen, veclen);
+ for (unsigned j = 0; j < k; j++)
+   new_params.safe_push (vtype);
 
  if (node->definition)
sc->args[i].simd_array
@@ -893,22 +899,8 @@ simd_clone_adjust_argument_types (struct cgraph_node *node)
last_parm_void = true;
 
   gcc_assert (TYPE_ARG_TYPES (TREE_TYPE (node->decl)));
-  for (i = 0; i < sc->nargs; i++)
-   {
- tree ptype;
- switch (sc->args[i].arg_type)
-   {
-   default:
- ptype = sc->args[i].orig_type;
- break;
-   case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_CONSTANT_STEP:
-   case SIMD_CLONE_ARG_TYPE_LINEAR_VAL_VARIABLE_STEP:
-   case SIMD_CLONE_ARG_TYPE_VECTOR:
- ptype = sc->args[i].vector_type;
- break;
-   }
- new_arg_types = tree_cons (NULL_TREE, ptype, new_arg_types);
-   }
+  for (i = 0; i < new_params.length (); i++)
+   new_arg_types = tree_cons (NULL_TREE, new_params[i], new_arg_types);
   new_reversed = nreverse (new_arg_types);
   if (last_parm_void)
{


Re: [PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-20 Thread Andre Vieira (lists)
Squashed the definition of and changes to predicated_doloop_end_internal 
and dlstp*_insn into this patch, to make sure the first patch builds 
independently.


On 18/12/2023 11:53, Andre Vieira wrote:


Reworked Stam's patch after comments in:
https://gcc.gnu.org/pipermail/gcc-patches/2023-December/640362.html

The original gcc ChangeLog remains unchanged, but I did split up some tests so
here is the testsuite ChangeLog.


gcc/testsuite/ChangeLog:

* gcc.target/arm/lob.h: Update framework.
* gcc.target/arm/lob1.c: Likewise.
* gcc.target/arm/lob6.c: Likewise.
* gcc.target/arm/mve/dlstp-compile-asm.c: New test.
* gcc.target/arm/mve/dlstp-int16x8.c: New test.
* gcc.target/arm/mve/dlstp-int16x8-run.c: New test.
* gcc.target/arm/mve/dlstp-int32x4.c: New test.
* gcc.target/arm/mve/dlstp-int32x4-run.c: New test.
* gcc.target/arm/mve/dlstp-int64x2.c: New test.
* gcc.target/arm/mve/dlstp-int64x2-run.c: New test.
* gcc.target/arm/mve/dlstp-int8x16.c: New test.
* gcc.target/arm/mve/dlstp-int8x16-run.c: New test.
* gcc.target/arm/mve/dlstp-invalid-asm.c: New test.
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 
2f5ca79ed8ddd647b212782a0454ee4fefc07257..4f164c547406c43219900c111401540c7ef9d7d1
 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
 extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
 extern bool arm_q_bit_access (void);
 extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern rtx arm_attempt_dlstp_transform (rtx);
 #ifdef RTX_CODE
 enum reg_class
 arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 
0c0cb14a8a4f043357b8acd7042a9f9386af1eb1..1ee72bcb7ec4bea5feea8453ceef7702b0088a73
 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const 
arm_attribute_table[] =
 #undef TARGET_HAVE_CONDITIONAL_EXECUTION
 #define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
 
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
 #undef TARGET_LEGITIMATE_CONSTANT_P
 #define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
 
@@ -34483,19 +34489,1147 @@ arm_invalid_within_doloop (const rtx_insn *insn)
 }
 
 bool
-arm_target_insn_ok_for_lob (rtx insn)
+arm_target_bb_ok_for_lob (basic_block bb)
 {
-  basic_block bb = BLOCK_FOR_INSN (insn);
   /* Make sure the basic block of the target insn is a simple latch
  having as single predecessor and successor the body of the loop
  itself.  Only simple loops with a single basic block as body are
  supported for 'low over head loop' making sure that LE target is
  above LE itself in the generated code.  */
-
   return single_succ_p (bb)
-&& single_pred_p (bb)
-&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src
-&& contains_no_active_insn_p (bb);
+&& single_pred_p (bb)
+&& single_succ_edge (bb)->dest == single_pred_edge (bb)->src;
+}
+
+/* Utility function: Given a VCTP or a VCTP_M insn, return the number of MVE
+   lanes based on the machine mode being used.  */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+  rtx insn_set = single_set (insn);
+  if (insn_set
+  && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+  && (XINT (SET_SRC (insn_set), 1) == VCTP
+ || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+{
+  machine_mode mode = GET_MODE (SET_SRC (insn_set));
+  return (VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+? GET_MODE_NUNITS (mode) : 0;
+}
+  return 0;
+}
+
+/* Check if INSN requires the use of the VPR reg, if it does, return the
+   sub-rtx of the VPR reg.  The TYPE argument controls whether
+   this function should:
+   * For TYPE == 0, check all operands, including the OUT operands,
+ and return the first occurrence of the VPR reg.
+   * For TYPE == 1, only check the input operands.
+   * For TYPE == 2, only check the output operands.
+   (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn, unsigned int type = 0)
+{
+  gcc_assert (type < 3);
+  if (!NONJUMP_INSN_P (insn))
+return NULL_RTX;
+
+  bool requires_vpr;
+  extract_constrain_insn (insn);
+  int n_operands = recog_data.n_operands;
+  if (recog_data.n_alternatives == 0)
+return NULL_RTX;
+
+  /* Fill in recog_op_alt with information about the constraints of
+ this insn.  */
+  preprocess_constraints (insn);
+
+  for (int op = 0; op < n_operands; op++)
+{
+  requires_vpr = true;
+  if (type == 1 && 

Re: [PATCH 1/2] arm: Add define_attr to create a mapping between MVE predicated and unpredicated insns

2023-12-20 Thread Andre Vieira (lists)
Reworked patch after Richard's comments and moved 
predicated_doloop_end_internal and dlstp*_insn to the next patch in the 
series to make sure this one builds on its own.


On 18/12/2023 11:53, Andre Vieira wrote:


Re-sending Stam's first patch, same as:
https://gcc.gnu.org/pipermail/gcc-patches/2023-November/635301.html

Hopefully patchworks can pick this up :)
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h
index 
a9c2752c0ea5ecd4597ded254e9426753ac0a098..f0b01b7461f883994a0be137cb6cbf079d54618b
 100644
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -2375,6 +2375,21 @@ extern int making_const_table;
   else if (TARGET_THUMB1)  \
 thumb1_final_prescan_insn (INSN)
 
+/* These defines are useful to refer to the value of the mve_unpredicated_insn
+   insn attribute.  Note that, because these use the get_attr_* function, these
+   will change recog_data if (INSN) isn't current_insn.  */
+#define MVE_VPT_PREDICABLE_INSN_P(INSN)
\
+  (recog_memoized (INSN) >= 0  \
+   && get_attr_mve_unpredicated_insn (INSN) != CODE_FOR_nothing)
+
+#define MVE_VPT_PREDICATED_INSN_P(INSN)
\
+  (MVE_VPT_PREDICABLE_INSN_P (INSN)\
+   && recog_memoized (INSN) != get_attr_mve_unpredicated_insn (INSN))
+
+#define MVE_VPT_UNPREDICATED_INSN_P(INSN)  \
+  (MVE_VPT_PREDICABLE_INSN_P (INSN)\
+   && recog_memoized (INSN) == get_attr_mve_unpredicated_insn (INSN))
+
 #define ARM_SIGN_EXTEND(x)  ((HOST_WIDE_INT)   \
   (HOST_BITS_PER_WIDE_INT <= 32 ? (unsigned HOST_WIDE_INT) (x) \
   : ((((unsigned HOST_WIDE_INT)(x)) & (unsigned HOST_WIDE_INT) 0xffffffff) |\
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 
07eaf06cdeace750fe1c7d399deb833ef5fc2b66..296212be33ffe6397b05491d8854d2a59f7c54df
 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -124,6 +124,12 @@ (define_attr "fpu" "none,vfp"
 ; and not all ARM insns do.
 (define_attr "predicated" "yes,no" (const_string "no"))
 
+; An attribute that encodes the CODE_FOR_<insn> of the MVE VPT unpredicated
+; version of a VPT-predicated instruction.  For unpredicated instructions
+; that are predicable, encode the same pattern's CODE_FOR_<insn> as a way to
+; encode that it is a predicable instruction.
+(define_attr "mve_unpredicated_insn" "" (symbol_ref "CODE_FOR_nothing"))
+
 ; LENGTH of an instruction (in bytes)
 (define_attr "length" ""
   (const_int 4))
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 
a980353810166312d5bdfc8ad58b2825c910d0a0..5ea2d9e866891bdb3dc73fcf6cbd6cdd2f989951
 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2305,6 +2305,7 @@ (define_int_attr simd32_op [(UNSPEC_QADD8 "qadd8") 
(UNSPEC_QSUB8 "qsub8")
 
 (define_int_attr mmla_sfx [(UNSPEC_MATMUL_S "s8") (UNSPEC_MATMUL_U "u8")
   (UNSPEC_MATMUL_US "s8")])
+
 ;;MVE int attribute.
 (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
   (VREV16Q_U "u") (VMVNQ_N_S "s") (VMVNQ_N_U "u")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 
b0d3443da9cee991193d390200738290806a1e69..b1862d7977e91605cd971e634105bed3fa6e75cb
 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -17,7 +17,7 @@
 ;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.
 
-(define_insn "*mve_mov"
+(define_insn "mve_mov"
   [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w   , w,   
r,Ux,w")
(match_operand:MVE_types 1 "general_operand"  " 
w,r,w,DnDm,UxUi,r,w, Ul"))]
   "TARGET_HAVE_MVE || TARGET_HAVE_MVE_FLOAT"
@@ -81,18 +81,27 @@ (define_insn "*mve_mov"
   return "";
 }
 }
-  [(set_attr "type" 
"mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_store,mve_load")
+   [(set_attr_alternative "mve_unpredicated_insn" [(symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_nothing")
+  (symbol_ref 
"CODE_FOR_nothing")
+  (symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_nothing")
+  (symbol_ref 
"CODE_FOR_mve_mov")
+  (symbol_ref 
"CODE_FOR_nothing")])
+   (set_attr "type" 
"mve_move,mve_move,mve_move,mve_move,mve_load,multiple,mve_store,mve_load")
(set_attr "length" "4,8,8,4,4,8,4,8")
(set_attr "thumb2_pool_range" "*,*,*,*,1018,*,*,*")
(set_attr "neg_pool_range" "*,*,*,*,996,*,*,*")])
 
-(define_insn "*mve_vdup"
+(define_insn "mve_vdup"
   

Re: [PATCH] aarch64: enable mixed-types for aarch64 simdclones

2023-12-12 Thread Andre Vieira (lists)




On 11/12/2023 21:42, Thomas Schwinge wrote:

Hi Andre!

On 2023-10-16T16:03:26+0100, "Andre Vieira (lists)" 
 wrote:

Just a minor update to the patch, I had missed the libgomp testsuite, so
had to make some adjustments there too.


Unfortunately, there appear to be a number of DejaGnu directive errors in
your test case changes -- do you not see those in your testing?


I hadn't seen those... I wonder whether they don't show up if you do 
dg-cmp-results with just one -v. I have binned the build, but I'll rerun 
it and double-check; I may need to use '-v -v' instead.


Thanks for letting me know.
..., and the following change also doesn't look quite right:



--- a/libgomp/testsuite/libgomp.fortran/declare-simd-1.f90
+++ b/libgomp/testsuite/libgomp.fortran/declare-simd-1.f90
@@ -1,5 +1,5 @@
  ! { dg-do run { target vect_simd_clones } }
-! { dg-options "-fno-inline" }
+! { dg-options "-fno-inline -cpp -D__aarch64__" }




Yeah, that needs a target selector. Thanks!


[RFC] aarch64: Add support for __BitInt

2024-01-10 Thread Andre Vieira (lists)

Hi,

This patch is still work in progress, but I am posting it to show a 
failure with the bitint-7 test, where handle_stmt, called from 
lower_mergeable_stmt, ICEs because the idx (3) is out of range for 
__BitInt(135) with a limb_prec of 64.
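
For concreteness, a sketch of the arithmetic involved (limb numbers per 
the description above):

/* With a 64-bit limb, _BitInt(135) needs (135 + 63) / 64 = 3 limbs,
   so valid limb indices are 0, 1 and 2; the ICE happens because the
   lowering code asks for idx 3.  */
_BitInt(135) x;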


I hacked gcc locally to work around this issue and still have one 
outstanding failure, so will look to resolve that failure before posting 
a new version.


Kind Regards,
Andre

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
a5a6b52730d6c5013346d128e89915883f1707ae..15fb0ece5256f25c2ca8bb5cb82fc61488d0393e
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6534,7 +6534,7 @@ aarch64_return_in_memory_1 (const_tree type)
   machine_mode ag_mode;
   int count;
 
-  if (!AGGREGATE_TYPE_P (type)
+  if (!(AGGREGATE_TYPE_P (type) || TREE_CODE (type) == BITINT_TYPE)
   && TREE_CODE (type) != COMPLEX_TYPE
   && TREE_CODE (type) != VECTOR_TYPE)
 /* Simple scalar types always returned in registers.  */
@@ -6618,6 +6618,10 @@ aarch64_function_arg_alignment (machine_mode mode, 
const_tree type,
 
   gcc_assert (TYPE_MODE (type) == mode);
 
+  if (TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return GET_MODE_ALIGNMENT (TImode);
+
   if (!AGGREGATE_TYPE_P (type))
 {
   /* The ABI alignment is the natural alignment of the type, without
@@ -21773,6 +21777,11 @@ aarch64_composite_type_p (const_tree type,
   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
 return true;
 
+  if (type
+  && TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return true;
+
   if (mode == BLKmode
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
@@ -28265,6 +28274,29 @@ aarch64_excess_precision (enum excess_precision_type 
type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+   Return true if _BitInt(N) is supported and fill its details into *INFO.  */
+bool
+aarch64_bitint_type_info (int n, struct bitint_info *info)
+{
+  if (n <= 8)
+info->limb_mode = QImode;
+  else if (n <= 16)
+info->limb_mode = HImode;
+  else if (n <= 32)
+info->limb_mode = SImode;
+  else
+info->limb_mode = DImode;
+
+  if (n > 128)
+info->abi_limb_mode = TImode;
+  else
+info->abi_limb_mode = info->limb_mode;
+  info->big_endian = TARGET_BIG_END;
+  info->extended = false;
+  return true;
+}
+
 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
scheduled for speculative execution.  Reject the long-running division
and square-root instructions.  */
@@ -30374,6 +30406,9 @@ aarch64_run_selftests (void)
 #undef TARGET_C_EXCESS_PRECISION
 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
 
+#undef TARGET_C_BITINT_TYPE_INFO
+#define TARGET_C_BITINT_TYPE_INFO aarch64_bitint_type_info
+
 #undef  TARGET_EXPAND_BUILTIN
 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
 
diff --git a/libgcc/config/aarch64/t-softfp b/libgcc/config/aarch64/t-softfp
index 
2e32366f891361e2056c680b2e36edb1871c7670..4302ad52eb881825d0fb65b9ebd21031781781f5
 100644
--- a/libgcc/config/aarch64/t-softfp
+++ b/libgcc/config/aarch64/t-softfp
@@ -4,7 +4,8 @@ softfp_extensions := sftf dftf hftf bfsf
 softfp_truncations := tfsf tfdf tfhf tfbf dfbf sfbf hfbf
 softfp_exclude_libgcc2 := n
 softfp_extras += fixhfti fixunshfti floattihf floatuntihf \
-floatdibf floatundibf floattibf floatuntibf
+floatdibf floatundibf floattibf floatuntibf \
+fixtfbitint floatbitinttf
 
 TARGET_LIBGCC2_CFLAGS += -Wno-missing-prototypes
 


Re: [PATCH 8/8] aarch64: Add SVE support for simd clones [PR 96342]

2023-12-01 Thread Andre Vieira (lists)




On 29/11/2023 17:01, Richard Sandiford wrote:

"Andre Vieira (lists)"  writes:

Rebased, no major changes, still needs review.

On 30/08/2023 10:19, Andre Vieira (lists) via Gcc-patches wrote:

This patch finalizes adding support for the generation of SVE simd
clones when no simdlen is provided, following the ABI rules where the
widest data type determines the minimum amount of elements in a length
agnostic vector.

gcc/ChangeLog:

      * config/aarch64/aarch64-protos.h (add_sve_type_attribute):
Declare.
  * config/aarch64/aarch64-sve-builtins.cc (add_sve_type_attribute):
Make
  visibility global.
  * config/aarch64/aarch64.cc (aarch64_fntype_abi): Ensure SVE ABI is
  chosen over SIMD ABI if a SVE type is used in return or arguments.
  (aarch64_simd_clone_compute_vecsize_and_simdlen): Create VLA simd
clone
  when no simdlen is provided, according to ABI rules.
  (aarch64_simd_clone_adjust): Add '+sve' attribute to SVE simd clones.
  (aarch64_simd_clone_adjust_ret_or_param): New.
  (TARGET_SIMD_CLONE_ADJUST_RET_OR_PARAM): Define.
  * omp-simd-clone.cc (simd_clone_mangle): Print 'x' for VLA simdlen.
  (simd_clone_adjust): Adapt safelen check to be compatible with VLA
  simdlen.

gcc/testsuite/ChangeLog:

  * c-c++-common/gomp/declare-variant-14.c: Adapt aarch64 scan.
  * gfortran.dg/gomp/declare-variant-14.f90: Likewise.
  * gcc.target/aarch64/declare-simd-1.c: Remove warning checks where no
  longer necessary.
  * gcc.target/aarch64/declare-simd-2.c: Add SVE clone scan.


diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
60a55f4bc1956786ea687fc7cad7ec9e4a84e1f0..769d637f63724a7f0044f48f3dd683e0fb46049c
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1005,6 +1005,8 @@ namespace aarch64_sve {
  #ifdef GCC_TARGET_H
bool verify_type_context (location_t, type_context_kind, const_tree, bool);
  #endif
+ void add_sve_type_attribute (tree, unsigned int, unsigned int,
+ const char *, const char *);
  }
  
  extern void aarch64_split_combinev16qi (rtx operands[3]);

diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 
161a14edde7c9fb1b13b146cf50463e2d78db264..6f99c438d10daa91b7e3b623c995489f1a8a0f4c
 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -569,14 +569,16 @@ static bool reported_missing_registers_p;
  /* Record that TYPE is an ABI-defined SVE type that contains NUM_ZR SVE 
vectors
 and NUM_PR SVE predicates.  MANGLED_NAME, if nonnull, is the ABI-defined
 mangling of the type.  ACLE_NAME is the  name of the type.  */
-static void
+void
  add_sve_type_attribute (tree type, unsigned int num_zr, unsigned int num_pr,
const char *mangled_name, const char *acle_name)
  {
tree mangled_name_tree
  = (mangled_name ? get_identifier (mangled_name) : NULL_TREE);
+  tree acle_name_tree
+= (acle_name ? get_identifier (acle_name) : NULL_TREE);
  
-  tree value = tree_cons (NULL_TREE, get_identifier (acle_name), NULL_TREE);

+  tree value = tree_cons (NULL_TREE, acle_name_tree, NULL_TREE);
value = tree_cons (NULL_TREE, mangled_name_tree, value);
value = tree_cons (NULL_TREE, size_int (num_pr), value);
value = tree_cons (NULL_TREE, size_int (num_zr), value);
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
37507f091c2a6154fa944c3a9fad6a655ab5d5a1..cb0947b18c6a611d55579b5b08d93f6a4a9c3b2c
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4080,13 +4080,13 @@ aarch64_takes_arguments_in_sve_regs_p (const_tree 
fntype)
  static const predefined_function_abi &
  aarch64_fntype_abi (const_tree fntype)
  {
-  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
-return aarch64_simd_abi ();
-
if (aarch64_returns_value_in_sve_regs_p (fntype)
|| aarch64_takes_arguments_in_sve_regs_p (fntype))
  return aarch64_sve_abi ();
  
+  if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))

+return aarch64_simd_abi ();
+
return default_function_abi;
  }
  


I think we discussed this off-list later, but the change above shouldn't
be necessary.  aarch64_vector_pcs must not be attached to SVE PCS functions,
so the two cases should be mutually exclusive.


Yeah I had made the changes locally, but not updated the patch yet.



@@ -27467,7 +27467,7 @@ aarch64_simd_clone_compute_vecsize_and_simdlen (struct 
cgraph_node *node,
int num, bool explicit_p)
  {
tree t, ret_type;
-  unsigned int nds_elt_bits;
+  unsigned int nds_elt_bits, wds_elt_bits;
int count;
unsigned HOST_WIDE_INT const_simdlen;
  
@@ -27513,10 +27513,14 @@ aarch64_simd_clone_compute_vecsize_and

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-12-07 Thread Andre Vieira (lists)
Thanks for addressing my comments. I have reviewed this and the other 
patch before and they LGTM. I however do not have approval rights so you 
will need the OK from a maintainer.


Thanks for doing this :)

Andre

On 30/11/2023 12:55, Stamatis Markianos-Wright wrote:

Hi Andre,

Thanks for the comments, see latest revision attached.

On 27/11/2023 12:47, Andre Vieira (lists) wrote:

Hi Stam,

Just some comments.

+/* Recursively scan through the DF chain backwards within the basic block and
+   determine if any of the USEs of the original insn (or the USEs of the insns

s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :)

+   where thy were DEF-ed, etc., recursively) were affected by implicit VPT

remove recursively for the same reasons.

+  if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+    return NULL;
+  /* Look at the steps and swap around the rtx's if needed.  Error out if
+     one of them cannot be identified as constant.  */
+  if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+    return NULL;

Move the comment above the first if, as that is where the erroring out it 
describes happens.

Done


+  emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
 space after 'insn_note)'

@@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 -  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
   || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !CONST_INT_P (XEXP (inc_src, 1)))

Do we ever check that inc_src is negative? We used to check if it was 
-1, now we only check it's a constnat, but not a negative one, so I 
suspect this needs a:

|| INTVAL (XEXP (inc_src, 1)) >= 0

Good point. Done


@@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc 
*desc,

 case GE:
   /* Currently only GE tests against zero are supported.  */
   gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+  /* FALLTHRU */
+    case GTU:
   noloop = constm1_rtx;

I spent a very long time staring at this trying to understand why 
noloop = constm1_rtx for GTU, where I thought it should've been (count 
& (n-1)). For the current use of doloop it doesn't matter because ARM 
is the only target using it and you set desc->noloop_assumptions to 
null_rtx in 'arm_attempt_dlstp_transform' so noloop is never used. 
However, if a different target accepts this GTU pattern then this 
target agnostic code will do the wrong thing.  I suggest we either:
 - set noloop to what we think might be the correct value, which if 
you ask me should be 'count & (XEXP (condition, 1))',
 - or add a gcc_assert (GET_CODE (condition) != GTU); under the if 
(desc->noloop_assumption); part and document why.  I have a slight 
preference for the assert given otherwise we are adding code that we 
can't test.


Yea, that's true tbh. I've done the latter, but also separated out the 
"case GTU:" and added a comment, so that it's more clear that the noloop 
things aren't used in the only implemented GTU case (Arm)


Thank you :)



LGTM otherwise (but I don't have the power to approve this ;)).

Kind regards,
Andre

From: Stamatis Markianos-Wright 
Sent: Thursday, November 16, 2023 11:36 AM
To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; 
Richard Sandiford; Kyrylo Tkachov
Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated 
Low Overhead Loops


Pinging back to the top of reviewers' inboxes due to worry about Stage 1
End in a few days :)


See the last email for the latest version of the 2/2 patch. The 1/2
patch is A-Ok from Kyrill's earlier target-backend review.


On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:
One of the main reasons for reading the arm bits was to try to 
answer

the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:

This kind of loop fails because it is "desc->infinite"; then no
loop-doloop conversion is attempted at all (even for standa

Re: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2023-11-27 Thread Andre Vieira (lists)

Hi Stam,

Just some comments.

+/* Recursively scan through the DF chain backwards within the basic block and
+   determine if any of the USEs of the original insn (or the USEs of the insns

s/Recursively scan/Scan/ as you no longer recurse, thanks for that by the way :)

+   where thy were DEF-ed, etc., recursively) were affected by implicit VPT

remove recursively for the same reasons.

+  if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+    return NULL;
+  /* Look at the steps and swap around the rtx's if needed.  Error out if
+     one of them cannot be identified as constant.  */
+  if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+    return NULL;

Move the comment above the first if, as that is where the erroring out it 
describes happens.


+  emit_note_after ((enum insn_note)NOTE_KIND (insn), BB_END (body));
 space after 'insn_note)'

@@ -173,14 +176,14 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 -  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
   || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !CONST_INT_P (XEXP (inc_src, 1)))

Do we ever check that inc_src is negative? We used to check if it was 
-1, now we only check it's a constnat, but not a negative one, so I 
suspect this needs a:

|| INTVAL (XEXP (inc_src, 1)) >= 0

@@ -492,7 +519,8 @@ doloop_modify (class loop *loop, class niter_desc *desc,
 case GE:
   /* Currently only GE tests against zero are supported.  */
   gcc_assert (XEXP (condition, 1) == const0_rtx);
-
+  /* FALLTHRU */
+case GTU:
   noloop = constm1_rtx;

I spent a very long time staring at this trying to understand why noloop 
= constm1_rtx for GTU, where I thought it should've been (count & 
(n-1)). For the current use of doloop it doesn't matter because ARM is 
the only target using it and you set desc->noloop_assumptions to 
null_rtx in 'arm_attempt_dlstp_transform' so noloop is never used. 
However, if a different target accepts this GTU pattern then this target 
agnostic code will do the wrong thing.  I suggest we either:
 - set noloop to what we think might be the correct value, which if you 
ask me should be 'count & (XEXP (condition, 1))',
 - or add a gcc_assert (GET_CODE (condition) != GTU); under the if 
(desc->noloop_assumption); part and document why.  I have a slight 
preference for the assert given otherwise we are adding code that we 
can't test.


LGTM otherwise (but I don't have the power to approve this ;)).

Kind regards,
Andre

From: Stamatis Markianos-Wright 
Sent: Thursday, November 16, 2023 11:36 AM
To: Stamatis Markianos-Wright via Gcc-patches; Richard Earnshaw; Richard 
Sandiford; Kyrylo Tkachov
Subject: [PING][PATCH 2/2] arm: Add support for MVE Tail-Predicated Low 
Overhead Loops


Pinging back to the top of reviewers' inboxes due to worry about Stage 1
End in a few days :)


See the last email for the latest version of the 2/2 patch. The 1/2
patch is A-Ok from Kyrill's earlier target-backend review.


On 10/11/2023 12:41, Stamatis Markianos-Wright wrote:


On 06/11/2023 17:29, Stamatis Markianos-Wright wrote:


On 06/11/2023 11:24, Richard Sandiford wrote:

Stamatis Markianos-Wright  writes:

One of the main reasons for reading the arm bits was to try to answer
the question: if we switch to a downcounting loop with a GE
condition,
how do we make sure that the start value is not a large unsigned
number that is interpreted as negative by GE?  E.g. if the loop
originally counted up in steps of N and used an LTU condition,
it could stop at a value in the range [INT_MAX + 1, UINT_MAX].
But the loop might never iterate if we start counting down from
most values in that range.

Does the patch handle that?

So AFAICT this is actually handled in the generic code in
`doloop_valid_p`:

This kind of loop fails because it is "desc->infinite"; then no
loop-doloop conversion is attempted at all (even for standard
dls/le loops).

Thanks to that check I haven't been able to trigger anything like the
behaviour you describe; do you think the doloop_valid_p checks are
robust enough?

The loops I was thinking of are provably not infinite though. E.g.:

   for (unsigned int i = 0; i < UINT_MAX - 100; ++i)
 ...

is known to terminate.  And doloop conversion is safe with the normal
count-down-by-1 approach, so I don't think current code would need
to reject it.  I.e. a conversion to:

   unsigned int i = UINT_MAX - 101;
   do
 ...
   while (--i != ~0U);

would be safe, but a conversion to:

   int i = UINT_MAX - 101;
   do
 ...
   while ((i -= step, i > 0));

wouldn't, 

Re: [RFC] vect: disable multiple calls of poly simdclones

2023-11-27 Thread Andre Vieira (lists)




On 06/11/2023 07:52, Richard Biener wrote:

On Fri, 3 Nov 2023, Andre Vieira (lists) wrote:


Hi,

The current codegen code to support VFs that are multiples of a simdclone
simdlen relies on BIT_FIELD_REF to create multiple input vectors.  This does
not work for non-constant simdclones, so we should disable using such clones
when the VF is a multiple of the non-constant simdlen until we change the
codegen to support those.

Enabling SVE simdclone support will cause ICEs if the vectorizer decides to
use an SVE simdclone with a VF that is larger than the simdlen. I'll be away
for the next two weeks, so I can't really discuss this further.
I initially tried to solve the problem, but the way
vectorizable_simd_clone_call is structured doesn't make it easy right now
to replace BIT_FIELD_REF with the poly-suitable solution of using
unpack_{hi,lo}.


I think it should be straight-forward to use unpack_{even,odd} (it's
even/odd for VLA, right?  If lo/hi would be possible then doing
BIT_FIELD_REF would be, too?  Also you need to have multiple stages
of unpack/pack when the factor is more than 2).

There's plenty of time even during stage3 to address this.

At least your patch should have come with a testcase (or two).


Yeah I didn't add one as it didn't trigger on AArch64 without my two 
outstanding aarch64 simdclone patches.


Is there a bugreport tracking this issue?  It should affect GCN as well
I guess.


No, since I can't trigger them yet on trunk until the reviews on my 
target specific patches are done and they are committed.


I don't have a GCN backend lying around but I suspect GCN doesn't use 
poly simdlen simdclones yet either... I haven't checked. The issue 
triggers for aarch64 when trying to generate SVE simdclones for 
functions with mixed types.  I'll give the unpack thing a go locally.


veclower: improve selection of vector mode when lowering [PR 112787]

2023-12-06 Thread Andre Vieira (lists)

Hi,

This patch addresses the issue reported in PR target/112787 by improving the
compute type selection.  We do this by not considering types with more
elements than the type we are lowering since we'd reject such types anyway.

gcc/ChangeLog:

PR target/112787
* tree-vect-generic.cc (type_for_widest_vector_mode): Add a parameter to
control maximum amount of elements in resulting vector mode.
(get_compute_type): Restrict vector_compute_type to a mode no wider
than the original compute type.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr112787.c: New test.

Bootstrapped and regression tested on aarch64-unknown-linux-gnu and 
x86_64-pc-linux-gnu.


Is this OK for trunk?

Kind regards,
Andre Vieiradiff --git a/gcc/testsuite/gcc.target/aarch64/pr112787.c 
b/gcc/testsuite/gcc.target/aarch64/pr112787.c
new file mode 100644
index 
..caca1bf7ef447e4489b2c134d7200a4afd16763f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr112787.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -march=armv8-a+sve -mcpu=neoverse-n2" } */
+
+typedef int __attribute__((__vector_size__ (64))) vec;
+
+vec fn (vec a, vec b)
+{
+  return a + b;
+}
+
+/* { dg-final { scan-assembler-times {add\tv[0-9]+} 4 } } */
diff --git a/gcc/tree-vect-generic.cc b/gcc/tree-vect-generic.cc
index 
a7e6cb87a5e31e3dd2a893ea5652eeebf8d5d214..2dbf3c8f5f64f2623944110dbc371fe0944198f0
 100644
--- a/gcc/tree-vect-generic.cc
+++ b/gcc/tree-vect-generic.cc
@@ -1347,7 +1347,7 @@ optimize_vector_constructor (gimple_stmt_iterator *gsi)
TYPE, or NULL_TREE if none is found.  */
 
 static tree
-type_for_widest_vector_mode (tree type, optab op)
+type_for_widest_vector_mode (tree type, optab op, poly_int64 max_nunits = 0)
 {
   machine_mode inner_mode = TYPE_MODE (type);
   machine_mode best_mode = VOIDmode, mode;
@@ -1371,7 +1371,9 @@ type_for_widest_vector_mode (tree type, optab op)
   FOR_EACH_MODE_FROM (mode, mode)
 if (GET_MODE_INNER (mode) == inner_mode
&& maybe_gt (GET_MODE_NUNITS (mode), best_nunits)
-   && optab_handler (op, mode) != CODE_FOR_nothing)
+   && optab_handler (op, mode) != CODE_FOR_nothing
+   && (known_eq (max_nunits, 0)
+   || known_lt (GET_MODE_NUNITS (mode), max_nunits)))
   best_mode = mode, best_nunits = GET_MODE_NUNITS (mode);
 
   if (best_mode == VOIDmode)
@@ -1702,7 +1704,8 @@ get_compute_type (enum tree_code code, optab op, tree 
type)
  || optab_handler (op, TYPE_MODE (type)) == CODE_FOR_nothing))
 {
   tree vector_compute_type
-   = type_for_widest_vector_mode (TREE_TYPE (type), op);
+   = type_for_widest_vector_mode (TREE_TYPE (type), op,
+  TYPE_VECTOR_SUBPARTS (compute_type));
   if (vector_compute_type != NULL_TREE
  && subparts_gt (compute_type, vector_compute_type)
  && maybe_ne (TYPE_VECTOR_SUBPARTS (vector_compute_type), 1U)


Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 14:35, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 13:58, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the
simdlens might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review. I don't think cgraph_simd_clone_info is the right
place to pass down this information, since it is information about the
caller rather than the simdclone itself. What we are trying to achieve
here is enabling the vectorizer to accept or reject simdclones based on
the ISA we are vectorizing for. To distinguish between the SVE and
Advanced SIMD ISAs we use modes; I am not sure that's ideal either, but
it is what we currently use. So, to answer your earlier question: yes, I
can also pass down the mode if that's preferable.


Note cgraph_simd_clone_info has simdlen and we seem to check elsewhere
whether that's POLY or constant.  I wonder how aarch64_sve_mode_p
comes into play here which in the end classifies VLS SVE modes as
non-SVE?



Using -msve-vector-bits=128
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$4 = E_VNx4SImode
(gdb) p  TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo))
$5 = (tree) 0xf741c1b0
(gdb) p debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
128
(gdb) p aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo)))
$5 = true

and for reference without vls codegen:
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$1 = E_VNx4SImode
(gdb) p  debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
POLY_INT_CST [128, 128]

Having said that I believe that the USABLE targethook implementation for
aarch64 should also block other uses, like an Advanced SIMD mode being used as
input for a SVE VLS SIMDCLONE. The reason being that for instance 'half'
registers like VNx2SI are packed differently from V2SI.
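
A sketch of the layout difference being referred to (my reading of the
vector layouts, not part of the patch):

typedef int v2si __attribute__ ((vector_size (8)));

/* V2SI (Advanced SIMD): the two ints are contiguous, occupying bits
   [0,31] and [32,63] of a D register.
   VNx2SI (SVE 'half' vector): each int lives in the low half of a
   64-bit container, so with 128-bit SVE the two elements sit at bits
   [0,31] and [64,95] of a Z register; the values are not laid out
   interchangeably with V2SI.  */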

We could teach the vectorizer to support these, of course, but that requires
more work and is not extremely useful just yet. I'll add that extra check
to the patch once we agree on how to pass down the information we need. I'm
happy to use either the mode, or stmt_vec_info and extract the mode from it
like it does now.


As said, please pass down 'mode'.  But I wonder how to document it,
which mode is that supposed to be?  Any of result or any argument
mode that happens to be a vector?  I think that we might be able
to mix Advanced SIMD modes and SVE modes with -msve-vector-bits=128
in the same loop?

Are the simd clones you don't want to use with -msve-vector-bits=128
having constant simdlen?  If so why do you generate them in the first
place?


So this is where things get a bit confusing and I will write up some 
text for these cases to put in our ABI document (currently in Beta and 
in need of some tlc).


Our intended behaviour is for a 'declare simd' without a simdlen to 
generate simdclones for:
* Advanced SIMD 128 and 64-bit vectors, where possible (we don't allow 
for simdlen 1, Tamar fixed that in gcc recently),

* SVE VLA vectors.

Let me illustrate this with an example:

__attribute__ ((simd (notinbranch), const)) float cosf(float);

Should tell the compiler the following simd clones are available:
__ZGVnN4v_cosf 128-bit 4x4 float Advanced SIMD clone
__ZGVnN2v_cosf 64-bit  4x2 float Advanced SIMD clone
__ZGVsMxv_cosf [128, 128]-bit 4x4xN SVE SIMD clone

[To save you looking into the abi let me break this down, _ZGV is 
prefix, then 'n' or 's' picks between Advanced SIMD and SVE, 'N' or 'M' 
picks between Not Masked and Masked (SVE is always masked even if we ask 
for notinbranch), then a digit or 'x' picks between Vector Length or 
VLA, and after that you get a letter per argument, where v = vector mapped]


Regardless of -msve-vector-bits, however, the vectorizer (and any other 
part of the compiler) may assume that the VL of the VLA SVE clone is the 
one specified by -msve-vector-bits, which will still work as long as the 
clone is written in a VLA way.


If the attribute is used with a function definition rather than 
declaration, so:


__attribute__ ((simd (notinbranch), const)) float fn0(float a)
{
  return a + 1.0f;
}

the compiler should again generate the three simd clones:
__ZGVnN4v_fn0 128-bit 4x4 float Advanced SIMD clone
__ZGVnN2v_fn0 64-bit  4x2 float Advanced SIMD clone
__ZGVsMxv_fn0 [128, 128]-bit 4x4xN SVE SIMD

Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-02-01 Thread Andre Vieira (lists)




On 01/02/2024 07:19, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:


The patch didn't come with a testcase so it's really hard to tell
what goes wrong now and how it is fixed ...


My bad! I had a testcase locally but never added it...

However... now that I look at it, and having run it past Richard S, the 
codegen isn't 'wrong', but it does have the potential to lead to some 
pretty slow codegen, especially for inbranch simdclones, where it 
transforms the SVE predicate into an Advanced SIMD vector by inserting 
the elements one at a time...


An example of which can be seen if you do:

gcc -O3 -march=armv8-a+sve -msve-vector-bits=128  -fopenmp-simd t.c -S

with the following t.c:
#pragma omp declare simd simdlen(4) inbranch
int __attribute__ ((const)) fn5(int);

void fn4 (int *a, int *b, int n)
{
for (int i = 0; i < n; ++i)
b[i] = fn5(a[i]);
}

Now I do have to say, for our main usecase of libmvec we won't have any 
'inbranch' Advanced SIMD clones, so we avoid that issue... But of course 
that doesn't mean user-code will.


I'm going to remove this patch and run another regression test to see if it 
catches anything weird; if not, then I guess we do have the option to not 
use this patch and aim to solve the costing or codegen issue in GCC-15. We 
don't currently do any simdclone costing, and I don't have a clear 
suggestion for how to do it, given OpenMP has no mechanism that I know of 
to expose the speedup of a simdclone over its scalar variant. So how would 
we 'compare' a simdclone call, with its extra overhead of argument 
preparation, against scalar code? Though at least we could prefer a call 
to a different simdclone with less argument preparation. Anyway, I digress.


Other tests: these require aarch64-autovec-preference=2, so they worry me 
less...


gcc -O3 -march=armv8-a+sve -msve-vector-bits=128 --param 
aarch64-autovec-preference=2 -fopenmp-simd t.c -S


t.c:
#pragma omp declare simd simdlen(2) notinbranch
float __attribute__ ((const)) fn1(double);

void fn0 (float *a, float *b, int n)
{
for (int i = 0; i < n; ++i)
b[i] = fn1((double) a[i]);
}

#pragma omp declare simd simdlen(2) notinbranch
float __attribute__ ((const)) fn3(float);

void fn2 (float *a, double *b, int n)
{
for (int i = 0; i < n; ++i)
b[i] = (double) fn3(a[i]);
}


Richard.



That said, I wonder how we end up mixing things up in the first place.

Richard.






Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the simdlens
might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review. I don't think cgraph_simd_clone_info is the right 
place to pass down this information, since it is information about the 
caller rather than the simdclone itself. What we are trying to achieve 
here is enabling the vectorizer to accept or reject simdclones based on 
the ISA we are vectorizing for. To distinguish between the SVE and 
Advanced SIMD ISAs we use modes; I am not sure that's ideal either, but 
it is what we currently use. So, to answer your earlier question: yes, I 
can also pass down the mode if that's preferable.


Regards,
Andre


Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 13:58, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the
simdlens
might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review, I don't think cgraph_simd_clone_info is the right place
to pass down this information, since this is information about the caller
rather than the simdclone itself. What we are trying to achieve here is making
the vectorizer able to accept or reject simdclones based on the ISA we
are vectorizing for. To distinguish between SVE and Advanced SIMD ISAs we use
modes, I am also not sure that's ideal but it is what we currently use. So to
answer your earlier question, yes I can also pass down mode if that's
preferable.


Note cgraph_simd_clone_info has simdlen and we seem to check elsewhere
whether that's POLY or constant.  I wonder how aarch64_sve_mode_p
comes into play here which in the end classifies VLS SVE modes as
non-SVE?



Using -msve-vector-bits=128
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$4 = E_VNx4SImode
(gdb) p  TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo))
$5 = (tree) 0xf741c1b0
(gdb) p debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
128
(gdb) p aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo)))
$5 = true

and for reference without vls codegen:
(gdb) p TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))
$1 = E_VNx4SImode
(gdb) p  debug (TYPE_SIZE (STMT_VINFO_VECTYPE (stmt_vinfo)))
POLY_INT_CST [128, 128]

Having said that, I believe the USABLE targethook implementation for 
aarch64 should also block other uses, like an Advanced SIMD mode being 
used as input for an SVE VLS SIMDCLONE. The reason being that for 
instance 'half' registers like VNx2SI are packed differently from V2SI.

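To illustrate the packing difference (my own little-endian sketch, not 
something quoted from the ABI docs):

/* V2SI (Advanced SIMD):    two 32-bit elements packed contiguously in
                            the low 64 bits of a V register: [e0][e1]
   VNx2SI (SVE 'half'):     each 32-bit element occupies the low half
                            of a 64-bit container lane: [e0][--][e1][--]  */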

We could teach the vectorizer to support these of course, but that 
requires more work and is not extremely useful just yet. I'll add that 
extra check to the patch once we agree on how to pass down the 
information we need. Happy to use either mode, or stmt_vec_info and 
extract the mode from it like it does now.



Regards,
Andre





Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-01-31 Thread Andre Vieira (lists)




On 31/01/2024 14:03, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:




On 31/01/2024 12:13, Richard Biener wrote:

On Wed, 31 Jan 2024, Richard Biener wrote:


On Tue, 30 Jan 2024, Andre Vieira wrote:



This patch adds stmt_vec_info to TARGET_SIMD_CLONE_USABLE to make sure the
target can reject a simd_clone based on the vector mode it is using.
This is needed because for VLS SVE vectorization the vectorizer accepts
Advanced SIMD simd clones when vectorizing using SVE types because the
simdlens
might match.  This will cause type errors later on.

Other targets do not currently need to use this argument.


Can you instead pass down the mode?


Thinking about that again the cgraph_simd_clone info in the clone
should have sufficient information to disambiguate.  If it doesn't
then we should amend it.

Richard.


Hi Richard,

Thanks for the review, I don't think cgraph_simd_clone_info is the right place
to pass down this information, since this is information about the caller
rather than the simdclone itself. What we are trying to achieve here is making
the vectorizer able to accept or reject simdclones based on the ISA we
are vectorizing for. To distinguish between SVE and Advanced SIMD ISAs we use
modes, I am also not sure that's ideal but it is what we currently use. So to
answer your earlier question, yes I can also pass down mode if that's
preferable.


Note cgraph_simd_clone_info has simdlen and we seem to check elsewhere
whether that's POLY or constant.  I wonder how aarch64_sve_mode_p
comes into play here which in the end classifies VLS SVE modes as
non-SVE?


Maybe it's just a bit non-obvious as you key on mangling:

 static int
-aarch64_simd_clone_usable (struct cgraph_node *node)
+aarch64_simd_clone_usable (struct cgraph_node *node, stmt_vec_info stmt_vinfo)
 {
   switch (node->simdclone->vecsize_mangle)
     {
     case 'n':
       if (!TARGET_SIMD)
         return -1;
+      if (STMT_VINFO_VECTYPE (stmt_vinfo)
+          && aarch64_sve_mode_p (TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo))))
+        return -1;

?  What does 'n' mean?  It's documented as

   /* The mangling character for a given vector size.  This is used
      to determine the ISA mangling bit as specified in the Intel
      Vector ABI.  */
   unsigned char vecsize_mangle;


I'll update the comment, but yeah, 'n' is for Advanced SIMD, 's' is for SVE.


which is slightly misleading.


Re: [PATCH 1/3] vect: Pass stmt_vec_info to TARGET_SIMD_CLONE_USABLE

2024-02-26 Thread Andre Vieira (lists)




On 05/02/2024 09:56, Richard Biener wrote:

On Thu, 1 Feb 2024, Andre Vieira (lists) wrote:




On 01/02/2024 07:19, Richard Biener wrote:

On Wed, 31 Jan 2024, Andre Vieira (lists) wrote:


The patch didn't come with a testcase so it's really hard to tell
what goes wrong now and how it is fixed ...


My bad! I had a testcase locally but never added it...

However... now that I've looked at it and run it past Richard S, the 
codegen isn't 'wrong', but it does have the potential to lead to some 
pretty slow codegen, especially for inbranch simdclones, where it 
transforms the SVE predicate into an Advanced SIMD vector by inserting 
the elements one at a time...

An example of which can be seen if you do:

gcc -O3 -march=armv8-a+sve -msve-vector-bits=128  -fopenmp-simd t.c -S

with the following t.c:
#pragma omp declare simd simdlen(4) inbranch
int __attribute__ ((const)) fn5(int);

void fn4 (int *a, int *b, int n)
{
 for (int i = 0; i < n; ++i)
 b[i] = fn5(a[i]);
}

Now I do have to say, for our main use case of libmvec we won't have any
'inbranch' Advanced SIMD clones, so we avoid that issue... But of course that
doesn't mean user code will.


It seems to use SVE masks with vector(4) <signed-boolean:4> and the
ABI says the mask is vector(4) int.  You say that's because we choose
an Adv SIMD clone for the SVE VLS vector code (it calls _ZGVnM4v_fn5).

The vectorizer creates

   _44 = VEC_COND_EXPR <mask, { -1, ... }, { 0, ... }>;

and then vector lowering decomposes this.  That means the vectorizer
lacks a check that the target handles this VEC_COND_EXPR.

Of course I would expect that SVE with VLS vectors is able to
code generate this operation, so it's missing patterns in the end.
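
Spelling that out (a hand-written illustration of the lowering being 
described, not actual compiler dump output):

  /* The vectorizer emits a mask -> vector(4) int conversion for the
     inbranch clone's ABI-mandated mask argument:  */
  _44 = VEC_COND_EXPR <mask, { -1, -1, -1, -1 }, { 0, 0, 0, 0 }>;

  /* Lacking a pattern for this mode combination, vector lowering then
     decomposes it one element at a time, roughly:  */
  _45 = mask[0] ? -1 : 0;
  _46 = mask[1] ? -1 : 0;
  ...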

Richard.



What should we do for GCC-14? Going forward I think the right thing to 
do is to add these patterns. But I am not even going to try to do that 
right now; even though we can codegen for this, the result doesn't feel 
like it would ever be profitable, which means I'd rather not vectorize, 
or rather pick a different vector mode if possible.


This would be achieved with the change to the targethook. If I change 
the hook to take modes, using STMT_VINFO_VECTYPE (stmt_vinfo), is that 
OK for now?
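
At the call site that would look something like this (a sketch, assuming 
the existing badness-based clone selection loop in 
vectorizable_simd_clone_call; the exact context may differ):

  /* Pass the vector mode the loop is using down to the target.  */
  machine_mode vec_mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_vinfo));
  int target_badness = targetm.simd_clone.usable (n, vec_mode);
  if (target_badness < 0)
    continue;  /* Target rejected this clone; try the next candidate.  */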


Kind regards,
Andre


Re: [PATCH 2/2] aarch64: Add support for _BitInt

2024-02-27 Thread Andre Vieira (lists)

Hey,

Dropped the first patch and dealt with the comments above, hopefully I 
didn't miss any this time.


--

This patch adds support for C23's _BitInt for the AArch64 port when 
compiling for little endianness.  Big endianness requires further 
target-agnostic support and we therefore disable it for now.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (TARGET_C_BITINT_TYPE_INFO): Declare MACRO.
(aarch64_bitint_type_info): New function.
(aarch64_return_in_memory_1): Return large _BitInt's in memory.
(aarch64_function_arg_alignment): Adapt to correctly return the ABI
mandated alignment of _BitInt(N) where N > 128 as the alignment of
TImode.
(aarch64_composite_type_p): Return true for _BitInt(N), where N > 128.

libgcc/ChangeLog:

* config/aarch64/t-softfp (softfp_extras): Add floatbitinthf,
floatbitintbf, floatbitinttf and fixtfbitint.
* config/aarch64/libgcc-softfp.ver (GCC_14.0.0): Add __floatbitinthf,
__floatbitintbf, __floatbitinttf and __fixtfbitint.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/bitint-alignments.c: New test.
* gcc.target/aarch64/bitint-args.c: New test.
* gcc.target/aarch64/bitint-sizes.c: New test.
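
For reference, the version-script side of this looks roughly like the 
following (sketched from the ChangeLog entries above; the actual file 
contents may differ):

# libgcc/config/aarch64/libgcc-softfp.ver
GCC_14.0.0 {
  __fixtfbitint
  __floatbitintbf
  __floatbitinthf
  __floatbitinttf
}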


On 02/02/2024 14:46, Jakub Jelinek wrote:

On Thu, Jan 25, 2024 at 05:45:01PM +, Andre Vieira wrote:

This patch adds support for C23's _BitInt for the AArch64 port when compiling
for little endianness.  Big endianness requires further target-agnostic
support and we therefore disable it for now.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (TARGET_C_BITINT_TYPE_INFO): Declare MACRO.
(aarch64_bitint_type_info): New function.
(aarch64_return_in_memory_1): Return large _BitInt's in memory.
(aarch64_function_arg_alignment): Adapt to correctly return the ABI
mandated alignment of _BitInt(N) where N > 128 as the alignment of
TImode.
(aarch64_composite_type_p): Return true for _BitInt(N), where N > 128.

libgcc/ChangeLog:

* config/aarch64/t-softfp: Add fixtfbitint, floatbitinttf and
floatbitinthf to the softfp_extras variable to ensure the
runtime support is available for _BitInt.


I think this lacks some config/aarch64/t-whatever.ver
additions.
See PR113700 for some more details.
We want the support routines for binary floating point <-> _BitInt
conversions in both libgcc.a and libgcc_s.so.1 and exported from the latter
too at GCC_14.0.0 symver, while decimal floating point <-> _BitInt solely in
libgcc.a (as with all the huge dfp/bid stuff).

Jakub
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 16318bf925883ecedf9345e53fc0824a553b2747..9bd8d22f6edd9f6c77907ec383f9e8bf055cfb8b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6583,6 +6583,7 @@ aarch64_return_in_memory_1 (const_tree type)
   int count;
 
   if (!AGGREGATE_TYPE_P (type)
+      && TREE_CODE (type) != BITINT_TYPE
       && TREE_CODE (type) != COMPLEX_TYPE
       && TREE_CODE (type) != VECTOR_TYPE)
     /* Simple scalar types always returned in registers.  */
@@ -21895,6 +21896,11 @@ aarch64_composite_type_p (const_tree type,
   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
     return true;
 
+  if (type
+      && TREE_CODE (type) == BITINT_TYPE
+      && int_size_in_bytes (type) > 16)
+    return true;
+
   if (mode == BLKmode
       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
@@ -28400,6 +28406,42 @@ aarch64_excess_precision (enum excess_precision_type type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+   Return true if _BitInt(N) is supported and fill its details into *INFO.  */
+bool
+aarch64_bitint_type_info (int n, struct bitint_info *info)
+{
+  if (TARGET_BIG_END)
+    return false;
+
+  if (n <= 8)
+    info->limb_mode = QImode;
+  else if (n <= 16)
+    info->limb_mode = HImode;
+  else if (n <= 32)
+    info->limb_mode = SImode;
+  else if (n <= 64)
+    info->limb_mode = DImode;
+  else if (n <= 128)
+    info->limb_mode = TImode;
+  else
+    /* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
+       type {signed,unsigned} __int128[M] where M*128 >= N.  However, to be
+       able to use libgcc's implementation to support large _BitInt's we need
+       to use a LIMB_MODE that is no larger than 'long long'.  This is why we
+       use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
+       be TImode to ensure we are ABI compliant.  */
+    info->limb_mode = DImode;
+
+  if (n > 128)
+    info->abi_limb_mode = TImode;
+  else
+    info->abi_limb_mode = info->limb_mode;
+  info->big_endian = TARGET_BIG_END;
+  info->extended = false;
+  return true;
+}
+
 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
    scheduled for speculative 

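As a user-level illustration of the mapping the new hook encodes (a 
hypothetical example based on the AAPCS description quoted in the patch, 
not a testcase from it):

/* Little-endian aarch64 with this patch:  */
_BitInt(32)  a;  /* limb_mode == abi_limb_mode == SImode.  */
_BitInt(128) b;  /* limb_mode == abi_limb_mode == TImode.  */
_BitInt(200) c;  /* ABI type: __int128[2] (abi_limb_mode == TImode);
                    libgcc operates internally on DImode limbs.  */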