date:20230703

Re: [PATCH v1] RISC-V: Fix one bug for floating-point static frm

2023-07-03 Thread juzhe.zh...@rivai.ai

LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-07-04 13:50
To: gcc-patches
CC: juzhe.zhong; rdapp.gcc; jeffreyalaw; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Fix one bug for floating-point static frm
From: Pan Li 
 
This patch fixes one bug to align with the items of the spec below.
 
1. By default, the RVV floating-point will take dyn mode.
2. DYN is invalid in FRM register for RVV floating-point.
 
During mode switching, the function entry and exit will take DYN as
the frm mode.
 
Signed-off-by: Pan Li 
 
gcc/ChangeLog:
 
* config/riscv/riscv.cc (riscv_emit_mode_set): Avoid emit insn
when FRM_MODE_DYN.
(riscv_mode_entry): Take FRM_MODE_DYN as entry mode.
(riscv_mode_exit): Likewise for exit mode.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/base/float-point-frm-insert-6.c: New test.
---
gcc/config/riscv/riscv.cc |  6 ++--
.../riscv/rvv/base/float-point-frm-insert-6.c | 31 +++
2 files changed, 34 insertions(+), 3 deletions(-)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
 
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e4dc8115e69..f5fe910426e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7670,7 +7670,7 @@ riscv_emit_mode_set (int entity, int mode, int prev_mode,
emit_insn (gen_vxrmsi (gen_int_mode (mode, SImode)));
   break;
 case RISCV_FRM:
-  if (mode != FRM_MODE_NONE && mode != prev_mode)
+  if (mode != FRM_MODE_DYN && mode != prev_mode)
{
  rtx scaler = gen_reg_rtx (SImode);
  rtx imm = gen_int_mode (mode, SImode);
@@ -7774,7 +7774,7 @@ riscv_mode_entry (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
@@ -7791,7 +7791,7 @@ riscv_mode_exit (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
new file mode 100644
index 000..6d896e0953e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfadd_vv_f32m1_rm (vfloat32m1_t op1, vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfadd_vv_f32m1_rm (op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vv_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, vfloat32m1_t op2,
+ size_t vl) {
+  return __riscv_vfadd_vv_f32m1_m_rm(mask, op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vf_f32m1_rm(vfloat32m1_t op1, float32_t op2, size_t vl) {
+  return __riscv_vfadd_vf_f32m1_rm(op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vf_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, float32_t op2,
+ size_t vl) {
+  return __riscv_vfadd_vf_f32m1_m_rm(mask, op1, op2, 7, vl);
+}
+
+/* { dg-final { scan-assembler-times 
{vfadd\.v[vf]\s+v[0-9]+,\s*v[0-9]+,\s*[fav]+[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-not {fsrm\s+[ax][0-9]+,\s*[ax][0-9]+} } } */
-- 
2.34.1

[PATCH v1] RISC-V: Fix one bug for floating-point static frm

2023-07-03 Thread Pan Li via Gcc-patches

From: Pan Li 

This patch fixes one bug to align with the items of the spec below.

1. By default, the RVV floating-point will take dyn mode.
2. DYN is invalid in FRM register for RVV floating-point.

During mode switching, the function entry and exit will take DYN as
the frm mode.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_emit_mode_set): Avoid emit insn
when FRM_MODE_DYN.
(riscv_mode_entry): Take FRM_MODE_DYN as entry mode.
(riscv_mode_exit): Likewise for exit mode.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/float-point-frm-insert-6.c: New test.
---
 gcc/config/riscv/riscv.cc |  6 ++--
 .../riscv/rvv/base/float-point-frm-insert-6.c | 31 +++
 2 files changed, 34 insertions(+), 3 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e4dc8115e69..f5fe910426e 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7670,7 +7670,7 @@ riscv_emit_mode_set (int entity, int mode, int prev_mode,
emit_insn (gen_vxrmsi (gen_int_mode (mode, SImode)));
   break;
 case RISCV_FRM:
-  if (mode != FRM_MODE_NONE && mode != prev_mode)
+  if (mode != FRM_MODE_DYN && mode != prev_mode)
{
  rtx scaler = gen_reg_rtx (SImode);
  rtx imm = gen_int_mode (mode, SImode);
@@ -7774,7 +7774,7 @@ riscv_mode_entry (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
@@ -7791,7 +7791,7 @@ riscv_mode_exit (int entity)
 case RISCV_VXRM:
   return VXRM_MODE_NONE;
 case RISCV_FRM:
-  return FRM_MODE_NONE;
+  return FRM_MODE_DYN;
 default:
   gcc_unreachable ();
 }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c 
b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
new file mode 100644
index 000..6d896e0953e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-6.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64 -O3 -Wno-psabi" } */
+
+#include "riscv_vector.h"
+
+typedef float float32_t;
+
+vfloat32m1_t
+test_riscv_vfadd_vv_f32m1_rm (vfloat32m1_t op1, vfloat32m1_t op2, size_t vl) {
+  return __riscv_vfadd_vv_f32m1_rm (op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vv_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, vfloat32m1_t op2,
+size_t vl) {
+  return __riscv_vfadd_vv_f32m1_m_rm(mask, op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vf_f32m1_rm(vfloat32m1_t op1, float32_t op2, size_t vl) {
+  return __riscv_vfadd_vf_f32m1_rm(op1, op2, 7, vl);
+}
+
+vfloat32m1_t
+test_vfadd_vf_f32m1_m_rm(vbool32_t mask, vfloat32m1_t op1, float32_t op2,
+size_t vl) {
+  return __riscv_vfadd_vf_f32m1_m_rm(mask, op1, op2, 7, vl);
+}
+
+/* { dg-final { scan-assembler-times 
{vfadd\.v[vf]\s+v[0-9]+,\s*v[0-9]+,\s*[fav]+[0-9]+} 4 } } */
+/* { dg-final { scan-assembler-not {fsrm\s+[ax][0-9]+,\s*[ax][0-9]+} } } */
-- 
2.34.1

Re: [PATCH V4 1/4] rs6000: build constant via li;rotldi

2023-07-03 Thread Kewen.Lin via Gcc-patches

Hi Jeff,

on 2023/7/4 10:18, Jiufu Guo via Gcc-patches wrote:
> Hi,
> 
> If a constant is possible to be rotated to/from a positive or negative
> value from "li", then "li;rotldi" can be used to build the constant.
> 
> Compare with the previous version:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-June/621961.html
> This patch just did minor changes to the style and comments.
> 
> Bootstrap and regtest pass on ppc64{,le}.
> 
> Since the previous version is approved with conditions, this version
> explained the concern too.  If no objection, I would like to apply
> this patch to trunk.
> 
> 
> BR,
> Jeff (Jiufu)
> 
> gcc/ChangeLog:
> 
>   * config/rs6000/rs6000.cc (can_be_built_by_li_and_rotldi): New function.
>   (rs6000_emit_set_long_const): Call can_be_built_by_li_and_rotldi.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/powerpc/const-build.c: New test.
> ---
>  gcc/config/rs6000/rs6000.cc   | 47 +--
>  .../gcc.target/powerpc/const-build.c  | 57 +++
>  2 files changed, 98 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/const-build.c
> 
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index 42f49e4a56b..acc332acc05 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -10258,6 +10258,31 @@ rs6000_emit_set_const (rtx dest, rtx source)
>return true;
>  }
>  
> +/* Check if value C can be built by 2 instructions: one is 'li', another is
> +   rotldi.

Nit: different style, li is with "'" but rotldi isn't.

> +
> +   If so, *SHIFT is set to the shift operand of rotldi(rldicl), and *MASK
> +   is set to the mask operand of rotldi(rldicl), and return true.
> +   Return false otherwise.  */
> +
> +static bool
> +can_be_built_by_li_and_rotldi (HOST_WIDE_INT c, int *shift,
> +HOST_WIDE_INT *mask)
> +{
> +  /* If C or ~C contains at least 49 successive zeros, then C can be rotated
> + to/from a positive or negative value that 'li' is able to load.  */
> +  int n;
> +  if (can_be_rotated_to_lowbits (c, 15, &n)
> +  || can_be_rotated_to_lowbits (~c, 15, &n))
> +{
> +  *mask = HOST_WIDE_INT_M1;
> +  *shift = HOST_BITS_PER_WIDE_INT - n;
> +  return true;
> +}
> +
> +  return false;
> +}
> +
>  /* Subroutine of rs6000_emit_set_const, handling PowerPC64 DImode.
> Output insns to set DEST equal to the constant C as a series of
> lis, ori and shl instructions.  */
> @@ -10266,15 +10291,14 @@ static void
>  rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
>  {
>rtx temp;
> +  int shift;
> +  HOST_WIDE_INT mask;
>HOST_WIDE_INT ud1, ud2, ud3, ud4;
>  
>ud1 = c & 0xffff;
> -  c = c >> 16;
> -  ud2 = c & 0xffff;
> -  c = c >> 16;
> -  ud3 = c & 0xffff;
> -  c = c >> 16;
> -  ud4 = c & 0xffff;
> +  ud2 = (c >> 16) & 0xffff;
> +  ud3 = (c >> 32) & 0xffff;
> +  ud4 = (c >> 48) & 0xffff;
>  
>if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000))
>|| (ud4 == 0 && ud3 == 0 && ud2 == 0 && ! (ud1 & 0x8000)))
> @@ -10305,6 +10329,17 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT 
> c)
>emit_move_insn (dest, gen_rtx_XOR (DImode, temp,
>GEN_INT ((ud2 ^ 0xffff) << 16)));
>  }
> +  else if (can_be_built_by_li_and_rotldi (c, &shift, &mask))
> +{
> +  temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
> +  unsigned HOST_WIDE_INT imm = (c | ~mask);
> +  imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
> +
> +  emit_move_insn (temp, GEN_INT (imm));
> +  if (shift != 0)
> + temp = gen_rtx_ROTATE (DImode, temp, GEN_INT (shift));
> +  emit_move_insn (dest, temp);
> +}
>else if (ud3 == 0 && ud4 == 0)
>  {
>temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
> diff --git a/gcc/testsuite/gcc.target/powerpc/const-build.c 
> b/gcc/testsuite/gcc.target/powerpc/const-build.c
> new file mode 100644
> index 000..69b37e2bb53
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/const-build.c
> @@ -0,0 +1,57 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -save-temps" } */
> +/* { dg-require-effective-target has_arch_ppc64 } */
> +
> +/* Verify that two instructions are sucessfully used to build constants.

s/sucessfully/successfully/

> +   One insn is li or lis, another is rotate: rldicl, rldicr or rldic.  */

Nit: This patch is for insn li + insn rldicl only; you probably want to keep
the comments consistent with that.

The others look good to me, thanks!

Segher had one question on "~c" before; I saw you had explained it and it
makes sense to me, but in case he has more questions I'd defer the final
approval to him.

BR,
Kewen

RE: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Li, Pan2 via Gcc-patches

Hi Robin,

Just revert this patch, it reports some weird illegal instr, I may need more 
time for this.

Pan

-Original Message-
From: Li, Pan2 
Sent: Monday, July 3, 2023 11:00 PM
To: Robin Dapp ; juzhe.zh...@rivai.ai; gcc-patches 

Cc: jeffreyalaw ; Wang, Yanzhang 
; kito.cheng 
Subject: RE: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

Sure, every change needs testing and I will pay attention to this in future.

Pan

-Original Message-
From: Robin Dapp  
Sent: Monday, July 3, 2023 10:57 PM
To: Li, Pan2 ; juzhe.zh...@rivai.ai; gcc-patches 

Cc: rdapp@gmail.com; jeffreyalaw ; Wang, Yanzhang 
; kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

> Sorry for inconvenient, still working on fix it. If urgent I can
> revert this change to unblock your work ASAP.

I'm not blocked by this, thanks, just wanted to document it here.
I was testing another patch and needed to dig for a while until
I realized the FAILs come from this one.  In general I would
assume that even obvious patches are tested before (I have
introduced bugs by obvious ones before so I make sure to).

Regards
 Robin

RE: [PATCH V7] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern

2023-07-03 Thread Li, Pan2 via Gcc-patches

Committed as both the bootstrap and regression tests passed, thanks Richard.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of Richard Sandiford via Gcc-patches
Sent: Monday, July 3, 2023 9:50 PM
To: juzhe.zh...@rivai.ai
Cc: gcc-patches@gcc.gnu.org; rguent...@suse.de
Subject: Re: [PATCH V7] Machine Description: Add LEN_MASK_{GATHER_LOAD, 
SCATTER_STORE} pattern

juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong 
>
> Hi, Richi and Richard.
>
> Based on the review comments from Richard:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623405.html
>
> I changed the len_mask_gather_load/len_mask_scatter_store operand order to:
> {len,bias,mask}
>
> We add the len and mask arguments using add_len_and_mask_args,
> which is the same as for partial_load/partial_store.
>
> Now the code is more reasonable and easier to maintain.
>
> This patch adds LEN_MASK_{GATHER_LOAD,SCATTER_STORE} to allow targets to
> handle flow control by mask and loop control by length on gather/scatter
> memory operations. Consider the following case:
>
> #include <stdint.h>
> void
> f (uint8_t *restrict a,
>uint8_t *restrict b, int n,
>int base, int step,
>int *restrict cond)
> {
>   for (int i = 0; i < n; ++i)
> {
>   if (cond[i])
> a[i * step + base] = b[i * step + base];
> }
> }
>
> We hope RVV can vectorize such case into following IR:
>
> loop_len = SELECT_VL
> control_mask = comparison
> v = LEN_MASK_GATHER_LOAD (.., loop_len, bias, control_mask)
> LEN_SCATTER_STORE (... v, ..., loop_len, bias, control_mask)
>
> This patch doesn't apply such patterns in the vectorizer; it just adds the
> patterns and updates the documentation.
>
> I will send a patch which applies such patterns in the vectorizer soon after
> this patch is approved.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
> * doc/md.texi: Add len_mask_gather_load/len_mask_scatter_store.
> * internal-fn.cc (expand_scatter_store_optab_fn): Ditto.
> (expand_gather_load_optab_fn): Ditto.
> (internal_load_fn_p): Ditto.
> (internal_store_fn_p): Ditto.
> (internal_gather_scatter_fn_p): Ditto.
> (internal_fn_len_index): Ditto.
> (internal_fn_mask_index): Ditto.
> (internal_fn_stored_value_index): Ditto.
> * internal-fn.def (LEN_MASK_GATHER_LOAD): Ditto.
> (LEN_MASK_SCATTER_STORE): Ditto.
> * optabs.def (OPTAB_CD): Ditto.

Nice!  OK, thanks.

Richard

> ---
>  gcc/doc/md.texi | 17 +
>  gcc/internal-fn.cc  | 32 +---
>  gcc/internal-fn.def |  8 ++--
>  gcc/optabs.def  |  2 ++
>  4 files changed, 42 insertions(+), 17 deletions(-)
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 5e5482265cd..f14dd32b2dc 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5040,6 +5040,15 @@ operand 5.  Bit @var{i} of the mask is set if element 
> @var{i}
>  of the result should be loaded from memory and clear if element @var{i}
>  of the result should be set to zero.
>  
> +@cindex @code{len_mask_gather_load@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_gather_load@var{m}@var{n}}
> +Like @samp{gather_load@var{m}@var{n}}, but takes an extra length operand 
> (operand 5),
> +a bias operand (operand 6) as well as a mask operand (operand 7).  Similar 
> to len_maskload,
> +the instruction loads at most (operand 5 + operand 6) elements from memory.
> +Bit @var{i} of the mask is set if element @var{i} of the result should
> +be loaded from memory and clear if element @var{i} of the result should be 
> undefined.
> +Mask elements @var{i} with @var{i} > (operand 5 + operand 6) are ignored.
> +
>  @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
>  @item @samp{scatter_store@var{m}@var{n}}
>  Store a vector of mode @var{m} into several distinct memory locations.
> @@ -5069,6 +5078,14 @@ Like @samp{scatter_store@var{m}@var{n}}, but takes an 
> extra mask operand as
>  operand 5.  Bit @var{i} of the mask is set if element @var{i}
>  of the result should be stored to memory.
>  
> +@cindex @code{len_mask_scatter_store@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_scatter_store@var{m}@var{n}}
> +Like @samp{scatter_store@var{m}@var{n}}, but takes an extra length operand 
> (operand 5),
> +a bias operand (operand 6) as well as a mask operand (operand 7).  The 
> instruction stores
> +at most (operand 5 + operand 6) elements of (operand 4) to memory.
> +Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be 
> stored.
> +Mask elements @var{i} with @var{i} > (operand 5 + operand 6) are ignored.
> +
>  @cindex @code{vec_set@var{m}} instruction pattern
>  @item @samp{vec_set@var{m}}
>  Set given field in the vector value.  Operand 0 is the vector to modify,
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c1fcb38b17b..303df102d81 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -3507,7 +3507,6 @@ expand_scatter_store_optab_fn (internal_fn, gcall 
> *stmt, direct_optab

[PATCH V2] i386: Inline function with default arch/tune to caller

2023-07-03 Thread Hongyu Wang via Gcc-patches

Hi,

For functions with different target attributes, the current logic refuses to
inline the callee when any arch or tune is mismatched. Relax the
condition to allow a callee with the default arch/tune to be inlined.

Bootstrapped/regtested on x86-64-linux-gnu{-m32,}.

Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_can_inline_p): If callee has
default arch=x86-64 and tune=generic, do not block the
inlining to its caller.

gcc/testsuite/ChangeLog:

* gcc.target/i386/inline_target_clones.c: New test.
---
 gcc/config/i386/i386.cc   | 22 +++--
 .../gcc.target/i386/inline_target_clones.c| 24 +++
 2 files changed, 39 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/inline_target_clones.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8989985700a..4741c9b5364 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -605,13 +605,6 @@ ix86_can_inline_p (tree caller, tree callee)
   != (callee_opts->x_target_flags & ~always_inline_safe_mask))
 ret = false;
 
-  /* See if arch, tune, etc. are the same.  */
-  else if (caller_opts->arch != callee_opts->arch)
-ret = false;
-
-  else if (!always_inline && caller_opts->tune != callee_opts->tune)
-ret = false;
-
   else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath
   /* If the calle doesn't use FP expressions differences in
  ix86_fpmath can be ignored.  We are called from FEs
@@ -622,6 +615,21 @@ ix86_can_inline_p (tree caller, tree callee)
   || ipa_fn_summaries->get (callee_node)->fp_expressions))
 ret = false;
 
+  /* At this point we cannot identify whether arch or tune setting
+ comes from target attribute or not. So the most conservative way
+ is to allow the callee that uses default arch and tune string to
+ be inlined.  */
+  else if (!strcmp (callee_opts->x_ix86_arch_string, "x86-64")
+  && !strcmp (callee_opts->x_ix86_tune_string, "generic"))
+ret = true;
+
+  /* See if arch, tune, etc. are the same.  */
+  else if (caller_opts->arch != callee_opts->arch)
+ret = false;
+
+  else if (!always_inline && caller_opts->tune != callee_opts->tune)
+ret = false;
+
   else if (!always_inline
   && caller_opts->branch_cost != callee_opts->branch_cost)
 ret = false;
diff --git a/gcc/testsuite/gcc.target/i386/inline_target_clones.c 
b/gcc/testsuite/gcc.target/i386/inline_target_clones.c
new file mode 100644
index 000..53db1600ce5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/inline_target_clones.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-ifunc "" } */
+/* { dg-options "-O3 -march=x86-64" } */
+/* { dg-final { scan-assembler-not "call\[ \t\]+callee" } } */
+
+float callee (float a, float b, float c, float d,
+ float e, float f, float g, float h)
+{
+  return a * b + c * d + e * f + g + h + a * c + b * c
++ a * d + b * e + a * f + c * h + 
+b * (a - 0.4f) * (c + h) * (b + e * d) - a / f * h;
+}
+
+__attribute__((target_clones("default","arch=icelake-server")))
+void caller (int n, float *a,
+float c1, float c2, float c3,
+float c4, float c5, float c6,
+float c7)
+{
+  for (int i = 0; i < n; i++)
+{
+  a[i] = callee (a[i], c1, c2, c3, c4, c5, c6, c7);
+}
+}
-- 
2.31.1

ping^^^^: [PATCH V2] rs6000: Enhance lowpart/highpart DI->SF by mtvsrws/mtvsrd

2023-07-03 Thread Jiufu Guo via Gcc-patches



Hi,

Gentle ping ...

Jiufu Guo via Gcc-patches  writes:

> Gentle ping...
>
> Jiufu Guo via Gcc-patches  writes:
>
>> Gentle ping...
>>
>> Jiufu Guo via Gcc-patches  writes:
>>
>>> Hi
>>>
>>> I would like to ping this patch for stage1:
>>> https://gcc.gnu.org/pipermail/gcc-patches/2023-February/612168.html
>>>
>>> BR,
>>> Jeff (Jiufu)
>>>
>>> Jiufu Guo  writes:
>>>
 Hi,

 Compare with previous version:
 https://gcc.gnu.org/pipermail/gcc-patches/2023-January/609654.html
 This patch does not use UNSPEC for insn mtvsrws anymore.  And to handle
 the subreg better on BE and LE, predicate "lowpart_subreg_operator"
 is introduced. To help the combine pass match the pattern on the high 32
 bits of DI, shiftrt is still used.

 As mentioned in PR108338, on p9, we could use mtvsrws to implement
 the conversion from SI#0 to SF (or lowpart DI to SF).

 For example:
   *(long long*)buff = di;
   float f = *(float*)(buff);
 We generate "sldi 9,3,32 ; mtvsrd 1,9 ; xscvspdpn 1,1" instead of
 "mtvsrws 1,3 ; xscvspdpn 1,1".

 This patch updates this, and also enhances the bitcast from highpart
 DI to SF.

 Bootstrap and regtests pass on ppc64{,le}.
 Is this ok for trunk?

 BR,
 Jeff (Jiufu)

PR target/108338

 gcc/ChangeLog:

* config/rs6000/predicates.md (lowpart_subreg_operator): New
define_predicate.
* config/rs6000/rs6000.md (any_rshift): New code_iterator.
(movsf_from_si2): Rename to...
(movsf_from_si2_): ... this.
(si2sf_mtvsrws): New define_insn.

 gcc/testsuite/ChangeLog:

* gcc.target/powerpc/pr108338.c: New test.

 ---
  gcc/config/rs6000/predicates.md |  5 +++
  gcc/config/rs6000/rs6000.md | 35 -
  gcc/testsuite/gcc.target/powerpc/pr108338.c | 42 +
  3 files changed, 73 insertions(+), 9 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108338.c

 diff --git a/gcc/config/rs6000/predicates.md 
 b/gcc/config/rs6000/predicates.md
 index 52c65534e51..e57c9d99c6b 100644
 --- a/gcc/config/rs6000/predicates.md
 +++ b/gcc/config/rs6000/predicates.md
 @@ -2064,3 +2064,8 @@ (define_predicate "macho_pic_address"
else
  return false;
  })
 +
 +(define_predicate "lowpart_subreg_operator"
 +  (and (match_code "subreg")
 +   (match_test "subreg_lowpart_offset (mode, GET_MODE (SUBREG_REG 
 (op)))
 +  == SUBREG_BYTE (op)")))
 diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
 index 4a7812fa592..5b4a7f8d801 100644
 --- a/gcc/config/rs6000/rs6000.md
 +++ b/gcc/config/rs6000/rs6000.md
 @@ -7539,6 +7539,14 @@ (define_split
 UNSPEC_MOVSI_GOT))]
"")
  
 +(define_insn "si2sf_mtvsrws"
 +  [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
 +   (subreg:SF (match_operand:SI 1 "gpc_reg_operand" "r") 0))]
 +  "TARGET_P9_VECTOR && TARGET_XSCVSPDPN"
 +  "mtvsrws %x0,%1\n\txscvspdpn %x0,%x0"
 +  [(set_attr "type" "mfvsr")
 +   (set_attr "length" "8")])
 +
  ;;   MR  LA
  ;;   LWZ LFIWZX  LXSIWZX
  ;;   STW STFIWX  STXSIWX
 @@ -8203,10 +8211,18 @@ (define_insn_and_split "movsf_from_si"
rtx op2 = operands[2];
rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
  
 -  /* Move SF value to upper 32-bits for xscvspdpn.  */
 -  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
 -  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
 -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
 +  if (TARGET_P9_VECTOR)
 +{
 +  emit_insn (gen_si2sf_mtvsrws (op0, gen_lowpart (SImode, op1_di)));
 +}
 +  else
 +{
 +  /* Move SF value to upper 32-bits for xscvspdpn.  */
 +  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
 +  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
 +  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
 +}
 +
DONE;
  }
[(set_attr "length"
 @@ -8219,18 +8235,19 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
  
 +(define_code_iterator any_rshift [ashiftrt lshiftrt])
 +
  ;; For extracting high part element from DImode register like:
  ;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
  ;; split it before reload with "and mask" to avoid generating shift right
  ;; 32 bit then shift left 32 bit.
 -(define_insn_and_split "movsf_from_si2"
 +(define_insn_and_split "movsf_from_si2_"
[(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
(unspec:SF
 -

[PATCH] Break false dependence for vpternlog by inserting vpxor.

2023-07-03 Thread liuhongt via Gcc-patches

vpternlog is also used for optimization which doesn't need any valid
input operand, in that case, the destination is used as input in the
instruction and that creates a false dependence.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/110438
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog): New
define_insn.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2_pternlog): New
define_insn.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md|  8 ++-
 gcc/config/i386/sse.md   | 69 +++-
 gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++
 3 files changed, 94 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index fb07707dcba..df0d9e20def 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 812cfca4b92..93cdd844026 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,28 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9358,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9345,12 +9367,35 @@ (define_insn "*_cvtmask2"
   "TARGET_AVX512F"
   "@
vpmovm2\t{%1, %0|%0, %1}
-   vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+   #"
+  "&& !TARGET_AVX512DQ && reload_completed"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (mode);"
   [(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_cvtmask2_pternlog"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+   (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand: 1 "register_operand" "Yk")))
+   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F && !TARGET_AVX512DQ"
+

RE: [VSETVL PASS] RISC-V: Optimize local AVL propagation

2023-07-03 Thread Li, Pan2 via Gcc-patches

Committed, thanks Kito.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of Kito Cheng via Gcc-patches
Sent: Tuesday, July 4, 2023 10:20 AM
To: Robin Dapp 
Cc: Juzhe-Zhong ; gcc-patches@gcc.gnu.org; 
kito.ch...@sifive.com; pal...@dabbelt.com; pal...@rivosinc.com; 
jeffreya...@gmail.com
Subject: Re: [VSETVL PASS] RISC-V: Optimize local AVL propagation

LGTM

On Mon, Jul 3, 2023 at 8:47 PM Robin Dapp  wrote:
>
> LGTM.
>
> Regards
>  Robin
>

Re: [PATCH ver 3] rs6000: Update the vsx-vector-6.* tests.

2023-07-03 Thread Kewen.Lin via Gcc-patches

Hi Carl,

on 2023/6/30 05:36, Carl Love wrote:
> GCC maintainers:
> 
> Ver 3.  Added __attribute__ ((noipa)) to the test files.  Changed some
> of the scan-assembler-times checks to cover multiple similar
> instructions.  Change the function check macro to a macro to generate a
> function to do the test and check the results.  Retested on the various
> processor types and BE/LE versions.
> 
> Ver 2.  Switched to using code macros to generate the call to the
> builtin and test the results.  Added in instruction counts for the key
> instruction for the builtin.  Moved the tests into an additional
> function call to ensure the compile doesn't replace the builtin call
> code with the statically computed results.  The compiler was doing this
> for a few of the simpler tests.  
> 
> The following patch takes the tests in vsx-vector-6-p7.h,  vsx-vector-
> 6-p8.h, vsx-vector-6-p9.h and reorganizes them into a series of smaller
> test files by functionality rather than processor version.
> 
> Tested the patch on Power 8 LE/BE, Power 9 LE/BE and Power 10 LE with
> no regressions.
> 
> Please let me know if this patch is acceptable for mainline.  Thanks.
> 
>Carl
> 
> 
> -
> rs6000: Update the vsx-vector-6.* tests.
> 
> The vsx-vector-6.h file is included into the processor specific test files
> vsx-vector-6.p7.c, vsx-vector-6.p8.c, and vsx-vector-6.p9.c.  The .h file
> contains a large number of vsx vector builtin tests.  The processor
> specific files contain the number of instructions that the tests are
> expected to generate for that processor.  The tests are compile only.
> 
> The tests are broken up into a seriers of files for related tests.  The

s/seriers/series/

> new tests are runnable tests to verify the builtin argument types and the
> functional correctness of each test rather than verifying the type and
> number of instructions generated.
> 
> gcc/testsuite/
>   * gcc.target/powerpc/vsx-vector-6-1op.c: New test file.
>   * gcc.target/powerpc/vsx-vector-6-2lop.c: New test file.
>   * gcc.target/powerpc/vsx-vector-6-2op.c: New test file.
>   * gcc.target/powerpc/vsx-vector-6-3op.c: New test file.
>   * gcc.target/powerpc/vsx-vector-6-cmp-all.c: New test file.
>   * gcc.target/powerpc/vsx-vector-6-cmp.c: New test file.

Missing "func-" in the names ...

>   * gcc.target/powerpc/vsx-vector-6.h: Remove test file.
>   * gcc.target/powerpc/vsx-vector-6-p7.h: Remove test file.
>   * gcc.target/powerpc/vsx-vector-6-p8.h: Remove test file.
>   * gcc.target/powerpc/vsx-vector-6-p9.h: Remove test file.

should be vsx-vector-6-p{7,8,9}.c, "git gcc-verify" should catch these.

> ---
>  .../powerpc/vsx-vector-6-func-1op.c   | 141 ++
>  .../powerpc/vsx-vector-6-func-2lop.c  | 217 +++
>  .../powerpc/vsx-vector-6-func-2op.c   | 133 +
>  .../powerpc/vsx-vector-6-func-3op.c   | 257 ++
>  .../powerpc/vsx-vector-6-func-cmp-all.c   | 211 ++
>  .../powerpc/vsx-vector-6-func-cmp.c   | 121 +
>  .../gcc.target/powerpc/vsx-vector-6.h | 154 ---
>  .../gcc.target/powerpc/vsx-vector-6.p7.c  |  43 ---
>  .../gcc.target/powerpc/vsx-vector-6.p8.c  |  43 ---
>  .../gcc.target/powerpc/vsx-vector-6.p9.c  |  42 ---
>  10 files changed, 1080 insertions(+), 282 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-1op.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-2lop.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-2op.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-3op.c
>  create mode 100644 
> gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-cmp-all.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-cmp.c
>  delete mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6.h
>  delete mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6.p7.c
>  delete mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6.p8.c
>  delete mode 100644 gcc/testsuite/gcc.target/powerpc/vsx-vector-6.p9.c
> 
> diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-1op.c 
> b/gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-1op.c
> new file mode 100644
> index 000..52c7ae3e983
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/vsx-vector-6-func-1op.c
> @@ -0,0 +1,141 @@
> +/* { dg-do run { target lp64 } } */
> +/* { dg-skip-if "" { powerpc*-*-darwin* } } */
> +/* { dg-options "-O2 -save-temps" } */

I just noticed that we missed an effective target check here to ensure the
support of those bifs during the test run, and since it's a runnable test
case, also need to ensure the generated hw insn supported, it's "vsx_hw"
like:

/* { dg-require-effective-target vsx_hw } */

And adding "-mvsx" to the dg-options.

This is also applied for the other test cases.

Re: [VSETVL PASS] RISC-V: Optimize local AVL propagation

2023-07-03 Thread Kito Cheng via Gcc-patches

LGTM

On Mon, Jul 3, 2023 at 8:47 PM Robin Dapp  wrote:
>
> LGTM.
>
> Regards
>  Robin
>

[PATCH V4 1/4] rs6000: build constant via li;rotldi

2023-07-03 Thread Jiufu Guo via Gcc-patches

Hi,

If a constant is possible to be rotated to/from a positive or negative
value from "li", then "li;rotldi" can be used to build the constant.

Compare with the previous version:
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/621961.html
This patch just did minor changes to the style and comments.

Bootstrap and regtest pass on ppc64{,le}.

Since the previous version is approved with conditions, this version
explained the concern too.  If no objection, I would like to apply
this patch to trunk.


BR,
Jeff (Jiufu)

gcc/ChangeLog:

* config/rs6000/rs6000.cc (can_be_built_by_li_and_rotldi): New function.
(rs6000_emit_set_long_const): Call can_be_built_by_li_and_rotldi.

gcc/testsuite/ChangeLog:

* gcc.target/powerpc/const-build.c: New test.
---
 gcc/config/rs6000/rs6000.cc   | 47 +--
 .../gcc.target/powerpc/const-build.c  | 57 +++
 2 files changed, 98 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/const-build.c

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 42f49e4a56b..acc332acc05 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -10258,6 +10258,31 @@ rs6000_emit_set_const (rtx dest, rtx source)
   return true;
 }
 
+/* Check if value C can be built by 2 instructions: one is 'li', another is
+   rotldi.
+
+   If so, *SHIFT is set to the shift operand of rotldi(rldicl), and *MASK
+   is set to the mask operand of rotldi(rldicl), and return true.
+   Return false otherwise.  */
+
+static bool
+can_be_built_by_li_and_rotldi (HOST_WIDE_INT c, int *shift,
+  HOST_WIDE_INT *mask)
+{
+  /* If C or ~C contains at least 49 successive zeros, then C can be rotated
+ to/from a positive or negative value that 'li' is able to load.  */
+  int n;
+  if (can_be_rotated_to_lowbits (c, 15, &n)
+  || can_be_rotated_to_lowbits (~c, 15, &n))
+{
+  *mask = HOST_WIDE_INT_M1;
+  *shift = HOST_BITS_PER_WIDE_INT - n;
+  return true;
+}
+
+  return false;
+}
+
 /* Subroutine of rs6000_emit_set_const, handling PowerPC64 DImode.
Output insns to set DEST equal to the constant C as a series of
lis, ori and shl instructions.  */
@@ -10266,15 +10291,14 @@ static void
 rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
 {
   rtx temp;
+  int shift;
+  HOST_WIDE_INT mask;
   HOST_WIDE_INT ud1, ud2, ud3, ud4;
 
   ud1 = c & 0xffff;
-  c = c >> 16;
-  ud2 = c & 0xffff;
-  c = c >> 16;
-  ud3 = c & 0xffff;
-  c = c >> 16;
-  ud4 = c & 0xffff;
+  ud2 = (c >> 16) & 0xffff;
+  ud3 = (c >> 32) & 0xffff;
+  ud4 = (c >> 48) & 0xffff;
 
   if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000))
   || (ud4 == 0 && ud3 == 0 && ud2 == 0 && ! (ud1 & 0x8000)))
@@ -10305,6 +10329,17 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c)
   emit_move_insn (dest, gen_rtx_XOR (DImode, temp,
 GEN_INT ((ud2 ^ 0xffff) << 16)));
 }
+  else if (can_be_built_by_li_and_rotldi (c, &shift, &mask))
+{
+  temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
+  unsigned HOST_WIDE_INT imm = (c | ~mask);
+  imm = (imm >> shift) | (imm << (HOST_BITS_PER_WIDE_INT - shift));
+
+  emit_move_insn (temp, GEN_INT (imm));
+  if (shift != 0)
+   temp = gen_rtx_ROTATE (DImode, temp, GEN_INT (shift));
+  emit_move_insn (dest, temp);
+}
   else if (ud3 == 0 && ud4 == 0)
 {
   temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode);
diff --git a/gcc/testsuite/gcc.target/powerpc/const-build.c 
b/gcc/testsuite/gcc.target/powerpc/const-build.c
new file mode 100644
index 000..69b37e2bb53
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/const-build.c
@@ -0,0 +1,57 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-require-effective-target has_arch_ppc64 } */
+
+/* Verify that two instructions are successfully used to build constants.
+   One insn is li or lis, another is rotate: rldicl, rldicr or rldic.  */
+
+#define NOIPA __attribute__ ((noipa))
+
+struct fun
+{
+  long long (*f) (void);
+  long long val;
+};
+
+long long NOIPA
+li_rotldi_1 (void)
+{
+  return 0x75310LL;
+}
+
+long long NOIPA
+li_rotldi_2 (void)
+{
+  return 0x2164LL;
+}
+
+long long NOIPA
+li_rotldi_3 (void)
+{
+  return 0x8531LL;
+}
+
+long long NOIPA
+li_rotldi_4 (void)
+{
+  return 0x2194LL;
+}
+
+struct fun arr[] = {
+  {li_rotldi_1, 0x75310LL},
+  {li_rotldi_2, 0x2164LL},
+  {li_rotldi_3, 0x8531LL},
+  {li_rotldi_4, 0x2194LL},
+};
+
+/* { dg-final { scan-assembler-times {\mrotldi\M} 4 } } */
+
+int
+main ()
+{
+  for (int i = 0; i < sizeof (arr) / sizeof (arr[0]); i++)
+if ((*arr[i].f) () != arr[i].val)
+  __builtin_abort ();
+
+  return 0;
+}
-- 
2.39.3

Re: [PATCH] rs6000: Update the vsx-vector-6.* tests.

2023-07-03 Thread Kewen.Lin via Gcc-patches

Hi Carl,

on 2023/7/3 23:57, Carl Love wrote:
> Kewen:
> 
> On Fri, 2023-06-30 at 15:20 -0700, Carl Love wrote:
>> Segher never liked the above way of looking at the assembly.  He
>> prefers:
>>   gcc -S -g -mcpu=power8 -o vsx-vector-6-func-2lop.s vsx-vector-6-
>> func-
>> 2lop.c
>>
>>   grep xxlor vsx-vector-6-func-2lop.s | wc
>>  34  68 516
>>
>> So, again, I get the same count of 34 on both makalu and genoa.  But
>> again, that doesn't agree with what make script/scan-assembler thinks
>> the counts should be.
>>
>> When I looked at the vsx-vector-6-func-2lop.s I see on BE:
>>
>>  
>> lxvd2x 0,10,9
>> xxlor 0,12,0
>> xxlnor 0,0,0
>>  ...
>>
>> I was guessing that it was adjusting the data layout from the load. 
>> But looking again more carefully versus LE:
>>
>> 
>> lxvd2x 0,31,9 
>>xxpermdi 0,0,0,2 
>>xxlor 0,12,0  
>>xxlnor 0,0,0  
>>xxpermdi 0,0,0,2 
>> 
>>
>> the xxpermdi is probably what is really doing the data layout change.
>>
>> So, we have the issue that looking at the assembly gives different
>> instruction counts than what 
>>
>>dg-final { scan-assembler-times {\mxxlor\M} }
>>
>> comes up with???  Now I am really confused.  I don't know how the
>> scan-
>> assembler-times works but I will go see if I can find it and see if I
>> can figure out what the issue is.  I would expect that the scan-
>> assembler is working off the --save-temp files, which get deleted as
>> part of the run.  I would guess that scan-assembler does a grep to
>> find
>> the instructions and then maybe uses wc to count them??? I will go
>> see
>> if I can figure out how scan-assembler-times works.
> 
> OK, I figured out why I was getting 34 xxlor instructions instead of
> the 22 that the scan-assembler-times was getting.  The difference was
> when I compiled the program I forgot to use -O2.  So with -O2 I get the
> same number of xxlor instructions as scan-assembler-instructions.  I get
> 34 if I do not specify optimization.

OK, thanks for looking into it.  When you run a test case with RUNTESTFLAGS,
you can add the "-v" (and even more times) to RUNTESTFLAGS, then you can find
the exact compiling commands in the dumping, I usually used this way for
reproducing and I hope it can avoid some inconsistency for reproduction.

> 
> So, I think the scan-assembler-times are all correct.
> 
> As Peter says, counting xxlor is a bit problematic in general.  We
> could just drop counting xxlor or have the LE/BE count qualifier for
> the instructions.  Your call.

Yeah, I agree that counting xxlor in the checking code (from function main)
is fragile, but as you said we still want to check expected xxlor generated
for bif vec_or, so I'd prefer to separate the existing case into the
compiling part and run part, I'll reply with more details to your latest v3.

Thanks,
Kewen

[committed] CRIS: Replace unspec CRIS_UNSPEC_SWAP_BITS with rtx bitreverse

2023-07-03 Thread Hans-Peter Nilsson via Gcc-patches

This is just expected to be a change in representation.
No code is expected to change; no new tests are added.

* config/cris/cris.md (CRIS_UNSPEC_SWAP_BITS): Remove.
("cris_swap_bits", "ctzsi2"): Use bitreverse instead.
---
 gcc/config/cris/cris.md | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/gcc/config/cris/cris.md b/gcc/config/cris/cris.md
index 7504b63dabf3..deb2f0c6b7c7 100644
--- a/gcc/config/cris/cris.md
+++ b/gcc/config/cris/cris.md
@@ -50,9 +50,6 @@ (define_c_enum ""
   [
;; Stack frame deallocation barrier.
CRIS_UNSPEC_FRAME_DEALLOC
-
-   ;; Swap all 32 bits of the operand; 31 <=> 0, 30 <=> 1...
-   CRIS_UNSPEC_SWAP_BITS
   ])
 
 ;; Register numbers.
@@ -2177,8 +2174,7 @@ (define_insn 
"bswapsi2"
 
 (define_insn "cris_swap_bits"
   [(set (match_operand:SI 0 "register_operand" "=r")
-   (unspec:SI [(match_operand:SI 1 "register_operand" "0")]
-  CRIS_UNSPEC_SWAP_BITS))
+   (bitreverse:SI (match_operand:SI 1 "register_operand" "0")))
(clobber (reg:CC CRIS_CC0_REGNUM))]
   "TARGET_HAS_SWAP"
   "swapwbr %0"
@@ -2193,8 +2189,7 @@ (define_expand "ctzsi2"
  (match_operand:SI 1 "register_operand"))
  (clobber (reg:CC CRIS_CC0_REGNUM))])
(parallel
-[(set (match_dup 2)
- (unspec:SI [(match_dup 2)] CRIS_UNSPEC_SWAP_BITS))
+[(set (match_dup 2) (bitreverse:SI (match_dup 2)))
  (clobber (reg:CC CRIS_CC0_REGNUM))])
(parallel
 [(set (match_operand:SI 0 "register_operand")
-- 
2.30.2

[committed] dwarf2out.cc (mem_loc_descriptor): Handle BITREVERSE

2023-07-03 Thread Hans-Peter Nilsson via Gcc-patches

Committed as obvious after regtest for cris-elf together
with the "next" patch, that replaces unspec
CRIS_UNSPEC_SWAP_BITS with bitreverse (which hit the ICE).

-- >8 --
This seems to have just been overlooked when introducing
BITREVERSE.  Note that the function name mem_loc_descriptor
is a misnomer; it'd better be called rtx_loc_descriptor or
any_loc_descriptor, because "anything" RTX can end up here.
To wit, when introducing new RTL that ends up as code or for
other reasons appear in debug expressions, don't forget to
update this function.  This was observed by building
libstdc++ for cris-elf with a patch replacing the
CRIS_UNSPEC_SWAP_BITS by bitreverse, as hitting the
internal-error-generating default case.

Looking at the BSWAP, POPCOUNT and ROTATE cases, BITREVERSE
can probably be fully expressed as DWARF code if need be,
but let's start with not throwing an internal error.

gcc:
* dwarf2out.cc (mem_loc_descriptor): Handle BITREVERSE.
---
 gcc/dwarf2out.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/dwarf2out.cc b/gcc/dwarf2out.cc
index 9112fc0c64b5..e973644102c0 100644
--- a/gcc/dwarf2out.cc
+++ b/gcc/dwarf2out.cc
@@ -16940,6 +16940,7 @@ mem_loc_descriptor (rtx rtl, machine_mode mode,
 case CLOBBER:
 case SMUL_HIGHPART:
 case UMUL_HIGHPART:
+case BITREVERSE:
   break;
 
 case CONST_STRING:
-- 
2.30.2

[PATCH] xtensa: Use HARD_REG_SET instead of bare integer

2023-07-03 Thread Takayuki 'January June' Suwa via Gcc-patches

gcc/ChangeLog:

* config/xtensa/xtensa.cc (machine_function, xtensa_expand_prologue):
Change to use HARD_REG_BIT and its macros.
* config/xtensa/xtensa.md
(peephole2: regmove elimination during DFmode input reload):
Likewise.
---
 gcc/config/xtensa/xtensa.cc |  9 +
 gcc/config/xtensa/xtensa.md | 13 ++---
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 3298d53493c..992e80d824d 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -107,7 +107,7 @@ struct GTY(()) machine_function
   bool epilogue_done;
   bool inhibit_logues_a1_adjusts;
   rtx last_logues_a9_content;
-  HOST_WIDE_INT eliminated_callee_saved_bmp;
+  HARD_REG_SET eliminated_callee_saved;
 };
 
 static void xtensa_option_override (void);
@@ -3586,7 +3586,8 @@ xtensa_expand_prologue (void)
df_insn_rescan (insnS);
SET_SRC (PATTERN (insnR)) = copy_rtx (mem);
df_insn_rescan (insnR);
-   cfun->machine->eliminated_callee_saved_bmp |= 1 << regno;
+   SET_HARD_REG_BIT (cfun->machine->eliminated_callee_saved,
+ regno);
  }
else
  {
@@ -3690,8 +3691,8 @@ xtensa_expand_epilogue (bool sibcall_p)
   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno)
if (xtensa_call_save_reg(regno))
  {
-   if (! (cfun->machine->eliminated_callee_saved_bmp
-  & (1 << regno)))
+   if (! TEST_HARD_REG_BIT (cfun->machine->eliminated_callee_saved,
+regno))
  {
rtx x = gen_rtx_PLUS (Pmode,
  stack_pointer_rtx, GEN_INT (offset));
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 664424f1239..5386e45b51d 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -3240,15 +3240,14 @@
(set (match_dup 3)
(match_dup 7))]
 {
-  uint32_t check = 0;
+  HARD_REG_SET regs;
   int i;
+  CLEAR_HARD_REG_SET (regs);
   for (i = 0; i <= 3; ++i)
-{
-  uint32_t mask = (uint32_t)1 << REGNO (operands[i]);
-  if (check & mask)
-   FAIL;
-  check |= mask;
-}
+if (TEST_HARD_REG_BIT (regs, REGNO (operands[i])))
+  FAIL;
+else
+  SET_HARD_REG_BIT (regs, REGNO (operands[i]));
   operands[6] = gen_rtx_MEM (SFmode, XEXP (operands[6], 0));
   operands[7] = gen_rtx_MEM (SFmode, XEXP (operands[7], 0));
 })
-- 
2.30.2

Re: [PATCH] Fortran: fixes for procedures with ALLOCATABLE,INTENT(OUT) arguments [PR92178]

2023-07-03 Thread Steve Kargl via Gcc-patches

On Mon, Jul 03, 2023 at 10:49:36PM +0200, Harald Anlauf via Fortran wrote:
> 
> Indeed, this is a nice demonstration.
> 
> While playing, I was wondering whether the following code is conforming:
> 
> program p
>   call s ((1))
> contains
>   subroutine s (x)
> integer :: x
> x = 42
>   end subroutine
> end
> 
> (It crashes with gfortran, but not with any foreign brand tested).
> 

It's not conforming.  '(1)' is an expression and it cannot appear
in a variable definition condition.  I am not aware of any numbered
constraint that would require a Fortran processor to generate an
error.

-- 
Steve

Re: [PATCH] libstdc++: Split up pstl/set.cc testcase

2023-07-03 Thread Jonathan Wakely via Gcc-patches

On Mon, 3 Jul 2023 at 23:14, Thomas Rodgers via Libstdc++
 wrote:
>
> This testcase is causing some timeout issues. This patch splits the
> testcase up by individual set algorithm.

I think the Apache license requires a notice saying the original file
was modified. A comment in each new file noting it was derived from
pstl/alg_sorting/set.cc (or whatever the file is called upstream)
should be sufficient.

OK with that change, thanks.

[committed] libstdc++: Fix synopsis test

2023-07-03 Thread Jonathan Wakely via Gcc-patches

Tested x86_64-linux. Pushed to trunk.

-- >8 --

The <syncstream> header is only supported for the cxx11 ABI. The
declarations of basic_syncbuf, basic_osyncstream, syncbuf and
osyncstream were already correctly guarded by a check for
_GLIBCXX_USE_CXX11_ABI, but the wsyncbuf and wosyncstream declarations
were not.

libstdc++-v3/ChangeLog:

* testsuite/27_io/headers/iosfwd/synopsis.cc: Make wsyncbuf and
wosyncstream depend on _GLIBCXX_USE_CXX11_ABI.
---
 libstdc++-v3/testsuite/27_io/headers/iosfwd/synopsis.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/testsuite/27_io/headers/iosfwd/synopsis.cc 
b/libstdc++-v3/testsuite/27_io/headers/iosfwd/synopsis.cc
index b6d3fa7a719..12f47ae8133 100644
--- a/libstdc++-v3/testsuite/27_io/headers/iosfwd/synopsis.cc
+++ b/libstdc++-v3/testsuite/27_io/headers/iosfwd/synopsis.cc
@@ -115,7 +115,7 @@ _GLIBCXX_END_NAMESPACE_CXX11
   typedef basic_ofstream wofstream;
   typedef basic_fstream  wfstream;
 
-#if __cplusplus >= 202002L
+#if __cplusplus >= 202002L && _GLIBCXX_USE_CXX11_ABI
   typedef basic_syncbuf wsyncbuf;
   typedef basic_osyncstream wosyncstream;
 #endif
-- 
2.41.0

Re: [PATCH] libstdc++: Enable OpenMP 5.0 pragmas in PSTL headers

2023-07-03 Thread Jonathan Wakely via Gcc-patches

Pushed to trunk now.

On Fri, 30 Jun 2023 at 21:17, Jonathan Wakely via Libstdc++
 wrote:
>
> Jakub made a similar change a few years ago, but I think it got lost
> in the recent PSTL rebase.
>
> Tested x86_64-linux.
>
> Does this look OK for trunk?
>
> -- >8 --
>
> This reapplies r10-1314-g32bab8b6ad0a90 which was lost in the recent
> PSTL rebase from upstream.
>
> * include/pstl/pstl_config.h (_PSTL_PRAGMA_SIMD_SCAN,
> _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN, _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN):
> Define to OpenMP 5.0 pragmas even for GCC 10.0+.
> (_PSTL_UDS_PRESENT): Define to 1 for GCC 10.0+.
> ---
>  libstdc++-v3/include/pstl/pstl_config.h | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/libstdc++-v3/include/pstl/pstl_config.h 
> b/libstdc++-v3/include/pstl/pstl_config.h
> index 74d2139c736..ccb9dd32838 100644
> --- a/libstdc++-v3/include/pstl/pstl_config.h
> +++ b/libstdc++-v3/include/pstl/pstl_config.h
> @@ -82,7 +82,8 @@
>  #define _PSTL_PRAGMA_FORCEINLINE
>  #endif
>
> -#if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1900
> +#if (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1900) || \
> +(!defined(__INTEL_COMPILER) && _PSTL_GCC_VERSION >= 10)
>  #define _PSTL_PRAGMA_SIMD_SCAN(PRM) _PSTL_PRAGMA(omp simd 
> reduction(inscan, PRM))
>  #define _PSTL_PRAGMA_SIMD_INCLUSIVE_SCAN(PRM) _PSTL_PRAGMA(omp scan 
> inclusive(PRM))
>  #define _PSTL_PRAGMA_SIMD_EXCLUSIVE_SCAN(PRM) _PSTL_PRAGMA(omp scan 
> exclusive(PRM))
> @@ -126,7 +127,8 @@
>  #define _PSTL_UDR_PRESENT
>  #endif
>
> -#if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1900 && 
> __INTEL_COMPILER_BUILD_DATE >= 20180626
> +#if (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1900 && 
> __INTEL_COMPILER_BUILD_DATE >= 20180626) || \
> +(!defined(__INTEL_COMPILER) && _PSTL_GCC_VERSION >= 10)
>  #   define _PSTL_UDS_PRESENT
>  #endif
>
> --
> 2.41.0
>

[committed] libstdc++: Qualify calls to std::_Destroy and _Destroy_aux

2023-07-03 Thread Jonathan Wakely via Gcc-patches

Tested x86_64-linux. Pushed to trunk.

This isn't a regression, but is safe to backport.

-- >8 --

These calls should be qualified to prevent ADL, which can cause errors
for incomplete types that are associated classes.

libstdc++-v3/ChangeLog:

* include/bits/alloc_traits.h (_Destroy): Qualify call.
* include/bits/stl_construct.h (_Destroy, _Destroy_n): Likewise.
* testsuite/23_containers/vector/cons/destroy-adl.cc: New test.
---
 libstdc++-v3/include/bits/alloc_traits.h  |  2 +-
 libstdc++-v3/include/bits/stl_construct.h |  4 ++--
 .../23_containers/vector/cons/destroy-adl.cc  | 11 +++
 3 files changed, 14 insertions(+), 3 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/23_containers/vector/cons/destroy-adl.cc

diff --git a/libstdc++-v3/include/bits/alloc_traits.h 
b/libstdc++-v3/include/bits/alloc_traits.h
index cd91d152f64..182c3e23eed 100644
--- a/libstdc++-v3/include/bits/alloc_traits.h
+++ b/libstdc++-v3/include/bits/alloc_traits.h
@@ -944,7 +944,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 _Destroy(_ForwardIterator __first, _ForwardIterator __last,
 allocator<_Tp>&)
 {
-  _Destroy(__first, __last);
+  std::_Destroy(__first, __last);
 }
 #endif
   /// @endcond
diff --git a/libstdc++-v3/include/bits/stl_construct.h 
b/libstdc++-v3/include/bits/stl_construct.h
index 574f4fa50b4..cf62d927cdb 100644
--- a/libstdc++-v3/include/bits/stl_construct.h
+++ b/libstdc++-v3/include/bits/stl_construct.h
@@ -190,7 +190,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 #if __cplusplus >= 202002L
   if (std::__is_constant_evaluated())
-   return _Destroy_aux<false>::__destroy(__first, __last);
+   return std::_Destroy_aux<false>::__destroy(__first, __last);
 #endif
   std::_Destroy_aux<__has_trivial_destructor(_Value_type)>::
__destroy(__first, __last);
@@ -239,7 +239,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 #if __cplusplus >= 202002L
   if (std::__is_constant_evaluated())
-   return _Destroy_n_aux<false>::__destroy_n(__first, __count);
+   return std::_Destroy_n_aux<false>::__destroy_n(__first, __count);
 #endif
   return std::_Destroy_n_aux<__has_trivial_destructor(_Value_type)>::
__destroy_n(__first, __count);
diff --git a/libstdc++-v3/testsuite/23_containers/vector/cons/destroy-adl.cc 
b/libstdc++-v3/testsuite/23_containers/vector/cons/destroy-adl.cc
new file mode 100644
index 000..5623842e9b1
--- /dev/null
+++ b/libstdc++-v3/testsuite/23_containers/vector/cons/destroy-adl.cc
@@ -0,0 +1,11 @@
+// { dg-do compile }
+
+#include <vector>
+
+template<class T> struct Holder { T t; }; // { dg-bogus "incomplete type" }
+struct Incomplete;
+
+void destroy(std::vector<Holder<Incomplete>*>* p)
+{
+  p->~vector();
+}
-- 
2.41.0

[PATCH] libstdc++: Split up pstl/set.cc testcase

2023-07-03 Thread Thomas Rodgers via Gcc-patches

This testcase is causing some timeout issues. This patch splits the
testcase up by individual set algorithm.
From 857359b72f8886b6e90db3b596d04f08559d2b51 Mon Sep 17 00:00:00 2001
From: Thomas Rodgers 
Date: Mon, 3 Jul 2023 15:04:45 -0700
Subject: [PATCH] libstdc++: Split up pstl/set.cc testcase

This testcase is causing some timeout issues. This patch splits the
testcase up by individual set algorithm.

libstdc++-v3/ChangeLog:
	* testsuite/25_algorithms/pstl/alg_sorting/set.cc: Delete
	file.
	* testsuite/25_algorithms/pstl/alg_sorting/set_difference.cc:
	New file.
	* testsuite/25_algorithms/pstl/alg_sorting/set_intersection.cc:
	Likewise.
	* testsuite/25_algorithms/pstl/alg_sorting/set_symmetric_difference.cc:
	Likewise.
	* testsuite/25_algorithms/pstl/alg_sorting/set_union.cc:
	Likewise.
	* testsuite/25_algorithms/pstl/alg_sorting/set_util.h:
	Likewise.
---
 .../25_algorithms/pstl/alg_sorting/set.cc | 289 --
 .../pstl/alg_sorting/set_difference.cc|  90 ++
 .../pstl/alg_sorting/set_intersection.cc  |  91 ++
 .../alg_sorting/set_symmetric_difference.cc   |  92 ++
 .../pstl/alg_sorting/set_union.cc |  90 ++
 .../25_algorithms/pstl/alg_sorting/set_util.h |  72 +
 6 files changed, 435 insertions(+), 289 deletions(-)
 delete mode 100644 libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set.cc
 create mode 100644 libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set_difference.cc
 create mode 100644 libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set_intersection.cc
 create mode 100644 libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set_symmetric_difference.cc
 create mode 100644 libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set_union.cc
 create mode 100644 libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set_util.h

diff --git a/libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set.cc b/libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set.cc
deleted file mode 100644
index 0343739dfd1..000
--- a/libstdc++-v3/testsuite/25_algorithms/pstl/alg_sorting/set.cc
+++ /dev/null
@@ -1,289 +0,0 @@
-// -*- C++ -*-
-// { dg-options "-ltbb" }
-// { dg-do run { target c++17 } }
-// { dg-timeout-factor 3 }
-// { dg-require-effective-target tbb_backend }
-
-//===-- set.pass.cpp --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===--===//
-
-#include "pstl/pstl_test_config.h"
-
-#ifdef PSTL_STANDALONE_TESTS
-
-#include 
-#include 
-
-#include "pstl/execution"
-#include "pstl/algorithm"
-#else
-#include 
-#include 
-#endif // PSTL_STANDALONE_TESTS
-
-#include "pstl/test_utils.h"
-
-using namespace TestUtils;
-
-template 
-struct Num
-{
-T val;
-
-Num() : val{} {}
-Num(const T& v) : val(v) {}
-
-//for "includes" checks
-template 
-bool
-operator<(const Num& v1) const
-{
-return val < v1.val;
-}
-
-//The types Type1 and Type2 must be such that an object of type InputIt can be dereferenced and then implicitly converted to both of them
-template 
-operator Num() const
-{
-return Num((T1)val);
-}
-
-friend bool
-operator==(const Num& v1, const Num& v2)
-{
-return v1.val == v2.val;
-}
-};
-
-template 
-struct test_set_union
-{
-template 
-typename std::enable_if::value, void>::type
-operator()(Policy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2,
-   Compare comp)
-{
-using T1 = typename std::iterator_traits::value_type;
-
-auto n1 = std::distance(first1, last1);
-auto n2 = std::distance(first2, last2);
-auto n = n1 + n2;
-Sequence expect(n);
-Sequence out(n);
-
-auto expect_res = std::set_union(first1, last1, first2, last2, expect.begin(), comp);
-auto res = std::set_union(exec, first1, last1, first2, last2, out.begin(), comp);
-
-EXPECT_TRUE(expect_res - expect.begin() == res - out.begin(), "wrong result for set_union");
-EXPECT_EQ_N(expect.begin(), out.begin(), std::distance(out.begin(), res), "wrong set_union effect");
-}
-
-template 
-typename std::enable_if::value, void>::type
-operator()(Policy&&, InputIterator1, InputIterator1, InputIterator2, InputIterator2, Compare)
-{
-}
-};
-
-template 
-struct test_set_intersection
-{
-template 
-typename std::enable_if::value, void>::type
-operator()(Policy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2,
-   Compare comp)
-{
-using T1 = typename std::iterator_traits::value_type;
-
-auto n1 = std::distance(first1, last1);

Re: [PATCH] tree-optimization/110310 - move vector epilogue disabling to analysis phase

2023-07-03 Thread Richard Sandiford via Gcc-patches

Richard Biener  writes:
> The following removes late deciding to elide vectorized epilogues to
> the analysis phase and also avoids altering the epilogues niter.
> The costing part from vect_determine_partial_vectors_and_peeling is
> moved to vect_analyze_loop_costing where we use the main loop
> analysis to constrain the epilogue scalar iterations.
>
> I have not tried to integrate this with vect_known_niters_smaller_than_vf.
>
> It seems the for_epilogue_p parameter in
> vect_determine_partial_vectors_and_peeling is largely useless and
> we could compute that in the function itself.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
>
> I suppose testing on aarch64 would be nice-to-have - any takers?

Sorry, ran this earlier today and then forgot about it.  And yeah,
it passes bootstrap & regtest on aarch64-linux-gnu (all languages).

LGTM FWIW, except:

> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 0a03f56aae7..f39a1ecb306 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2144,14 +2144,76 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
>  
>/* Only loops that can handle partially-populated vectors can have 
> iteration
>   counts less than the vectorization factor.  */
> -  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
> +  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
> +  && vect_known_niters_smaller_than_vf (loop_vinfo))
>  {
> -  if (vect_known_niters_smaller_than_vf (loop_vinfo))
> +  if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +  "not vectorized: iteration count smaller than "
> +  "vectorization factor.\n");
> +  return 0;
> +}
> +
> +  /* If we know the number of iterations we can do better, for the
> + epilogue we can also decide whether the main loop leaves us
> + with enough iterations, preferring a smaller vector epilog then
> + also possibly used for the case we skip the vector loop.  */
> +  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
> +  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
> +{
> +  widest_int scalar_niters
> + = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> + {
> +   loop_vec_info orig_loop_vinfo
> + = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
> +   unsigned lowest_vf
> + = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
> +   int prolog_peeling = 0;
> +   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
> + prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
> +   if (prolog_peeling >= 0
> +   && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
> +lowest_vf))
> + {
> +   unsigned gap
> + = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
> +   scalar_niters = ((scalar_niters - gap - prolog_peeling)
> +% lowest_vf + gap);

Are you sure we want this + gap?  A vectorised epilogue can't handle the
gap either, at least for things that use (say) the first vector of LD2
and ignore the second vector.

Thanks,
Richard

> +   if (scalar_niters == 0)
> + {
> +   if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +  "not vectorized: loop never entered\n");
> +   return 0;
> + }
> + }
> + }
> +
> +  /* Check that the loop processes at least one full vector.  */
> +  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> +  if (known_lt (scalar_niters, vf))
>   {
> if (dump_enabled_p ())
>   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -  "not vectorized: iteration count smaller than "
> -  "vectorization factor.\n");
> +  "loop does not have enough iterations "
> +  "to support vectorization.\n");
> +   return 0;
> + }
> +
> +  /* If we need to peel an extra epilogue iteration to handle data
> +  accesses with gaps, check that there are enough scalar iterations
> +  available.
> +
> +  The check above is redundant with this one when peeling for gaps,
> +  but the distinction is useful for diagnostics.  */
> +  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
> +   && known_le (scalar_niters, vf))
> + {
> +   if (dump_enabled_p ())
> + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +  "loop does not have enough iterations "
> +  "to support peeling for gaps.\n");
> return 0;
>   }
>  }
> @@ -2502,31 +2564,6 @@ vect_determine_partial_vectors_and_peeling 
> (loop_vec_info loop_vinfo,
>

Re: [PATCH v2] RISC-V: Add support for vector crypto extensions

2023-07-03 Thread Philipp Tomsich

Thanks, applied to master.
--Philipp.

On Mon, 3 Jul 2023 at 15:42, Kito Cheng  wrote:

> Thanks, LGTM :)
>
> Christoph Muellner 於 2023年7月3日 週一，19:08寫道：
>
>> From: Christoph Müllner 
>>
>> This series adds basic support for the vector crypto extensions:
>> * Zvbb
>> * Zvbc
>> * Zvkg
>> * Zvkned
>> * Zvkhn[a,b]
>> * Zvksed
>> * Zvksh
>> * Zvkn
>> * Zvknc
>> * Zvkng
>> * Zvks
>> * Zvksc
>> * Zvksg
>> * Zvkt
>>
>> This patch is based on the v20230620 version of the Vector Cryptography
>> specification. The specification is frozen and can be found here:
>>   https://github.com/riscv/riscv-crypto/releases/tag/v20230620
>>
>> Binutils support was merged upstream a few days ago.
>>
>> All extensions come with tests for the feature test macros.
>>
>> gcc/ChangeLog:
>>
>> * common/config/riscv/riscv-common.cc: Add support for zvbb,
>> zvbc, zvkg, zvkned, zvknha, zvknhb, zvksed, zvksh, zvkn,
>> zvknc, zvkng, zvks, zvksc, zvksg, zvkt and the implied subsets.
>> * config/riscv/arch-canonicalize: Add canonicalization info for
>> zvkn, zvknc, zvkng, zvks, zvksc, zvksg.
>> * config/riscv/riscv-opts.h (MASK_ZVBB): New macro.
>> (MASK_ZVBC): Likewise.
>> (TARGET_ZVBB): Likewise.
>> (TARGET_ZVBC): Likewise.
>> (MASK_ZVKG): Likewise.
>> (MASK_ZVKNED): Likewise.
>> (MASK_ZVKNHA): Likewise.
>> (MASK_ZVKNHB): Likewise.
>> (MASK_ZVKSED): Likewise.
>> (MASK_ZVKSH): Likewise.
>> (MASK_ZVKN): Likewise.
>> (MASK_ZVKNC): Likewise.
>> (MASK_ZVKNG): Likewise.
>> (MASK_ZVKS): Likewise.
>> (MASK_ZVKSC): Likewise.
>> (MASK_ZVKSG): Likewise.
>> (MASK_ZVKT): Likewise.
>> (TARGET_ZVKG): Likewise.
>> (TARGET_ZVKNED): Likewise.
>> (TARGET_ZVKNHA): Likewise.
>> (TARGET_ZVKNHB): Likewise.
>> (TARGET_ZVKSED): Likewise.
>> (TARGET_ZVKSH): Likewise.
>> (TARGET_ZVKN): Likewise.
>> (TARGET_ZVKNC): Likewise.
>> (TARGET_ZVKNG): Likewise.
>> (TARGET_ZVKS): Likewise.
>> (TARGET_ZVKSC): Likewise.
>> (TARGET_ZVKSG): Likewise.
>> (TARGET_ZVKT): Likewise.
>> * config/riscv/riscv.opt: Introduction of riscv_zv{b,k}_subext.
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/riscv/zvbb.c: New test.
>> * gcc.target/riscv/zvbc.c: New test.
>> * gcc.target/riscv/zvkg.c: New test.
>> * gcc.target/riscv/zvkn-1.c: New test.
>> * gcc.target/riscv/zvkn.c: New test.
>> * gcc.target/riscv/zvknc-1.c: New test.
>> * gcc.target/riscv/zvknc-2.c: New test.
>> * gcc.target/riscv/zvknc.c: New test.
>> * gcc.target/riscv/zvkned.c: New test.
>> * gcc.target/riscv/zvkng-1.c: New test.
>> * gcc.target/riscv/zvkng-2.c: New test.
>> * gcc.target/riscv/zvkng.c: New test.
>> * gcc.target/riscv/zvknha.c: New test.
>> * gcc.target/riscv/zvknhb.c: New test.
>> * gcc.target/riscv/zvks-1.c: New test.
>> * gcc.target/riscv/zvks.c: New test.
>> * gcc.target/riscv/zvksc-1.c: New test.
>> * gcc.target/riscv/zvksc-2.c: New test.
>> * gcc.target/riscv/zvksc.c: New test.
>> * gcc.target/riscv/zvksed.c: New test.
>> * gcc.target/riscv/zvksg-1.c: New test.
>> * gcc.target/riscv/zvksg-2.c: New test.
>> * gcc.target/riscv/zvksg.c: New test.
>> * gcc.target/riscv/zvksh.c: New test.
>> * gcc.target/riscv/zvkt.c: New test.
>>
>> Signed-off-by: Christoph Müllner 
>> ---
>> Changes for v2:
>> - Update patch for specification version v20230620
>>
>>  gcc/common/config/riscv/riscv-common.cc  | 55 
>>  gcc/config/riscv/arch-canonicalize   |  7 +++
>>  gcc/config/riscv/riscv-opts.h| 34 +++
>>  gcc/config/riscv/riscv.opt   |  6 +++
>>  gcc/testsuite/gcc.target/riscv/zvbb.c| 13 ++
>>  gcc/testsuite/gcc.target/riscv/zvbc.c| 13 ++
>>  gcc/testsuite/gcc.target/riscv/zvkg.c| 13 ++
>>  gcc/testsuite/gcc.target/riscv/zvkn-1.c  | 29 +
>>  gcc/testsuite/gcc.target/riscv/zvkn.c| 29 +
>>  gcc/testsuite/gcc.target/riscv/zvknc-1.c | 37 
>>  gcc/testsuite/gcc.target/riscv/zvknc-2.c | 37 
>>  gcc/testsuite/gcc.target/riscv/zvknc.c   | 37 
>>  gcc/testsuite/gcc.target/riscv/zvkned.c  | 13 ++
>>  gcc/testsuite/gcc.target/riscv/zvkng-1.c | 37 
>>  gcc/testsuite/gcc.target/riscv/zvkng-2.c | 37 
>>  gcc/testsuite/gcc.target/riscv/zvkng.c   | 37 
>>  gcc/testsuite/gcc.target/riscv/zvknha.c  | 13 ++
>>  gcc/testsuite/gcc.target/riscv/zvknhb.c  | 13 ++
>>  gcc/testsuite/gcc.target/riscv/zvks-1.c  | 29 +
>>  gcc/testsuite/gcc.target/riscv/zvks.c| 29 +
>>  gcc/testsuite/gcc.target/riscv/zvksc-1.c | 37

[PATCH 5/5] OpenMP: Array shaping operator and strided "target update" for C

2023-07-03 Thread Julian Brown

Following the similar support for C++ and Fortran, here is the
C implementation for the OpenMP 5.0 array-shaping operator, and for
strided and rectangular updates for "target update".

Much of the implementation is shared with the C++ support added earlier
in this patch series.  Some details of parsing necessarily differ for C,
but the general ideas are the same.

2023-07-03  Julian Brown  

gcc/c/
* c-parser.cc (c_parser_braced_init): Disallow array-shaping operator
in braced init.
(c_parser_conditional_expression): Disallow array-shaping operator in
conditional expression.
(c_parser_cast_expression): Add array-shaping operator support.
(c_parser_postfix_expression): Disallow array-shaping operator in
statement expressions.
(c_parser_postfix_expression_after_primary): Add OpenMP array section
stride support.
(c_parser_expr_list): Disallow array-shaping operator in expression
lists.
(c_array_type_nelts_top, c_array_type_nelts_total): New functions.
(c_parser_omp_variable_list): Support array-shaping operator.
(c_parser_omp_target_update): Recognize GOMP_MAP_TO_GRID and
GOMP_MAP_FROM_GRID map kinds as well as OMP_CLAUSE_TO/OMP_CLAUSE_FROM.
* c-tree.h (c_omp_array_shaping_op_p, c_omp_has_array_shape_p): New
extern declarations.
(create_omp_arrayshape_type): Add prototype.
* c-typeck.cc (c_omp_array_shaping_op_p, c_omp_has_array_shape_p): New
globals.
(build_omp_array_section): Permit integral types, not just integer
constants, when creating array types for array sections.
(create_omp_arrayshape_type): New function.
(handle_omp_array_sections_1): Add DISCONTIGUOUS parameter.  Add
strided/rectangular array section support.
(omp_array_section_low_bound): New function.
(handle_omp_array_sections): Add DISCONTIGUOUS parameter.  Add
strided/rectangular array section support.
(c_finish_omp_clauses): Update calls to handle_omp_array_sections.
Handle discontiguous updates.

gcc/testsuite/
* gcc.dg/gomp/bad-array-shaping-c-1.c: New test.
* gcc.dg/gomp/bad-array-shaping-c-2.c: New test.
* gcc.dg/gomp/bad-array-shaping-c-3.c: New test.
* gcc.dg/gomp/bad-array-shaping-c-4.c: New test.
* gcc.dg/gomp/bad-array-shaping-c-5.c: New test.
* gcc.dg/gomp/bad-array-shaping-c-6.c: New test.
* gcc.dg/gomp/bad-array-shaping-c-7.c: New test.

libgomp/
* testsuite/libgomp.c/array-shaping-1.c: New test.
* testsuite/libgomp.c/array-shaping-2.c: New test.
* testsuite/libgomp.c/array-shaping-3.c: New test.
* testsuite/libgomp.c/array-shaping-4.c: New test.
* testsuite/libgomp.c/array-shaping-5.c: New test.
* testsuite/libgomp.c/array-shaping-6.c: New test.
---
 gcc/c/c-parser.cc | 301 +-
 gcc/c/c-tree.h|   4 +
 gcc/c/c-typeck.cc | 241 --
 .../gcc.dg/gomp/bad-array-shaping-c-1.c   |  26 ++
 .../gcc.dg/gomp/bad-array-shaping-c-2.c   |  24 ++
 .../gcc.dg/gomp/bad-array-shaping-c-3.c   |  30 ++
 .../gcc.dg/gomp/bad-array-shaping-c-4.c   |  27 ++
 .../gcc.dg/gomp/bad-array-shaping-c-5.c   |  17 +
 .../gcc.dg/gomp/bad-array-shaping-c-6.c   |  26 ++
 .../gcc.dg/gomp/bad-array-shaping-c-7.c   |  15 +
 libgomp/testsuite/libgomp.c/array-shaping-1.c | 236 ++
 libgomp/testsuite/libgomp.c/array-shaping-2.c |  39 +++
 libgomp/testsuite/libgomp.c/array-shaping-3.c |  42 +++
 libgomp/testsuite/libgomp.c/array-shaping-4.c |  36 +++
 libgomp/testsuite/libgomp.c/array-shaping-5.c |  38 +++
 libgomp/testsuite/libgomp.c/array-shaping-6.c |  45 +++
 16 files changed, 1099 insertions(+), 48 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/gomp/bad-array-shaping-c-1.c
 create mode 100644 gcc/testsuite/gcc.dg/gomp/bad-array-shaping-c-2.c
 create mode 100644 gcc/testsuite/gcc.dg/gomp/bad-array-shaping-c-3.c
 create mode 100644 gcc/testsuite/gcc.dg/gomp/bad-array-shaping-c-4.c
 create mode 100644 gcc/testsuite/gcc.dg/gomp/bad-array-shaping-c-5.c
 create mode 100644 gcc/testsuite/gcc.dg/gomp/bad-array-shaping-c-6.c
 create mode 100644 gcc/testsuite/gcc.dg/gomp/bad-array-shaping-c-7.c
 create mode 100644 libgomp/testsuite/libgomp.c/array-shaping-1.c
 create mode 100644 libgomp/testsuite/libgomp.c/array-shaping-2.c
 create mode 100644 libgomp/testsuite/libgomp.c/array-shaping-3.c
 create mode 100644 libgomp/testsuite/libgomp.c/array-shaping-4.c
 create mode 100644 libgomp/testsuite/libgomp.c/array-shaping-5.c
 create mode 100644 libgomp/testsuite/libgomp.c/array-shaping-6.c

diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index 280426ddf10..7e895e11da2 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -5764,7 +5764,9 @@ c_parser_braced_init (c_parser *parser,

[PATCH 4/5] OpenMP: Noncontiguous "target update" for Fortran

2023-07-03 Thread Julian Brown

This patch implements noncontiguous "target update" for Fortran.
The existing middle end/runtime bits relating to C++ support are reused,
with some small adjustments, e.g.:

  1. The node used to map the OMP "array descriptor" (from omp-low.cc
 onwards) now uses the OMP_CLAUSE_SIZE field as a bias (the difference
 between the "virtual origin" element with zero indices in each
 dimension and the first element actually stored in memory).

  2. The OMP_CLAUSE_SIZE field of a GOMP_MAP_DIM_STRIDE node may now be
 used to store a "span", which is the distance in bytes between
 two adjacent elements in an array (with unit stride) when that is
 different from the element size, as it can be in Fortran.

The implementation goes to some effort to massage Fortran array metadata
(array descriptors) into a form that can ultimately be consumed by
omp_target_memcpy_rect_worker. The method for doing this is described
in comments in the patch body.

2023-07-03  Julian Brown  

gcc/fortran/
* trans-openmp.cc (gfc_omp_deep_map_kind_p): Handle
GOMP_MAP_{TO,FROM}_GRID, GOMP_MAP_GRID_{DIM,STRIDE}.
(gfc_trans_omp_arrayshape_type, gfc_omp_calculate_gcd,
gfc_desc_to_omp_noncontig_array, gfc_omp_contiguous_update_p): New
functions.
(gfc_trans_omp_clauses): Handle noncontiguous to/from clauses for OMP
"target update" directives.

gcc/
* gimplify.cc (gimplify_adjust_omp_clauses): Don't gimplify
VIEW_CONVERT_EXPR away in GOMP_MAP_TO_GRID/GOMP_MAP_FROM_GRID clauses.
* omp-low.cc (omp_noncontig_descriptor_type): Add SPAN field.
(scan_sharing_clauses): Don't store descriptor size in its
OMP_CLAUSE_SIZE field.
(lower_omp_target): Add missing OMP_CLAUSE_MAP check.  Add special-case
string handling.  Handle span and bias.  Use low bound instead of zero
as index for trailing full dimensions.

libgomp/
* libgomp.h (omp_noncontig_array_desc): Add span field.
* target.c (omp_target_memcpy_rect_worker): Add span parameter. Update
forward declaration. Handle span != element_size.
(gomp_update): Handle bias in descriptor's size slot.  Update calls to
omp_target_memcpy_rect_worker.
* testsuite/libgomp.fortran/noncontig-updates-1.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-2.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-3.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-4.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-5.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-6.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-7.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-8.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-9.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-10.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-11.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-12.f90: New test.
* testsuite/libgomp.fortran/noncontig-updates-13.f90: New test.

gcc/testsuite/
* gfortran.dg/gomp/noncontig-updates-1.f90: New test.
* gfortran.dg/gomp/noncontig-updates-2.f90: New test.
* gfortran.dg/gomp/noncontig-updates-3.f90: New test.
* gfortran.dg/gomp/noncontig-updates-4.f90: New test.
---
 gcc/fortran/trans-openmp.cc   | 500 ++
 gcc/gimplify.cc   |  10 +
 gcc/omp-low.cc|  73 ++-
 .../gfortran.dg/gomp/noncontig-updates-1.f90  |  19 +
 .../gfortran.dg/gomp/noncontig-updates-2.f90  |  16 +
 .../gfortran.dg/gomp/noncontig-updates-3.f90  |  16 +
 .../gfortran.dg/gomp/noncontig-updates-4.f90  |  15 +
 libgomp/libgomp.h |   1 +
 libgomp/target.c  |  47 +-
 .../libgomp.fortran/noncontig-updates-1.f90   |  54 ++
 .../libgomp.fortran/noncontig-updates-10.f90  |  29 +
 .../libgomp.fortran/noncontig-updates-11.f90  |  51 ++
 .../libgomp.fortran/noncontig-updates-12.f90  |  59 +++
 .../libgomp.fortran/noncontig-updates-13.f90  |  42 ++
 .../libgomp.fortran/noncontig-updates-2.f90   | 101 
 .../libgomp.fortran/noncontig-updates-3.f90   |  47 ++
 .../libgomp.fortran/noncontig-updates-4.f90   |  78 +++
 .../libgomp.fortran/noncontig-updates-5.f90   |  55 ++
 .../libgomp.fortran/noncontig-updates-6.f90   |  34 ++
 .../libgomp.fortran/noncontig-updates-7.f90   |  36 ++
 .../libgomp.fortran/noncontig-updates-8.f90   |  39 ++
 .../libgomp.fortran/noncontig-updates-9.f90   |  34 ++
 22 files changed, 1325 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/gomp/noncontig-updates-1.f90
 create mode 100644 gcc/testsuite/gfortran.dg/gomp/noncontig-updates-2.f90
 create mode 100644 gcc/testsuite/gfortran.dg/gomp/noncontig-updates-3.f90
 create mode

[PATCH 2/5] OpenMP: Allow complete replacement of clause during map/to/from expansion

2023-07-03 Thread Julian Brown

At present, map/to/from clauses on OpenMP "target" directives may be
expanded into several mapping nodes if they describe array sections with
pointer or reference bases, or similar.  This patch allows the original
clause to be replaced during that expansion, mostly by passing the list
pointer to the node to various functions rather than the node itself.

This is needed by the following patch. There shouldn't be any functional
changes introduced by this patch itself.

2023-07-03  Julian Brown  

gcc/c-family/
* c-common.h (expand_array_base, expand_component_selector,
expand_map_clause): Adjust member declarations.
* c-omp.cc (omp_expand_access_chain): Pass and return pointer to
clause.
(c_omp_address_inspector::expand_array_base): Likewise.
(c_omp_address_inspector::expand_component_selector): Likewise.
(c_omp_address_inspector::expand_map_clause): Likewise.

gcc/c/
* c-typeck.cc (handle_omp_array_sections): Pass pointer to clause to
process instead of clause.
(c_finish_omp_clauses): Update calls to handle_omp_array_sections.
Handle cases where initial clause might be replaced.

gcc/cp/
* semantics.cc (handle_omp_array_sections): Pass pointer to clause
instead of clause.  Add PNEXT return parameter for next clause in list
to process.
(finish_omp_clauses): Update calls to handle_omp_array_sections.
Handle cases where initial clause might be replaced.
---
 gcc/c-family/c-common.h | 12 +++
 gcc/c-family/c-omp.cc   | 75 +
 gcc/c/c-typeck.cc   | 32 +++---
 gcc/cp/semantics.cc | 37 +---
 4 files changed, 88 insertions(+), 68 deletions(-)

diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index acd0c861a55..756358f3fd8 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -1375,12 +1375,12 @@ public:
 
   bool maybe_zero_length_array_section (tree);
 
-  tree expand_array_base (tree, vec &, tree, unsigned *,
- c_omp_region_type, bool);
-  tree expand_component_selector (tree, vec &, tree,
- unsigned *);
-  tree expand_map_clause (tree, tree, vec &,
- c_omp_region_type);
+  tree * expand_array_base (tree *, vec &, tree, unsigned *,
+   c_omp_region_type, bool);
+  tree * expand_component_selector (tree *, vec &, tree,
+   unsigned *);
+  tree * expand_map_clause (tree *, tree, vec &,
+   c_omp_region_type);
 };
 
 enum c_omp_directive_kind {
diff --git a/gcc/c-family/c-omp.cc b/gcc/c-family/c-omp.cc
index 16b620fcb3d..17f3d71c655 100644
--- a/gcc/c-family/c-omp.cc
+++ b/gcc/c-family/c-omp.cc
@@ -4130,11 +4130,12 @@ 
c_omp_address_inspector::maybe_zero_length_array_section (tree clause)
expression types here, because e.g. you can't have an array of
references.  See also gimplify.cc:omp_expand_access_chain.  */
 
-static tree
-omp_expand_access_chain (tree c, tree expr, vec<omp_addr_token *> &addr_tokens,
-unsigned *idx)
+static tree *
+omp_expand_access_chain (tree *pc, tree expr,
+vec<omp_addr_token *> &addr_tokens, unsigned *idx)
 {
   using namespace omp_addr_tokenizer;
+  tree c = *pc;
   location_t loc = OMP_CLAUSE_LOCATION (c);
   unsigned i = *idx;
   tree c2 = NULL_TREE;
@@ -4172,35 +4173,36 @@ omp_expand_access_chain (tree c, tree expr, 
vec _tokens,
   break;
 
 default:
-  return error_mark_node;
+  return NULL;
 }
 
   if (c2)
 {
   OMP_CLAUSE_CHAIN (c2) = OMP_CLAUSE_CHAIN (c);
   OMP_CLAUSE_CHAIN (c) = c2;
-  c = c2;
+  pc = &OMP_CLAUSE_CHAIN (c);
 }
 
   *idx = ++i;
 
   if (i < addr_tokens.length ()
   && addr_tokens[i]->type == ACCESS_METHOD)
-return omp_expand_access_chain (c, expr, addr_tokens, idx);
+return omp_expand_access_chain (pc, expr, addr_tokens, idx);
 
-  return c;
+  return pc;
 }
 
 /* Translate "array_base_decl access_method" to OMP mapping clauses.  */
 
-tree
-c_omp_address_inspector::expand_array_base (tree c,
+tree *
+c_omp_address_inspector::expand_array_base (tree *pc,
 vec<omp_addr_token *> &addr_tokens,
tree expr, unsigned *idx,
c_omp_region_type ort,
bool decl_p)
 {
   using namespace omp_addr_tokenizer;
+  tree c = *pc;
   location_t loc = OMP_CLAUSE_LOCATION (c);
   int i = *idx;
   tree decl = addr_tokens[i + 1]->expr;
@@ -4225,7 +4227,7 @@ c_omp_address_inspector::expand_array_base (tree c,
  || OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_DETACH))
 {
   *idx = ++i;
-  return c;
+  return pc;
 }
 
   switch (addr_tokens[i + 1]->u.access_kind)
@@ -4474,7 +4476,7 @@ c_omp_address_inspector::expand_array_base (tree c,
 
 default:
   *idx = i +

[PATCH 0/5] [og13] OpenMP: strides, rectangular updates and array-shaping operator for "target update"

2023-07-03 Thread Julian Brown

This patch series adds support for the array-shaping operator from OpenMP
5.0, and strided and rectangular transfers for "target update" directives.
The patches were previously posted for mainline here:

  https://gcc.gnu.org/pipermail/gcc-patches/2023-March/613785.html (C++)
  https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616921.html (Fortran)
  https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618738.html (C)

This time the patches have been merged to the og13 branch (mostly
straightforward, though note the conflict described in patch 3/5).

Re-tested with offloading to AMD GCN.  I will apply shortly.

Julian Brown (5):
  OpenMP: Fix "exit data" for array sections for ref-to-ptr components
  OpenMP: Allow complete replacement of clause during map/to/from
expansion
  OpenMP: Support strided and shaped-array updates for C++
  OpenMP: Noncontiguous "target update" for Fortran
  OpenMP: Array shaping operator and strided "target update" for C

 gcc/c-family/c-common.h   |  12 +-
 gcc/c-family/c-omp.cc | 277 --
 gcc/c-family/c-pretty-print.cc|   5 +
 gcc/c/c-parser.cc | 331 +++-
 gcc/c/c-tree.h|   6 +-
 gcc/c/c-typeck.cc | 287 --
 gcc/cp/cp-objcp-common.cc |   1 +
 gcc/cp/cp-tree.def|   1 +
 gcc/cp/cp-tree.h  |  13 +-
 gcc/cp/decl.cc|  75 +++
 gcc/cp/decl2.cc   |  19 +-
 gcc/cp/error.cc   |   5 +
 gcc/cp/mangle.cc  |   1 +
 gcc/cp/operators.def  |   1 +
 gcc/cp/parser.cc  | 303 ++-
 gcc/cp/parser.h   |   7 +
 gcc/cp/pt.cc  |  39 +-
 gcc/cp/semantics.cc   | 289 --
 gcc/cp/typeck.cc  |  12 +-
 gcc/fortran/trans-openmp.cc   | 500 ++
 gcc/gimplify.cc   |  84 ++-
 gcc/omp-general.cc|  47 ++
 gcc/omp-general.h |   4 +-
 gcc/omp-low.cc| 459 +++-
 gcc/testsuite/g++.dg/gomp/array-shaping-1.C   |  22 +
 gcc/testsuite/g++.dg/gomp/array-shaping-2.C   | 134 +
 .../g++.dg/gomp/bad-array-shaping-1.C |  47 ++
 .../g++.dg/gomp/bad-array-shaping-2.C |  52 ++
 .../g++.dg/gomp/bad-array-shaping-3.C |  53 ++
 .../g++.dg/gomp/bad-array-shaping-4.C |  60 +++
 .../g++.dg/gomp/bad-array-shaping-5.C |  55 ++
 .../g++.dg/gomp/bad-array-shaping-6.C |  59 +++
 .../g++.dg/gomp/bad-array-shaping-7.C |  48 ++
 .../g++.dg/gomp/bad-array-shaping-8.C |  50 ++
 .../gcc.dg/gomp/bad-array-shaping-c-1.c   |  26 +
 .../gcc.dg/gomp/bad-array-shaping-c-2.c   |  24 +
 .../gcc.dg/gomp/bad-array-shaping-c-3.c   |  30 ++
 .../gcc.dg/gomp/bad-array-shaping-c-4.c   |  27 +
 .../gcc.dg/gomp/bad-array-shaping-c-5.c   |  17 +
 .../gcc.dg/gomp/bad-array-shaping-c-6.c   |  26 +
 .../gcc.dg/gomp/bad-array-shaping-c-7.c   |  15 +
 .../gfortran.dg/gomp/noncontig-updates-1.f90  |  19 +
 .../gfortran.dg/gomp/noncontig-updates-2.f90  |  16 +
 .../gfortran.dg/gomp/noncontig-updates-3.f90  |  16 +
 .../gfortran.dg/gomp/noncontig-updates-4.f90  |  15 +
 gcc/tree-pretty-print.cc  |  17 +
 gcc/tree.def  |   2 +-
 include/gomp-constants.h  |   7 +-
 libgomp/libgomp.h |  15 +
 libgomp/target.c  | 261 ++---
 .../testsuite/libgomp.c++/array-shaping-1.C   | 469 
 .../testsuite/libgomp.c++/array-shaping-10.C  |  61 +++
 .../testsuite/libgomp.c++/array-shaping-11.C  |  63 +++
 .../testsuite/libgomp.c++/array-shaping-12.C  |  65 +++
 .../testsuite/libgomp.c++/array-shaping-13.C  |  89 
 .../testsuite/libgomp.c++/array-shaping-2.C   |  38 ++
 .../testsuite/libgomp.c++/array-shaping-3.C   |  38 ++
 .../testsuite/libgomp.c++/array-shaping-4.C   |  38 ++
 .../testsuite/libgomp.c++/array-shaping-5.C   |  38 ++
 .../testsuite/libgomp.c++/array-shaping-6.C   |  54 ++
 .../testsuite/libgomp.c++/array-shaping-7.C   |  54 ++
 .../testsuite/libgomp.c++/array-shaping-8.C   |  65 +++
 .../testsuite/libgomp.c++/array-shaping-9.C   |  95 
 libgomp/testsuite/libgomp.c/array-shaping-1.c | 236 +
 libgomp/testsuite/libgomp.c/array-shaping-2.c |  39 ++
 libgomp/testsuite/libgomp.c/array-shaping-3.c |  42 ++
 libgomp/testsuite/libgomp.c/array-shaping-4.c |  36 ++
 libgomp/testsuite/libgomp.c/array-shaping-5.c |  38 ++
 libgomp/testsuite/libgomp.c/array-shaping-6.c |  45 ++
 .../libgomp.fortran/noncontig-updates-1.f90   |  54 ++

[PATCH 1/5] OpenMP: Fix "exit data" for array sections for ref-to-ptr components

2023-07-03 Thread Julian Brown

This patch fixes "exit data" for (C++) reference-to-pointer struct
components with array sections, such as:

  struct S { int *&ptr; [...] };
  ...
  #pragma omp target exit data map(from: str->ptr, str->ptr[0:n])

Such exits need two "detach" operations. We need to unmap
both the pointer and the slice. That idiom is recognized by
omp_resolve_clause_dependencies, but before omp_build_struct_sibling_lists
finishes, the resulting mapping nodes are represented like this:

  GOMP_MAP_FROM GOMP_MAP_DETACH GOMP_MAP_ATTACH_DETACH

And at the moment, that won't be recognized as a single mapping group
as it should be. This patch fixes that.

(This is covered by a test case added in later patches in this series,
e.g. libgomp/testsuite/libgomp.c++/array-shaping-8.C.)

2023-07-03  Julian Brown  

gcc/
* gimplify.cc (omp_get_attachment): Handle GOMP_MAP_DETACH here.
(omp_group_last): Handle *, GOMP_MAP_DETACH, GOMP_MAP_ATTACH_DETACH
groups for "exit data" of reference-to-pointer component array
sections.
(omp_group_base): Handle GOMP_MAP_DETACH.
---
 gcc/gimplify.cc | 30 ++
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 20aba45110f..6280eb7e028 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -9171,6 +9171,7 @@ omp_get_attachment (omp_mapping_group *grp)
 
  case GOMP_MAP_ATTACH_DETACH:
  case GOMP_MAP_ATTACH_ZERO_LENGTH_ARRAY_SECTION:
+ case GOMP_MAP_DETACH:
return OMP_CLAUSE_DECL (node);
 
  default:
@@ -9247,23 +9248,43 @@ omp_group_last (tree *start_p)
 == GOMP_MAP_POINTER_TO_ZERO_LENGTH_ARRAY_SECTION)
 || (OMP_CLAUSE_MAP_KIND (nc)
 == GOMP_MAP_ATTACH_ZERO_LENGTH_ARRAY_SECTION)
+|| OMP_CLAUSE_MAP_KIND (nc) == GOMP_MAP_DETACH
 || OMP_CLAUSE_MAP_KIND (nc) == GOMP_MAP_ALWAYS_POINTER
 || omp_map_clause_descriptor_p (nc)))
{
- grp_last_p = &OMP_CLAUSE_CHAIN (c);
- c = nc;
  tree nc2 = OMP_CLAUSE_CHAIN (nc);
+ if (OMP_CLAUSE_MAP_KIND (nc) == GOMP_MAP_DETACH)
+   {
+ /* In the specific case we're doing "exit data" on an array
+slice of a reference-to-pointer struct component, we will see
+DETACH followed by ATTACH_DETACH here.  We want to treat that
+as a single group. In other cases DETACH might represent a
+stand-alone "detach" clause, so we don't want to consider
+that part of the group.  */
+ if (nc2
+ && OMP_CLAUSE_CODE (nc2) == OMP_CLAUSE_MAP
+ && OMP_CLAUSE_MAP_KIND (nc2) == GOMP_MAP_ATTACH_DETACH)
+   goto consume_two_nodes;
+ else
+   break;
+   }
  if (nc2
  && OMP_CLAUSE_CODE (nc2) == OMP_CLAUSE_MAP
  && (OMP_CLAUSE_MAP_KIND (nc)
  == GOMP_MAP_POINTER_TO_ZERO_LENGTH_ARRAY_SECTION)
  && OMP_CLAUSE_MAP_KIND (nc2) == GOMP_MAP_ATTACH)
{
+   consume_two_nodes:
 grp_last_p = &OMP_CLAUSE_CHAIN (nc);
  c = nc2;
- nc2 = OMP_CLAUSE_CHAIN (nc2);
+ nc = OMP_CLAUSE_CHAIN (nc2);
+   }
+ else
+   {
+ grp_last_p = &OMP_CLAUSE_CHAIN (c);
+ c = nc;
+ nc = nc2;
}
-  nc = nc2;
}
   break;
 
@@ -9416,6 +9437,7 @@ omp_group_base (omp_mapping_group *grp, unsigned int 
*chained,
case GOMP_MAP_ALWAYS_POINTER:
case GOMP_MAP_ATTACH_DETACH:
case GOMP_MAP_ATTACH_ZERO_LENGTH_ARRAY_SECTION:
+   case GOMP_MAP_DETACH:
  return *grp->grp_start;
 
default:
-- 
2.25.1

Re: [PATCH] Fortran: fixes for procedures with ALLOCATABLE,INTENT(OUT) arguments [PR92178]

2023-07-03 Thread Harald Anlauf via Gcc-patches


Hi Mikael,

Am 03.07.23 um 13:46 schrieb Mikael Morin:

A few things to double check below.


diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 30946ba3f63..16e8f037cfc 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc

(...)

@@ -6117,6 +6118,33 @@ gfc_conv_procedure_call (gfc_se * se,
gfc_symbol * sym,
    && UNLIMITED_POLY (sym)
    && comp && (strcmp ("_copy", comp->name) == 0);

+  /* First scan argument list for allocatable actual arguments passed to
+ allocatable dummy arguments with INTENT(OUT).  As the corresponding
+ actual arguments are deallocated before execution of the
procedure, we
+ evaluate actual argument expressions to avoid problems with
possible
+ dependencies.  */
+  bool force_eval_args = false;
+  gfc_formal_arglist *tmp_formal;
+  for (arg = args, tmp_formal = formal; arg != NULL;
+   arg = arg->next, tmp_formal = tmp_formal ? tmp_formal->next :
NULL)
+    {
+  e = arg->expr;
+  fsym = tmp_formal ? tmp_formal->sym : NULL;
+  if (e && fsym
+  && e->expr_type == EXPR_VARIABLE
+  && fsym->attr.intent == INTENT_OUT
+  && (fsym->ts.type == BT_CLASS && fsym->attr.class_ok
+  ? CLASS_DATA (fsym)->attr.allocatable
+  : fsym->attr.allocatable)
+  && e->symtree
+  && e->symtree->n.sym
+  && gfc_variable_attr (e, NULL).allocatable)
+    {
+  force_eval_args = true;
+  break;
+    }
+    }
+

The function is already big enough, would you mind outlining this to its
own function?


This can be done.  At least it is not part of the monster loop.




   /* Evaluate the arguments.  */
   for (arg = args, argc = 0; arg != NULL;
    arg = arg->next, formal = formal ? formal->next : NULL, ++argc)
@@ -6680,7 +6708,7 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol
* sym,
   else
 tmp = gfc_finish_block ();

-  gfc_add_expr_to_block (>pre, tmp);
+  gfc_add_expr_to_block (_blk, tmp);
 }

   /* A class array element needs converting back to be a
@@ -6980,7 +7008,7 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol
* sym,
 build_empty_stmt (input_location));
   }
 if (tmp != NULL_TREE)
-  gfc_add_expr_to_block (>pre, tmp);
+  gfc_add_expr_to_block (_blk, tmp);
   }

   tmp = parmse.expr;
@@ -7004,7 +7032,7 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol
* sym,
  void_type_node,
  gfc_conv_expr_present (e->symtree->n.sym),
    tmp, build_empty_stmt (input_location));
-  gfc_add_expr_to_block (>pre, tmp);
+  gfc_add_expr_to_block (_blk, tmp);
 }
 }
 }

These look good, but I'm surprised that there is no similar change at
the 6819 line.
This is the class array actual vs class array dummy case.
It seems to be checked by the "bar" subroutine in your testcase, except
that the intent(out) argument comes last there, whereas it was coming
first with the original testcases in the PR.
Can you double check?


I believe I tried that before and encountered regressions.
The change

diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 16e8f037cfc..43e013fa720 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -6844,7 +6844,8 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol *
sym,
  else
tmp = gfc_finish_block ();

- gfc_add_expr_to_block (>pre, tmp);
+//   gfc_add_expr_to_block (>pre, tmp);
+ gfc_add_expr_to_block (_blk, tmp);
}

  /* The conversion does not repackage the reference to a class

regresses on:
gfortran.dg/class_array_16.f90
gfortran.dg/finalize_12.f90
gfortran.dg/optional_class_1.f90

A simplified testcase for further study:

program p
  implicit none
  class(*),  allocatable :: c(:)
  c = [3, 4]
  call bar (allocated (c), c, allocated (c))
  if (allocated (c)) stop 14
contains
  subroutine bar (alloc, x, alloc2)
logical :: alloc, alloc2
class(*), allocatable, intent(out) :: x(:)
if (allocated (x)) stop 5
if (.not. alloc)   stop 6
if (.not. alloc2)  stop 16
  end subroutine bar
end

(This fails in a different place for the posted patch and for
the above trial change.  Need to go to the drawing board...)



@@ -7101,6 +7129,21 @@ gfc_conv_procedure_call (gfc_se * se,
gfc_symbol * sym,
 }
 }

+  /* If any actual argument of the procedure is allocatable and
passed
+ to an allocatable dummy with INTENT(OUT), we conservatively
+ evaluate all actual argument expressions before deallocations are
+ performed and the procedure is executed.  This ensures we conform
+ to F2023:15.5.3, 15.5.4.  Create temporaries except for constants,
+ variables, and functions returning pointers that can appear in a
+ variable

[pushed] testsuite, Darwin: Remove an unnecessary flags addition.

2023-07-03 Thread Iain Sandoe via Gcc-patches

This has been in use for some time in the Darwin branches that are used
by downstream distributions. Re-tested on x86_64-darwin, pushed to trunk,
thanks,
Iain

--- 8< ---

The addition of the multiply_defined suppress flag has been handled for some
considerable time now in the Darwin specs; remove it from the testsuite libs.
Avoid duplicates in the specs.

Signed-off-by: Iain Sandoe 

gcc/ChangeLog:

* config/darwin.h: Avoid duplicate multiply_defined specs on
earlier Darwin versions with shared libgcc.

libstdc++-v3/ChangeLog:

* testsuite/lib/libstdc++.exp: Remove additional flag handled
by Darwin specs.

gcc/testsuite/ChangeLog:

* lib/g++.exp: Remove additional flag handled by Darwin specs.
* lib/obj-c++.exp: Likewise.
---
 gcc/config/darwin.h  | 5 ++---
 gcc/testsuite/lib/g++.exp| 4 
 gcc/testsuite/lib/obj-c++.exp| 4 
 libstdc++-v3/testsuite/lib/libstdc++.exp | 3 ---
 4 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/gcc/config/darwin.h b/gcc/config/darwin.h
index e6f76e598e6..714d3d5cc0d 100644
--- a/gcc/config/darwin.h
+++ b/gcc/config/darwin.h
@@ -217,8 +217,7 @@ extern GTY(()) int darwin_ms_struct;
   "%{image_base*:-Xlinker -image_base -Xlinker %*} %= 10.7 mmacosx-version-min= -no_pie) }"
 
 #define DARWIN_CC1_SPEC
\
-  "%

[PATCH] vect: Treat vector widening IFN calls as 'simple' [PR110436]

2023-07-03 Thread Andre Vieira (lists) via Gcc-patches


Hi,

This patch makes the vectorizer treat any vector widening IFN as simple,
like it did with the tree codes VEC_WIDEN_*.

I wasn't sure whether I should make all IFN's simple and then exclude 
some (like GOMP_ ones), or include more than just the new widening IFNs. 
But since this is the only behaviour that changed with the ifn patch, I 
decided to only special case the widening IFNs for now. Let me know if 
you have different thoughts on this.


Bootstrapped and regression tested on aarch64-unknown-linux-gnu.

gcc/ChangeLog:

PR tree-optimization/110436
* tree-vect-stmts.cc (is_simple_and_all_uses_invariant): Treat widening
IFN's as simple.

gcc/testsuite/ChangeLog:

* gcc.dg/pr110436.c: New test.diff --git a/gcc/testsuite/gcc.dg/pr110436.c b/gcc/testsuite/gcc.dg/pr110436.c
new file mode 100644
index 
..c146f99fac9f0524eaa3b1230b56e9f94eed5bda
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr110436.c
@@ -0,0 +1,5 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include "pr83089.c"
+
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
d642d3c257f8d540a8562eedbcd40372b9550959..706055e9af94f0c1500c25faf4bd74fc08bf3cd6
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -296,8 +296,11 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
   tree op;
   ssa_op_iter iter;
 
-  gassign *stmt = dyn_cast  (stmt_info->stmt);
-  if (!stmt)
+  gimple *stmt = stmt_info->stmt;
+  if (!is_gimple_assign (stmt)
+  && !(is_gimple_call (stmt)
+  && gimple_call_internal_p (stmt)
+  && widening_fn_p (gimple_call_combined_fn (stmt
 return false;
 
   FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)

[PATCH v2] libstdc++: PSTL dispatch for C++20 range random access iterators [PR110512]

2023-07-03 Thread Gonzalo Brito Gadeschi via Gcc-patches

libstdc++: Recognize C++ random access iterators as random access in PSTL
[PR110432]

The check for random access iterators in the PSTL only checks whether the
iterator inherits from the random_access_iterator_tag, failing to recognize
random access iterators originating in C++20 ranges and views.

This patch extends the check to also recognize types that model the C++20
random_access_iterator concept as providing random access.

This is allowed by C++23's P2408, which is safe to backport to C++20,
because
any application that would break already exhibits undefined
behavior due to precondition violation.

libstdc++-v3/ChangeLog:
PR libstdc++/110512
* include/pstl/execution_impl.h: Recognize C++20 random access iterators as
random access.

Bootstrapping and testing
* Tested with x86_64-pc-linux-gnu.

---
 libstdc++-v3/include/pstl/execution_impl.h | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/pstl/execution_impl.h
b/libstdc++-v3/include/pstl/execution_impl.h
index 64f6cc4357a..c17da29141e 100644
--- a/libstdc++-v3/include/pstl/execution_impl.h
+++ b/libstdc++-v3/include/pstl/execution_impl.h
@@ -22,7 +22,15 @@ namespace __internal

 template 
 using __are_iterators_of = std::conjunction<
-std::is_base_of<_IteratorTag, typename
std::iterator_traits>::iterator_category>...>;
+#if __cplusplus >= 202002L
+std::disjunction<
+std::is_base_of<_IteratorTag, typename
std::iterator_traits>::iterator_category>,
+std::integral_constant>
+>...
+#else   // __cplusplus
+std::is_base_of<_IteratorTag, typename
std::iterator_traits>::iterator_category>...
+#endif  // __cplusplus
+>;

 template 
 using __are_random_access_iterators =
__are_iterators_of;
-- 
2.17.1

Re: [PATCH] rs6000: Update the vsx-vector-6.* tests.

2023-07-03 Thread Carl Love via Gcc-patches

Kewen:

On Fri, 2023-06-30 at 15:20 -0700, Carl Love wrote:
> Segher never liked the above way of looking at the assembly.  He
> prefers:
>   gcc -S -g -mcpu=power8 -o vsx-vector-6-func-2lop.s vsx-vector-6-
> func-
> 2lop.c
> 
>   grep xxlor vsx-vector-6-func-2lop.s | wc
>  34  68 516
> 
> So, again, I get the same count of 34 on both makalu and genoa.  But
> again, that doesn't agree with what make script/scan-assembler thinks
> the counts should be.
> 
> When I looked at the vsx-vector-6-func-2lop.s I see on BE:
> 
>  
> lxvd2x 0,10,9
> xxlor 0,12,0
> xxlnor 0,0,0
>  ...
> 
> I was guessing that it was adjusting the data layout from the load. 
> But looking again more carefully versus LE:
> 
> 
> lxvd2x 0,31,9 
>xxpermdi 0,0,0,2 
>xxlor 0,12,0  
>xxlnor 0,0,0  
>xxpermdi 0,0,0,2 
> 
> 
> the xxpermdi is probably what is really doing the data layout change.
> 
> So, we have the issue that looking at the assembly gives different
> instruction counts than what 
> 
>dg-final { scan-assembler-times {\mxxlor\M} }
> 
> comes up with???  Now I am really confused.  I don't know how the
> scan-
> assembler-times works but I will go see if I can find it and see if I
> can figure out what the issue is.  I would expect that the scan-
> assembler is working off the --save-temp files, which get deleted as
> part of the run.  I would guess that scan-assembler does a grep to
> find
> the instructions and then maybe uses wc to count them??? I will go
> see
> if I can figure out how scan-assembler-times works.

OK, I figured out why I was getting 34 xxlor instructions instead of
the 22 that the scan-assembler-times was getting.  The difference was
when I compiled the program I forgot to use -O2.  So with -O2 I get the
same number of xxlor instructions as scan-assembler-instructions.  I get
34 if I do not specify optimization.

So, I think the scan-assembler-times are all correct.

As Peter says, counting xxlor is a bit problematic in general.  We
could just drop counting xxlor or have the LE/BE count qualifier for
the instructions.  Your call.

 Carl

RE: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Li, Pan2 via Gcc-patches

Sure, every change needs testing; I will pay attention to this in future.

Pan

-Original Message-
From: Robin Dapp  
Sent: Monday, July 3, 2023 10:57 PM
To: Li, Pan2 ; juzhe.zh...@rivai.ai; gcc-patches 

Cc: rdapp@gmail.com; jeffreyalaw ; Wang, Yanzhang 
; kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

> Sorry for the inconvenience, still working on fixing it. If urgent I can
> revert this change to unblock your work ASAP.

I'm not blocked by this, thanks, just wanted to document it here.
I was testing another patch and needed to dig for a while until
I realized the FAILs come from this one.  In general I would
assume that even obvious patches are tested before (I have
introduced bugs by obvious ones before so I make sure to).

Regards
 Robin

Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Robin Dapp via Gcc-patches

> Sorry for the inconvenience, still working on fixing it. If urgent I can
> revert this change to unblock your work ASAP.

I'm not blocked by this, thanks, just wanted to document it here.
I was testing another patch and needed to dig for a while until
I realized the FAILs come from this one.  In general I would
assume that even obvious patches are tested before (I have
introduced bugs by obvious ones before so I make sure to).

Regards
 Robin

RE: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Li, Pan2 via Gcc-patches

Sorry for the inconvenience, still working on fixing it. If urgent I can revert this 
change to unblock your work ASAP.

Pan

-Original Message-
From: Robin Dapp  
Sent: Monday, July 3, 2023 10:49 PM
To: Li, Pan2 ; juzhe.zh...@rivai.ai; gcc-patches 

Cc: rdapp@gmail.com; jeffreyalaw ; Wang, Yanzhang 
; kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

Hmm, looks like it wasn't simple enough...

I'm seeing execution fails for various floating point test cases.
This is due to a mismatch between the FRM_DYN definition (0b111 == 7)
and the attribute value (== 5).  Therefore we set the rounding mode
to 5 instead of 7.

Regards
 Robin

Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Robin Dapp via Gcc-patches

Hmm, looks like it wasn't simple enough...

I'm seeing execution fails for various floating point test cases.
This is due to a mismatch between the FRM_DYN definition (0b111 == 7)
and the attribute value (== 5).  Therefore we set the rounding mode
to 5 instead of 7.

Regards
 Robin

[committed] tree+ggc: Change return type of predicate functions from int to bool

2023-07-03 Thread Uros Bizjak via Gcc-patches

Also change internal variable from int to bool.

gcc/ChangeLog:

* tree.h (tree_int_cst_equal): Change return type from int to bool.
(operand_equal_for_phi_arg_p): Ditto.
(tree_map_base_marked_p): Ditto.
* tree.cc (contains_placeholder_p): Update function body
for bool return type.
(type_cache_hasher::equal): Ditto.
(tree_map_base_hash): Change return type
from int to void and adjust function body accordingly.
(tree_int_cst_equal): Ditto.
(operand_equal_for_phi_arg_p): Ditto.
(get_narrower): Change "first" variable to bool.
(cl_option_hasher::equal): Update function body for bool return type.
* ggc.h (ggc_set_mark): Change return type from int to bool.
(ggc_marked_p): Ditto.
* ggc-page.cc (gt_ggc_mx): Change return type
from int to void and adjust function body accordingly.
(ggc_set_mark): Ditto.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/ggc-page.cc b/gcc/ggc-page.cc
index c25218d7415..2f0b72e1b22 100644
--- a/gcc/ggc-page.cc
+++ b/gcc/ggc-page.cc
@@ -1538,7 +1538,7 @@ gt_ggc_mx (unsigned char& x ATTRIBUTE_UNUSED)
P must have been allocated by the GC allocator; it mustn't point to
static objects, stack variables, or memory allocated with malloc.  */
 
-int
+bool
 ggc_set_mark (const void *p)
 {
   page_entry *entry;
@@ -1558,7 +1558,7 @@ ggc_set_mark (const void *p)
 
   /* If the bit was previously set, skip it.  */
   if (entry->in_use_p[word] & mask)
-return 1;
+return true;
 
   /* Otherwise set it, and decrement the free object count.  */
   entry->in_use_p[word] |= mask;
@@ -1567,14 +1567,14 @@ ggc_set_mark (const void *p)
   if (GGC_DEBUG_LEVEL >= 4)
 fprintf (G.debug_file, "Marking %p\n", p);
 
-  return 0;
+  return false;
 }
 
-/* Return 1 if P has been marked, zero otherwise.
+/* Return true if P has been marked, zero otherwise.
P must have been allocated by the GC allocator; it mustn't point to
static objects, stack variables, or memory allocated with malloc.  */
 
-int
+bool
 ggc_marked_p (const void *p)
 {
   page_entry *entry;
diff --git a/gcc/ggc.h b/gcc/ggc.h
index 78eab7eaba6..34108e2f006 100644
--- a/gcc/ggc.h
+++ b/gcc/ggc.h
@@ -90,15 +90,15 @@ extern const struct ggc_root_tab * const 
gt_pch_scalar_rtab[];
 
 /* Actually set the mark on a particular region of memory, but don't
follow pointers.  This function is called by ggc_mark_*.  It
-   returns zero if the object was not previously marked; nonzero if
+   returns false if the object was not previously marked; true if
the object was already marked, or if, for any other reason,
pointers in this data structure should not be traversed.  */
-extern int ggc_set_mark(const void *);
+extern bool ggc_set_mark (const void *);
 
-/* Return 1 if P has been marked, zero otherwise.
+/* Return true if P has been marked, zero otherwise.
P must have been allocated by the GC allocator; it mustn't point to
static objects, stack variables, or memory allocated with malloc.  */
-extern int ggc_marked_p(const void *);
+extern bool ggc_marked_p (const void *);
 
 /* PCH and GGC handling for strings, mostly trivial.  */
 extern void gt_pch_n_S (const void *);
diff --git a/gcc/tree.cc b/gcc/tree.cc
index 58288efa2e2..bd500ec72a5 100644
--- a/gcc/tree.cc
+++ b/gcc/tree.cc
@@ -2839,7 +2839,7 @@ grow_tree_vec (tree v, int len MEM_STAT_DECL)
   return v;
 }
 
-/* Return 1 if EXPR is the constant zero, whether it is integral, float or
+/* Return true if EXPR is the constant zero, whether it is integral, float or
fixed, and scalar, complex or vector.  */
 
 bool
@@ -2850,7 +2850,7 @@ zerop (const_tree expr)
  || fixed_zerop (expr));
 }
 
-/* Return 1 if EXPR is the integer constant zero or a complex constant
+/* Return true if EXPR is the integer constant zero or a complex constant
of zero, or a location wrapper for such a constant.  */
 
 bool
@@ -2874,7 +2874,7 @@ integer_zerop (const_tree expr)
 }
 }
 
-/* Return 1 if EXPR is the integer constant one or the corresponding
+/* Return true if EXPR is the integer constant one or the corresponding
complex constant, or a location wrapper for such a constant.  */
 
 bool
@@ -2898,9 +2898,9 @@ integer_onep (const_tree expr)
 }
 }
 
-/* Return 1 if EXPR is the integer constant one.  For complex and vector,
-   return 1 if every piece is the integer constant one.
-   Also return 1 for location wrappers for such a constant.  */
+/* Return true if EXPR is the integer constant one.  For complex and vector,
+   return true if every piece is the integer constant one.
+   Also return true for location wrappers for such a constant.  */
 
 bool
 integer_each_onep (const_tree expr)
@@ -2914,8 +2914,8 @@ integer_each_onep (const_tree expr)
 return integer_onep (expr);
 }
 
-/* Return 1 if EXPR is an integer containing all 1's in as much precision as
-   it contains, or a complex or vector whose subparts are such integers,

RE: [PATCH V2] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread Li, Pan2 via Gcc-patches

Committed as passed both the bootstrap and regression test, thanks Richard.

Pan

-Original Message-
From: Gcc-patches  On Behalf 
Of Richard Sandiford via Gcc-patches
Sent: Monday, July 3, 2023 5:27 PM
To: juzhe.zh...@rivai.ai
Cc: gcc-patches@gcc.gnu.org; rguent...@suse.de
Subject: Re: [PATCH V2] Middle-end: Change order of 
LEN_MASK_LOAD/LEN_MASK_STORE arguments

juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong 
>
> Hi, Richard. I fixed the order as you suggested.
>
> Before this patch, the order is {len,mask,bias}.
>
> Now, after this patch, the order becomes {len,bias,mask}.
>
> Since you said we should not need 'internal_fn_bias_index', the bias index 
> should always be the len index + 1.
> I notice LEN_STORE order is {len,vector,bias}, to make them consistent, I 
> reorder into LEN_STORE {len,bias,vector}.
> Just like MASK_STORE {mask,vector}.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md: Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> * config/riscv/riscv-v.cc (expand_load_store): Ditto.
> * doc/md.texi: Ditto.
> * gimple-fold.cc (gimple_fold_partial_load_store_mem_ref): Ditto.
> * internal-fn.cc (len_maskload_direct): Ditto.
> (len_maskstore_direct): Ditto.
> (add_len_and_mask_args): New function.
> (expand_partial_load_optab_fn): Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> (expand_partial_store_optab_fn): Ditto.
> (internal_fn_len_index): New function.
> (internal_fn_mask_index): Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> (internal_fn_stored_value_index): Ditto.
> (internal_len_load_store_bias): Ditto.
> * internal-fn.h (internal_fn_len_index): New function.
> * tree-ssa-dse.cc (initialize_ao_ref_for_dse): Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.

OK, thanks.

Richard

> ---
>  gcc/config/riscv/autovec.md |   8 +-
>  gcc/config/riscv/riscv-v.cc |   2 +-
>  gcc/doc/md.texi |  16 ++--
>  gcc/gimple-fold.cc  |   8 +-
>  gcc/internal-fn.cc  | 156 ++--
>  gcc/internal-fn.h   |   1 +
>  gcc/tree-ssa-dse.cc |  11 +--
>  gcc/tree-vect-stmts.cc  |  11 +--
>  8 files changed, 107 insertions(+), 106 deletions(-)
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 1488f2be1be..4ab0e9f99eb 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -26,8 +26,8 @@
>[(match_operand:V 0 "register_operand")
> (match_operand:V 1 "memory_operand")
> (match_operand 2 "autovec_length_operand")
> -   (match_operand: 3 "vector_mask_operand")
> -   (match_operand 4 "const_0_operand")]
> +   (match_operand 3 "const_0_operand")
> +   (match_operand: 4 "vector_mask_operand")]
>"TARGET_VECTOR"
>  {
>riscv_vector::expand_load_store (operands, true);
> @@ -38,8 +38,8 @@
>[(match_operand:V 0 "memory_operand")
> (match_operand:V 1 "register_operand")
> (match_operand 2 "autovec_length_operand")
> -   (match_operand: 3 "vector_mask_operand")
> -   (match_operand 4 "const_0_operand")]
> +   (match_operand 3 "const_0_operand")
> +   (match_operand: 4 "vector_mask_operand")]
>"TARGET_VECTOR"
>  {
>riscv_vector::expand_load_store (operands, false);
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index adb8d7d36a5..8d5bed7ebe4 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -2777,7 +2777,7 @@ expand_load_store (rtx *ops, bool is_load)
>  {
>poly_int64 value;
>rtx len = ops[2];
> -  rtx mask = ops[3];
> +  rtx mask = ops[4];
>machine_mode mode = GET_MODE (ops[0]);
>  
>if (poly_int_rtx_p (len, ) && known_eq (value, GET_MODE_NUNITS 
> (mode)))
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index cefdee84821..5e5482265cd 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5302,15 +5302,15 @@ This pattern is not allowed to @code{FAIL}.
>  @cindex @code{len_maskload@var{m}@var{n}} instruction pattern
>  @item @samp{len_maskload@var{m}@var{n}}
>  Perform a masked load from the memory location pointed to by operand 1
> -into register operand 0.  (operand 2 + operand 4) elements are loaded from
> +into register operand 0.  (operand 2 + operand 3) elements are loaded from
>  memory and other elements in operand 0 are set to undefined values.
>  This is a combination of len_load and maskload.
>  Operands 0 and 1 have mode @var{m}, which must be a vector mode.  Operand 2
>  has whichever integer mode the target prefers.  A mask is specified in
> -operand 3 which must be of type @var{n}.  The mask has lower precedence than
> +operand 4 which must be of type @var{n}.  The mask has lower precedence than

Re: [PATCH V7] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern

2023-07-03 Thread Richard Sandiford via Gcc-patches

juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong 
>
> Hi, Richi and Richard.
>
> Based on the review comments from Richard:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623405.html
>
> I change len_mask_gather_load/len_mask_scatter_store order into:
> {len,bias,mask}
>
> We add len and mask using add_len_and_mask_args,
> which is the same as partial_load/partial_store.
>
> Now, the code becomes more reasonable and easier to maintain.
>
> This patch is adding LEN_MASK_{GATHER_LOAD,SCATTER_STORE} to allow targets
> handle flow control by mask and loop control by length on gather/scatter 
> memory
> operations. Consider this following case:
>
> #include 
> void
> f (uint8_t *restrict a,
>uint8_t *restrict b, int n,
>int base, int step,
>int *restrict cond)
> {
>   for (int i = 0; i < n; ++i)
> {
>   if (cond[i])
> a[i * step + base] = b[i * step + base];
> }
> }
>
> We hope RVV can vectorize such case into following IR:
>
> loop_len = SELECT_VL
> control_mask = comparison
> v = LEN_MASK_GATHER_LOAD (.., loop_len, bias, control_mask)
> LEN_MASK_SCATTER_STORE (... v, ..., loop_len, bias, control_mask)
>
> This patch doesn't apply such patterns into vectorizer, just add patterns
> and update the documents.
>
> Will send patch which apply such patterns into vectorizer soon after this
> patch is approved.
>
> Ok for trunk?
>
> gcc/ChangeLog:
>
> * doc/md.texi: Add len_mask_gather_load/len_mask_scatter_store.
> * internal-fn.cc (expand_scatter_store_optab_fn): Ditto.
> (expand_gather_load_optab_fn): Ditto.
> (internal_load_fn_p): Ditto.
> (internal_store_fn_p): Ditto.
> (internal_gather_scatter_fn_p): Ditto.
> (internal_fn_len_index): Ditto.
> (internal_fn_mask_index): Ditto.
> (internal_fn_stored_value_index): Ditto.
> * internal-fn.def (LEN_MASK_GATHER_LOAD): Ditto.
> (LEN_MASK_SCATTER_STORE): Ditto.
> * optabs.def (OPTAB_CD): Ditto.

Nice!  OK, thanks.

Richard

> ---
>  gcc/doc/md.texi | 17 +
>  gcc/internal-fn.cc  | 32 +---
>  gcc/internal-fn.def |  8 ++--
>  gcc/optabs.def  |  2 ++
>  4 files changed, 42 insertions(+), 17 deletions(-)
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 5e5482265cd..f14dd32b2dc 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5040,6 +5040,15 @@ operand 5.  Bit @var{i} of the mask is set if element 
> @var{i}
>  of the result should be loaded from memory and clear if element @var{i}
>  of the result should be set to zero.
>  
> +@cindex @code{len_mask_gather_load@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_gather_load@var{m}@var{n}}
> +Like @samp{gather_load@var{m}@var{n}}, but takes an extra length operand 
> (operand 5),
> +a bias operand (operand 6) as well as a mask operand (operand 7).  Similar 
> to len_maskload,
> +the instruction loads at most (operand 5 + operand 6) elements from memory.
> +Bit @var{i} of the mask is set if element @var{i} of the result should
> +be loaded from memory and clear if element @var{i} of the result should be 
> undefined.
> +Mask elements @var{i} with @var{i} > (operand 5 + operand 6) are ignored.
> +
>  @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
>  @item @samp{scatter_store@var{m}@var{n}}
>  Store a vector of mode @var{m} into several distinct memory locations.
> @@ -5069,6 +5078,14 @@ Like @samp{scatter_store@var{m}@var{n}}, but takes an 
> extra mask operand as
>  operand 5.  Bit @var{i} of the mask is set if element @var{i}
>  of the result should be stored to memory.
>  
> +@cindex @code{len_mask_scatter_store@var{m}@var{n}} instruction pattern
> +@item @samp{len_mask_scatter_store@var{m}@var{n}}
> +Like @samp{scatter_store@var{m}@var{n}}, but takes an extra length operand 
> (operand 5),
> +a bias operand (operand 6) as well as a mask operand (operand 7).  The 
> instruction stores
> +at most (operand 5 + operand 6) elements of (operand 4) to memory.
> +Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be 
> stored.
> +Mask elements @var{i} with @var{i} > (operand 5 + operand 6) are ignored.
> +
>  @cindex @code{vec_set@var{m}} instruction pattern
>  @item @samp{vec_set@var{m}}
>  Set given field in the vector value.  Operand 0 is the vector to modify,
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index c1fcb38b17b..303df102d81 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -3507,7 +3507,6 @@ expand_scatter_store_optab_fn (internal_fn, gcall 
> *stmt, direct_optab optab)
>  {
>internal_fn ifn = gimple_call_internal_fn (stmt);
>int rhs_index = internal_fn_stored_value_index (ifn);
> -  int mask_index = internal_fn_mask_index (ifn);
>tree base = gimple_call_arg (stmt, 0);
>tree offset = gimple_call_arg (stmt, 1);
>tree scale = gimple_call_arg (stmt, 2);
> @@ -3518,19 +3517,14 @@ expand_scatter_store_optab_fn

Re: [PATCH v2] RISC-V: Add support for vector crypto extensions

2023-07-03 Thread Kito Cheng via Gcc-patches

Thanks, LGTM :)

Christoph Muellner 於 2023年7月3日 週一，19:08寫道：

> From: Christoph Müllner 
>
> This series adds basic support for the vector crypto extensions:
> * Zvbb
> * Zvbc
> * Zvkg
> * Zvkned
> * Zvkhn[a,b]
> * Zvksed
> * Zvksh
> * Zvkn
> * Zvknc
> * Zvkng
> * Zvks
> * Zvksc
> * Zvksg
> * Zvkt
>
> This patch is based on the v20230620 version of the Vector Cryptography
> specification. The specification is frozen and can be found here:
>   https://github.com/riscv/riscv-crypto/releases/tag/v20230620
>
> Binutils support has been merged upstream a few days ago.
>
> All extensions come with tests for the feature test macros.
>
> gcc/ChangeLog:
>
> * common/config/riscv/riscv-common.cc: Add support for zvbb,
> zvbc, zvkg, zvkned, zvknha, zvknhb, zvksed, zvksh, zvkn,
> zvknc, zvkng, zvks, zvksc, zvksg, zvkt and the implied subsets.
> * config/riscv/arch-canonicalize: Add canonicalization info for
> zvkn, zvknc, zvkng, zvks, zvksc, zvksg.
> * config/riscv/riscv-opts.h (MASK_ZVBB): New macro.
> (MASK_ZVBC): Likewise.
> (TARGET_ZVBB): Likewise.
> (TARGET_ZVBC): Likewise.
> (MASK_ZVKG): Likewise.
> (MASK_ZVKNED): Likewise.
> (MASK_ZVKNHA): Likewise.
> (MASK_ZVKNHB): Likewise.
> (MASK_ZVKSED): Likewise.
> (MASK_ZVKSH): Likewise.
> (MASK_ZVKN): Likewise.
> (MASK_ZVKNC): Likewise.
> (MASK_ZVKNG): Likewise.
> (MASK_ZVKS): Likewise.
> (MASK_ZVKSC): Likewise.
> (MASK_ZVKSG): Likewise.
> (MASK_ZVKT): Likewise.
> (TARGET_ZVKG): Likewise.
> (TARGET_ZVKNED): Likewise.
> (TARGET_ZVKNHA): Likewise.
> (TARGET_ZVKNHB): Likewise.
> (TARGET_ZVKSED): Likewise.
> (TARGET_ZVKSH): Likewise.
> (TARGET_ZVKN): Likewise.
> (TARGET_ZVKNC): Likewise.
> (TARGET_ZVKNG): Likewise.
> (TARGET_ZVKS): Likewise.
> (TARGET_ZVKSC): Likewise.
> (TARGET_ZVKSG): Likewise.
> (TARGET_ZVKT): Likewise.
> * config/riscv/riscv.opt: Introduction of riscv_zv{b,k}_subext.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/zvbb.c: New test.
> * gcc.target/riscv/zvbc.c: New test.
> * gcc.target/riscv/zvkg.c: New test.
> * gcc.target/riscv/zvkn-1.c: New test.
> * gcc.target/riscv/zvkn.c: New test.
> * gcc.target/riscv/zvknc-1.c: New test.
> * gcc.target/riscv/zvknc-2.c: New test.
> * gcc.target/riscv/zvknc.c: New test.
> * gcc.target/riscv/zvkned.c: New test.
> * gcc.target/riscv/zvkng-1.c: New test.
> * gcc.target/riscv/zvkng-2.c: New test.
> * gcc.target/riscv/zvkng.c: New test.
> * gcc.target/riscv/zvknha.c: New test.
> * gcc.target/riscv/zvknhb.c: New test.
> * gcc.target/riscv/zvks-1.c: New test.
> * gcc.target/riscv/zvks.c: New test.
> * gcc.target/riscv/zvksc-1.c: New test.
> * gcc.target/riscv/zvksc-2.c: New test.
> * gcc.target/riscv/zvksc.c: New test.
> * gcc.target/riscv/zvksed.c: New test.
> * gcc.target/riscv/zvksg-1.c: New test.
> * gcc.target/riscv/zvksg-2.c: New test.
> * gcc.target/riscv/zvksg.c: New test.
> * gcc.target/riscv/zvksh.c: New test.
> * gcc.target/riscv/zvkt.c: New test.
>
> Signed-off-by: Christoph Müllner 
> ---
> Changes for v2:
> - Update patch for specification version v20230620
>
>  gcc/common/config/riscv/riscv-common.cc  | 55 
>  gcc/config/riscv/arch-canonicalize   |  7 +++
>  gcc/config/riscv/riscv-opts.h| 34 +++
>  gcc/config/riscv/riscv.opt   |  6 +++
>  gcc/testsuite/gcc.target/riscv/zvbb.c| 13 ++
>  gcc/testsuite/gcc.target/riscv/zvbc.c| 13 ++
>  gcc/testsuite/gcc.target/riscv/zvkg.c| 13 ++
>  gcc/testsuite/gcc.target/riscv/zvkn-1.c  | 29 +
>  gcc/testsuite/gcc.target/riscv/zvkn.c| 29 +
>  gcc/testsuite/gcc.target/riscv/zvknc-1.c | 37 
>  gcc/testsuite/gcc.target/riscv/zvknc-2.c | 37 
>  gcc/testsuite/gcc.target/riscv/zvknc.c   | 37 
>  gcc/testsuite/gcc.target/riscv/zvkned.c  | 13 ++
>  gcc/testsuite/gcc.target/riscv/zvkng-1.c | 37 
>  gcc/testsuite/gcc.target/riscv/zvkng-2.c | 37 
>  gcc/testsuite/gcc.target/riscv/zvkng.c   | 37 
>  gcc/testsuite/gcc.target/riscv/zvknha.c  | 13 ++
>  gcc/testsuite/gcc.target/riscv/zvknhb.c  | 13 ++
>  gcc/testsuite/gcc.target/riscv/zvks-1.c  | 29 +
>  gcc/testsuite/gcc.target/riscv/zvks.c| 29 +
>  gcc/testsuite/gcc.target/riscv/zvksc-1.c | 37 
>  gcc/testsuite/gcc.target/riscv/zvksc-2.c | 37 
>  gcc/testsuite/gcc.target/riscv/zvksc.c   | 37 
>  gcc/testsuite/gcc.target/riscv/zvksed.c  | 13 ++
>

Re: [PATCH v1] RISC-V: Fix one typo for emit_mode_set.

2023-07-03 Thread Kito Cheng via Gcc-patches

Lgtm


juzhe.zh...@rivai.ai 於 2023年7月3日 週一，19:11寫道：

> LGTM
>
>
>
> juzhe.zh...@rivai.ai
>
> From: pan2.li
> Date: 2023-07-03 18:57
> To: gcc-patches
> CC: juzhe.zhong; jeffreyalaw; pan2.li; yanzhang.wang; kito.cheng
> Subject: [PATCH v1] RISC-V: Fix one typo for emit_mode_set.
> From: Pan Li 
>
> This patch would like to fix one typo for scaler[should be scalar] in
> emit_mode_set, as well as minor change for mov emit.
>
> Signed-off-by: Pan Li 
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc (riscv_emit_mode_set): Fix typo.
> ---
> gcc/config/riscv/riscv.cc | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index e4dc8115e69..7761e946761 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -7672,11 +7672,11 @@ riscv_emit_mode_set (int entity, int mode, int
> prev_mode,
>  case RISCV_FRM:
>if (mode != FRM_MODE_NONE && mode != prev_mode)
> {
> -   rtx scaler = gen_reg_rtx (SImode);
> +   rtx scalar = gen_reg_rtx (SImode);
>   rtx imm = gen_int_mode (mode, SImode);
> -   emit_insn (gen_movsi (scaler, imm));
> -   emit_insn (gen_fsrm (scaler, scaler));
> +   emit_move_insn (scalar, imm);
> +   emit_insn (gen_fsrm (scalar, scalar));
> }
>break;
>  default:
> --
> 2.34.1
>
>
>

[COMMITTED] ada: Fix renaming of predefined equality operator for unchecked union types

2023-07-03 Thread Marc Poulhiès via Gcc-patches

From: Eric Botcazou 

The problem is that the predefined equality operator for unchecked union
types is implemented out of line by invoking a function that takes more
parameters than the two operands, which means that the renaming is not
seen as type conforming with this function and, therefore, is rejected.

The way out is to implement these additional parameters as "extra" formal
parameters, since this kind of parameter is not taken into account for
semantic checks.  The change also factors out the duplicated generation
of actuals for these additional parameters into a single procedure.

gcc/ada/

* exp_ch3.ads (Build_Variant_Record_Equality): Add Spec_Id as second
parameter.
* exp_ch3.adb (Build_Variant_Record_Equality): For unchecked union
types, build the additional parameters as extra formal parameters.
(Expand_Freeze_Record_Type.Build_Variant_Record_Equality): Pass
Empty as Spec_Id in call to Build_Variant_Record_Equality.
* exp_ch4.ads (Expand_Unchecked_Union_Equality): New procedure.
* exp_ch4.adb (Expand_Composite_Equality): In the presence of a
function implementing composite equality, do not special case the
unchecked union types, and only convert the operands if the base
types are not the same like in Build_Equality_Call.
(Build_Equality_Call): Do not special case the unchecked union types
and relocate the operands only once.
(Expand_N_Op_Eq): Do not special case the unchecked union types.
(Expand_Unchecked_Union_Equality): New procedure implementing the
specific expansion of calls to the predefined equality function.
* exp_ch6.adb (Is_Unchecked_Union_Equality): New predicate.
(Expand_Call): Call Is_Unchecked_Union_Equality to determine whether
to call Expand_Unchecked_Union_Equality or Expand_Call_Helper.
* exp_ch8.adb (Build_Body_For_Renaming): Set Has_Delayed_Freeze flag
earlier on Id and pass Id in call to Build_Variant_Record_Equality.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_ch3.adb |  57 +++-
 gcc/ada/exp_ch3.ads |   4 +-
 gcc/ada/exp_ch4.adb | 682 ++--
 gcc/ada/exp_ch4.ads |   8 +
 gcc/ada/exp_ch6.adb |  63 +++-
 gcc/ada/exp_ch8.adb |   3 +-
 6 files changed, 390 insertions(+), 427 deletions(-)

diff --git a/gcc/ada/exp_ch3.adb b/gcc/ada/exp_ch3.adb
index 463b77fae67..daf27fb25e9 100644
--- a/gcc/ada/exp_ch3.adb
+++ b/gcc/ada/exp_ch3.adb
@@ -4606,6 +4606,7 @@ package body Exp_Ch3 is
 
function Build_Variant_Record_Equality
  (Typ : Entity_Id;
+  Spec_Id : Entity_Id;
   Body_Id : Entity_Id;
   Param_Specs : List_Id) return Node_Id
is
@@ -4652,42 +4653,66 @@ package body Exp_Ch3 is
 
   if Is_Unchecked_Union (Typ) then
  declare
+Right_Formal : constant Entity_Id :=
+  (if Present (Spec_Id) then Last_Formal (Spec_Id) else Right);
+Scop : constant Entity_Id :=
+  (if Present (Spec_Id) then Spec_Id else Body_Id);
+
+procedure Decorate_Extra_Formal (F, F_Typ : Entity_Id);
+--  Decorate extra formal F with type F_Typ
+
+---
+-- Decorate_Extra_Formal --
+---
+
+procedure Decorate_Extra_Formal (F, F_Typ : Entity_Id) is
+begin
+   Mutate_Ekind  (F, E_In_Parameter);
+   Set_Etype (F, F_Typ);
+   Set_Scope (F, Scop);
+   Set_Mechanism (F, By_Copy);
+end Decorate_Extra_Formal;
+
 A  : Entity_Id;
 B  : Entity_Id;
 Discr  : Entity_Id;
 Discr_Type : Entity_Id;
+Last_Extra : Entity_Id := Empty;
 New_Discrs : Elist_Id;
 
  begin
+Mutate_Ekind (Body_Id, E_Subprogram_Body);
 New_Discrs := New_Elmt_List;
 
 Discr := First_Discriminant (Typ);
 while Present (Discr) loop
Discr_Type := Etype (Discr);
 
+   --  Add the new parameters as extra formals
+
A :=
  Make_Defining_Identifier (Loc,
Chars => New_External_Name (Chars (Discr), 'A'));
 
+   Decorate_Extra_Formal (A, Discr_Type);
+
+   if Present (Last_Extra) then
+  Set_Extra_Formal (Last_Extra, A);
+   else
+  Set_Extra_Formal (Right_Formal, A);
+  Set_Extra_Formals (Scop, A);
+   end if;
+
+   Append_Elmt (A, New_Discrs);
+
B :=
  Make_Defining_Identifier (Loc,
Chars => New_External_Name (Chars (Discr), 'B'));
 
-   --  Add new parameters to the parameter list
+   Decorate_Extra_Formal (B, Discr_Type);
 
-

[COMMITTED] ada: Fix discrepancy in expansion of untagged record equality

2023-07-03 Thread Marc Poulhiès via Gcc-patches

From: Eric Botcazou 

The expansion of the predefined equality operator for untagged record types
can be done either in line, i.e. into the component-wise comparison of the
operands, or out of line, i.e. into a call to a function implementing this
comparison, and the heuristics of the selection are essentially based on the
complexity of the implementation.

For discriminated record types with a variant part, which comprise unchecked
union types, the expansion is always done out of line.  For nondiscriminated
types, the expansion is done in line, unless one of the components is of a
record type for which a user-defined equality operator exists, in which case
the expansion is done out of line.

For the third case, i.e. discriminated record types without a variant part,
the expansion is always done in line.  Now given that the discriminants are
considered as mere components for the purpose of predefined equality in this
case, there does not seem to be any reason for treating it differently from
the second case above.

gcc/ada/

* exp_ch3.adb (Build_Untagged_Equality): Rename into...
(Build_Untagged_Record_Equality): ...this.
(Expand_Freeze_Record_Type): Adjust to above renaming and invoke
the procedure also for discriminated types without a variant part.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_ch3.adb | 41 -
 1 file changed, 16 insertions(+), 25 deletions(-)

diff --git a/gcc/ada/exp_ch3.adb b/gcc/ada/exp_ch3.adb
index 7ac4680b395..463b77fae67 100644
--- a/gcc/ada/exp_ch3.adb
+++ b/gcc/ada/exp_ch3.adb
@@ -139,7 +139,7 @@ package body Exp_Ch3 is
--  the code expansion for controlled components (when control actions
--  are active) can lead to very large blocks that GCC handles poorly.
 
-   procedure Build_Untagged_Equality (Typ : Entity_Id);
+   procedure Build_Untagged_Record_Equality (Typ : Entity_Id);
--  AI05-0123: Equality on untagged records composes. This procedure
--  builds the equality routine for an untagged record that has components
--  of a record type that has user-defined primitive equality operations.
@@ -4450,11 +4450,11 @@ package body Exp_Ch3 is
   Set_Is_Pure (Proc_Name);
end Build_Slice_Assignment;
 
-   -
-   -- Build_Untagged_Equality --
-   -
+   
+   -- Build_Untagged_Record_Equality --
+   
 
-   procedure Build_Untagged_Equality (Typ : Entity_Id) is
+   procedure Build_Untagged_Record_Equality (Typ : Entity_Id) is
   Build_Eq : Boolean;
   Comp : Entity_Id;
   Decl : Node_Id;
@@ -4481,7 +4481,7 @@ package body Exp_Ch3 is
  end if;
   end User_Defined_Eq;
 
-   --  Start of processing for Build_Untagged_Equality
+   --  Start of processing for Build_Untagged_Record_Equality
 
begin
   --  If a record component has a primitive equality operation, we must
@@ -4558,7 +4558,7 @@ package body Exp_Ch3 is
 Set_Is_Public (Op);
  end if;
   end if;
-   end Build_Untagged_Equality;
+   end Build_Untagged_Record_Equality;
 
---
-- Build_Variant_Record_Equality --
@@ -5803,25 +5803,18 @@ package body Exp_Ch3 is
  end if;
 
   --  In the untagged case, ever since Ada 83 an equality function must
-  --  be  provided for variant records that are not unchecked unions.
-  --  In Ada 2012 the equality function composes, and thus must be built
-  --  explicitly just as for tagged records.
+  --  be provided for variant records that are not unchecked unions.
 
   elsif Has_Discriminants (Typ)
 and then not Is_Limited_Type (Typ)
+and then Present (Component_List (Type_Definition (Typ_Decl)))
+and then
+  Present (Variant_Part (Component_List (Type_Definition (Typ_Decl
   then
- declare
-Comps : constant Node_Id :=
-  Component_List (Type_Definition (Typ_Decl));
- begin
-if Present (Comps)
-  and then Present (Variant_Part (Comps))
-then
-   Build_Variant_Record_Equality (Typ);
-end if;
- end;
+ Build_Variant_Record_Equality (Typ);
 
-  --  Otherwise create primitive equality operation (AI05-0123)
+  --  In Ada 2012 the equality function composes, and thus must be built
+  --  explicitly just as for tagged records.
 
   --  This is done unconditionally to ensure that tools can be linked
   --  properly with user programs compiled with older language versions.
@@ -5832,7 +5825,7 @@ package body Exp_Ch3 is
 and then Convention (Typ) = Convention_Ada
 and then not Is_Limited_Type (Typ)
   then
- Build_Untagged_Equality (Typ);
+ Build_Untagged_Record_Equality (Typ);
   end if;
 
   --  Before

[COMMITTED] ada: Fix small inaccuracy in implementation of B.3.3(20/2)

2023-07-03 Thread Marc Poulhiès via Gcc-patches

From: Eric Botcazou 

This is the clause about inferable discriminants in unchecked unions.

gcc/ada/

* sem_util.adb (Has_Inferable_Discriminants): In the case of a
component with a per-object constraint, also return true if the
enclosing object is not of an unchecked union type.
In the default case, remove a useless call to Base_Type.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/sem_util.adb | 35 ++-
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/gcc/ada/sem_util.adb b/gcc/ada/sem_util.adb
index d9ea00e53cb..736751f5fae 100644
--- a/gcc/ada/sem_util.adb
+++ b/gcc/ada/sem_util.adb
@@ -12272,33 +12272,26 @@ package body Sem_Util is
begin
   --  For selected components, the subtype of the selector must be a
   --  constrained Unchecked_Union. If the component is subject to a
-  --  per-object constraint, then the enclosing object must have inferable
-  --  discriminants.
+  --  per-object constraint, then the enclosing object must either be
+  --  a regular discriminated type or must have inferable discriminants.
 
   if Nkind (N) = N_Selected_Component then
- if Has_Per_Object_Constraint (Entity (Selector_Name (N))) then
-
---  A small hack. If we have a per-object constrained selected
---  component of a formal parameter, return True since we do not
---  know the actual parameter association yet.
-
-if Prefix_Is_Formal_Parameter (N) then
-   return True;
-
---  Otherwise, check the enclosing object and the selector
-
-else
-   return Has_Inferable_Discriminants (Prefix (N))
- and then Has_Inferable_Discriminants (Selector_Name (N));
-end if;
-
  --  The call to Has_Inferable_Discriminants will determine whether
  --  the selector has a constrained Unchecked_Union nominal type.
 
- else
-return Has_Inferable_Discriminants (Selector_Name (N));
+ if not Has_Inferable_Discriminants (Selector_Name (N)) then
+return False;
  end if;
 
+ --  A small hack. If we have a per-object constrained selected
+ --  component of a formal parameter, return True since we do not
+ --  know the actual parameter association yet.
+
+ return not Has_Per_Object_Constraint (Entity (Selector_Name (N)))
+   or else not Is_Unchecked_Union (Etype (Prefix (N)))
+   or else Has_Inferable_Discriminants (Prefix (N))
+   or else Prefix_Is_Formal_Parameter (N);
+
   --  A qualified expression has inferable discriminants if its subtype
   --  mark is a constrained Unchecked_Union subtype.
 
@@ -12310,7 +12303,7 @@ package body Sem_Util is
   --  Unchecked_Union nominal subtype.
 
   else
- return Is_Unchecked_Union (Base_Type (Etype (N)))
+ return Is_Unchecked_Union (Etype (N))
and then Is_Constrained (Etype (N));
   end if;
end Has_Inferable_Discriminants;
-- 
2.40.0

Re: [PATCH] middle-end/110495 - avoid associating constants with (VL) vectors

2023-07-03 Thread Richard Biener via Gcc-patches

On Mon, 3 Jul 2023, Richard Sandiford wrote:

> Richard Biener via Gcc-patches  writes:
> > When trying to associate (v + INT_MAX) + INT_MAX we are using
> > the TREE_OVERFLOW bit to check for correctness.  That isn't
> > working for VECTOR_CSTs and it can't in general when one considers
> > VL vectors.  It looks like it should work for COMPLEX_CSTs but
> > I didn't try to single out _Complex int in this change.
> >
> > The following makes sure that for vectors we use the fallback of
> > using unsigned arithmetic when associating the above to
> > v + (INT_MAX + INT_MAX).
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
> >
> > Thanks,
> > Richard.
> >
> > PR middle-end/110495
> > * tree.h (TREE_OVERFLOW): Do not mention VECTOR_CSTs
> > since we do not set TREE_OVERFLOW on those since the
> > introduction of VL vectors.
> > * match.pd (x +- CST +- CST): For VECTOR_CST do not look
> > at TREE_OVERFLOW to determine validity of association.
> >
> > * gcc.dg/tree-ssa/addadd-2.c: Amend.
> > * gcc.dg/tree-ssa/forwprop-27.c: Adjust.
> > ---
> >  gcc/match.pd| 9 +
> >  gcc/testsuite/gcc.dg/tree-ssa/addadd-2.c| 1 +
> >  gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c | 4 +++-
> >  gcc/tree.h  | 2 +-
> >  4 files changed, 10 insertions(+), 6 deletions(-)
> >
> > diff --git a/gcc/match.pd b/gcc/match.pd
> > index f09583bbcac..d193a572005 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -3025,7 +3025,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > (with { tree cst = const_binop (outer_op == inner_op
> > ? PLUS_EXPR : MINUS_EXPR,
> > type, @1, @2); }
> > -(if (cst && !TREE_OVERFLOW (cst))
> > +(if (INTEGRAL_TYPE_P (type) && cst && !TREE_OVERFLOW (cst))
> >   (inner_op @0 { cst; } )
> >   /* X+INT_MAX+1 is X-INT_MIN.  */
> >   (if (INTEGRAL_TYPE_P (type) && cst
> > @@ -3037,7 +3037,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >  (view_convert (inner_op
> > (view_convert:utype @0)
> > (view_convert:utype
> > -{ drop_tree_overflow (cst); }))
> > +{ TREE_OVERFLOW (cst)
> > +  ? drop_tree_overflow (cst) : cst; }))
> 
> It looks like the whole “(with …)” expects cst to be nonnull,
> but the “last resort” doesn't check it (unless I'm misreading).
> Would it be easier to add a top-level “if (cst)”?  (Obviously
> a preexisting thing.)

Hmm, indeed.  I've added an outer if (cst).

> >  
> >/* (CST1 - A) +- CST2 -> CST3 - A  */
> >(for outer_op (plus minus)
> > @@ -3049,7 +3050,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > forever if something doesn't simplify into a constant.  */
> >   (if (!CONSTANT_CLASS_P (@0))
> >(minus (outer_op! (view_convert @1) @2) (view_convert @0)))
> > - (if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> > + (if (!INTEGRAL_TYPE_P (TREE_TYPE (@0))
> >   || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0)))
> >(view_convert (minus (outer_op! @1 (view_convert @2)) @0))
> >(if (types_match (type, @0) && !TYPE_OVERFLOW_SANITIZED (type))
> > @@ -3068,7 +3069,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >forever if something doesn't simplify into a constant.  */
> >  (if (!CONSTANT_CLASS_P (@0))
> >   (plus (view_convert @0) (minus! @1 (view_convert @2
> > -(if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> > +(if (!INTEGRAL_TYPE_P (TREE_TYPE (@0))
> >  || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0)))
> >   (view_convert (plus @0 (minus! (view_convert @1) @2)))
> >   (if (types_match (type, @0) && !TYPE_OVERFLOW_SANITIZED (type))
> 
> I didn't understand this part.  Doesn't it mean that we allow
> overflow-inducing reassociations for all vector integer types,
> albeit not immediately folded away?

Oh, indeed - I thought I could circumvent the TREE_OVERFLOW check for
those (where I don't yet have a testcase) by altering the guarding
check - but that check is to guard the TYPE_OVERFLOW_* checks.

To fix this we'd have to add unsigned fallbacks like for the above
pattern.  I'm going to remove the two hunks for now.

> Also, why do we keep the:
> 
>   !ANY_INTEGRAL_TYPE_P (type) || TYPE_OVERFLOW_WRAPS (type)
> 
> in the outer ifs?

I think these are distinct types since both patterns have conditional
conversions in the patterns they match.  Otherwise it would be
redundant checking and would have been better if placed as 'else'
branch of the inner ifs.

As said, going to fix the missing conditional on non-null 'cst'
and drop the two hunks unrelated to the PR (which also were
wrong - thanks for noticing).

Richard.


> 
> But that's just me not understanding match.pd very well.
> Feel free to ignore if it's nonsense. :)
> 
> Thanks,
> Richard
> 

-- 
Richard Biener 
SUSE Software Solutions

[PATCH] tree-optimization/110310 - move vector epilogue disabling to analysis phase

2023-07-03 Thread Richard Biener via Gcc-patches

The following moves the late decision to elide vectorized epilogues to
the analysis phase and also avoids altering the epilogue's niter.
The costing part from vect_determine_partial_vectors_and_peeling is
moved to vect_analyze_loop_costing where we use the main loop
analysis to constrain the epilogue scalar iterations.

I have not tried to integrate this with vect_known_niters_smaller_than_vf.

It seems the for_epilogue_p parameter in
vect_determine_partial_vectors_and_peeling is largely useless and
we could compute that in the function itself.

Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

I suppose testing on aarch64 would be nice-to-have - any takers?

Thanks,
Richard.

PR tree-optimization/110310
* tree-vect-loop.cc (vect_determine_partial_vectors_and_peeling):
Move costing part ...
(vect_analyze_loop_costing): ... here.  Integrate better
estimate for epilogues from ...
(vect_analyze_loop_2): Call vect_determine_partial_vectors_and_peeling
with actual epilogue status.
* tree-vect-loop-manip.cc (vect_do_peeling): ... here and
avoid cancelling epilogue vectorization.
(vect_update_epilogue_niters): Remove.  No longer update
epilogue LOOP_VINFO_NITERS.

* gcc.target/i386/pr110310.c: New testcase.
* gcc.dg/vect/slp-perm-12.c: Disable epilogue vectorization.
---
 gcc/testsuite/gcc.dg/vect/slp-perm-12.c  |   1 +
 gcc/testsuite/gcc.target/i386/pr110310.c |  13 +++
 gcc/tree-vect-loop-manip.cc  | 104 +--
 gcc/tree-vect-loop.cc|  98 ++---
 4 files changed, 102 insertions(+), 114 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110310.c

diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c 
b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c
index 113223ab0f9..635fca54399 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-perm-12.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-perm-12.c
@@ -1,5 +1,6 @@
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-additional-options "--param vect-epilogues-nomask=0" } */
 /* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */
 
 #include "tree-vect.h"
diff --git a/gcc/testsuite/gcc.target/i386/pr110310.c 
b/gcc/testsuite/gcc.target/i386/pr110310.c
new file mode 100644
index 000..dce388aeb20
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110310.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=znver4 -fdump-tree-vect-optimized" } */
+
+void foo (int * __restrict a, int *b)
+{
+  for (int i = 0; i < 20; ++i)
+a[i] = b[i] + 42;
+}
+
+/* We should vectorize the main loop with AVX512 and the epilog with SSE.  */
+
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 64 byte 
vectors" "vect" } } */
+/* { dg-final { scan-tree-dump "optimized: loop vectorized using 16 byte 
vectors" "vect" } } */
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 20f570e4a0d..6c452e07880 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -2882,34 +2882,6 @@ slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
 }
 
-/* EPILOGUE_VINFO is an epilogue loop that we now know would need to
-   iterate exactly CONST_NITERS times.  Make a final decision about
-   whether the epilogue loop should be used, returning true if so.  */
-
-static bool
-vect_update_epilogue_niters (loop_vec_info epilogue_vinfo,
-unsigned HOST_WIDE_INT const_niters)
-{
-  /* Avoid wrap-around when computing const_niters - 1.  Also reject
- using an epilogue loop for a single scalar iteration, even if
- we could in principle implement that using partial vectors.  */
-  unsigned int gap_niters = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
-  if (const_niters <= gap_niters + 1)
-return false;
-
-  /* Install the number of iterations.  */
-  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (epilogue_vinfo));
-  tree niters_tree = build_int_cst (niters_type, const_niters);
-  tree nitersm1_tree = build_int_cst (niters_type, const_niters - 1);
-
-  LOOP_VINFO_NITERS (epilogue_vinfo) = niters_tree;
-  LOOP_VINFO_NITERSM1 (epilogue_vinfo) = nitersm1_tree;
-
-  /* Decide what to do if the number of epilogue iterations is not
- a multiple of the epilogue loop's vectorization factor.  */
-  return vect_determine_partial_vectors_and_peeling (epilogue_vinfo, true);
-}
-
 /* LOOP_VINFO is an epilogue loop whose corresponding main loop can be skipped.
Return a value that equals:
 
@@ -3039,7 +3011,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
   int estimated_vf;
   int prolog_peeling = 0;
   bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
-  bool vect_epilogues_updated_niters = false;
   /* We currently do not support prolog peeling if

Re: [VSETVL PASS] RISC-V: Optimize local AVL propagation

2023-07-03 Thread Robin Dapp via Gcc-patches

LGTM.

Regards
 Robin

Re: [PATCH] middle-end/110495 - avoid associating constants with (VL) vectors

2023-07-03 Thread Richard Sandiford via Gcc-patches

Richard Biener via Gcc-patches  writes:
> When trying to associate (v + INT_MAX) + INT_MAX we are using
> the TREE_OVERFLOW bit to check for correctness.  That isn't
> working for VECTOR_CSTs and it can't in general when one considers
> VL vectors.  It looks like it should work for COMPLEX_CSTs but
> I didn't try to single out _Complex int in this change.
>
> The following makes sure that for vectors we use the fallback of
> using unsigned arithmetic when associating the above to
> v + (INT_MAX + INT_MAX).
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
>
> Thanks,
> Richard.
>
>   PR middle-end/110495
>   * tree.h (TREE_OVERFLOW): Do not mention VECTOR_CSTs
>   since we do not set TREE_OVERFLOW on those since the
>   introduction of VL vectors.
>   * match.pd (x +- CST +- CST): For VECTOR_CST do not look
>   at TREE_OVERFLOW to determine validity of association.
>
>   * gcc.dg/tree-ssa/addadd-2.c: Amend.
>   * gcc.dg/tree-ssa/forwprop-27.c: Adjust.
> ---
>  gcc/match.pd| 9 +
>  gcc/testsuite/gcc.dg/tree-ssa/addadd-2.c| 1 +
>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c | 4 +++-
>  gcc/tree.h  | 2 +-
>  4 files changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index f09583bbcac..d193a572005 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -3025,7 +3025,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (with { tree cst = const_binop (outer_op == inner_op
>   ? PLUS_EXPR : MINUS_EXPR,
>   type, @1, @2); }
> -  (if (cst && !TREE_OVERFLOW (cst))
> +  (if (INTEGRAL_TYPE_P (type) && cst && !TREE_OVERFLOW (cst))
> (inner_op @0 { cst; } )
> /* X+INT_MAX+1 is X-INT_MIN.  */
> (if (INTEGRAL_TYPE_P (type) && cst
> @@ -3037,7 +3037,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>(view_convert (inner_op
>   (view_convert:utype @0)
>   (view_convert:utype
> -  { drop_tree_overflow (cst); }))
> +  { TREE_OVERFLOW (cst)
> +? drop_tree_overflow (cst) : cst; }))

It looks like the whole “(with …)” expects cst to be nonnull,
but the “last resort” doesn't check it (unless I'm misreading).
Would it be easier to add a top-level “if (cst)”?  (Obviously
a preexisting thing.)
>  
>/* (CST1 - A) +- CST2 -> CST3 - A  */
>(for outer_op (plus minus)
> @@ -3049,7 +3050,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   forever if something doesn't simplify into a constant.  */
>   (if (!CONSTANT_CLASS_P (@0))
>(minus (outer_op! (view_convert @1) @2) (view_convert @0)))
> - (if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> + (if (!INTEGRAL_TYPE_P (TREE_TYPE (@0))
> || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0)))
>(view_convert (minus (outer_op! @1 (view_convert @2)) @0))
>(if (types_match (type, @0) && !TYPE_OVERFLOW_SANITIZED (type))
> @@ -3068,7 +3069,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>forever if something doesn't simplify into a constant.  */
>  (if (!CONSTANT_CLASS_P (@0))
>   (plus (view_convert @0) (minus! @1 (view_convert @2
> -(if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
> +(if (!INTEGRAL_TYPE_P (TREE_TYPE (@0))
>|| TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0)))
>   (view_convert (plus @0 (minus! (view_convert @1) @2)))
>   (if (types_match (type, @0) && !TYPE_OVERFLOW_SANITIZED (type))

I didn't understand this part.  Doesn't it mean that we allow
overflow-inducing reassociations for all vector integer types,
albeit not immediately folded away?

Also, why do we keep the:

  !ANY_INTEGRAL_TYPE_P (type) || TYPE_OVERFLOW_WRAPS (type)

in the outer ifs?

But that's just me not understanding match.pd very well.
Feel free to ignore if it's nonsense. :)

Thanks,
Richard

[VSETVL PASS] RISC-V: Optimize local AVL propagation

2023-07-03 Thread Juzhe-Zhong

I recently noticed that the current VSETVL pass has an unnecessary restriction
on local AVL propagation.

Consider the following case:

+  insn 1: vsetvli a5,a3,e8,mf4,ta,mu
+  insn 2: vsetvli zero,a5,e32,m1,ta,ma
+  ...
+  vle32.v v1,0(a1)
+  vsetvli a2,zero,e32,m1,ta,ma
+  vadd.vv v1,v1,v1
+  vsetvli zero,a5,e32,m1,ta,ma
+  vse32.v v1,0(a0)
+  ...
+  insn 3: sub a3,a3,a5
+  ...

We fail to elide insn 2 (a vsetvl insn) because insn 3 modifies the AVL "a3".
Actually, we don't need to care about insn 3: we only need to check that no
insn between insn 1 and insn 2 modifies the AVL "a3".  If so, we can propagate
the AVL "a3" from insn 1 to insn 2, and insn 2 is then eliminated.

After this patch:

+  insn 1: vsetvli a5,a3,e8,mf4,ta,ma
+  ...
+  vle32.v v1,0(a1)
+  vsetvli a2,zero,e32,m1,ta,ma
+  vadd.vv v1,v1,v1
+  vsetvli zero,a5,e32,m1,ta,ma
+  vse32.v v1,0(a0)
+  ...
+  insn 3: sub a3,a3,a5
+  ...

gcc/ChangeLog:

* config/riscv/riscv-vsetvl.cc (vector_insn_info::parse_insn): Add 
early break.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/vsetvl/avl_prop-1.c: New test.

---
 gcc/config/riscv/riscv-vsetvl.cc  | 22 +++
 .../gcc.target/riscv/rvv/vsetvl/avl_prop-1.c  | 21 ++
 2 files changed, 43 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_prop-1.c

diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc
index 2d576e8d5c1..ab47901e23f 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2025,6 +2025,28 @@ vector_insn_info::parse_insn (insn_info *insn)
   real_insn_and_same_bb_p (i, get_insn ()->bb ());
   i = i->next_nondebug_insn ())
{
+ /* Consider this following sequence:
+
+  insn 1: vsetvli a5,a3,e8,mf4,ta,mu
+  insn 2: vsetvli zero,a5,e32,m1,ta,ma
+  ...
+  vle32.v v1,0(a1)
+  vsetvli a2,zero,e32,m1,ta,ma
+  vadd.vv v1,v1,v1
+  vsetvli zero,a5,e32,m1,ta,ma
+  vse32.v v1,0(a0)
+  ...
+  insn 3: sub a3,a3,a5
+  ...
+
+  We can local AVL propagate "a3" from insn 1 to insn 2
+  if no insns between insn 1 and insn 2 modify "a3 even
+  though insn 3 modifies "a3".
+  Otherwise, we can't perform local AVL propagation.
+
+  Early break if we reach the insn 2.  */
+ if (!before_p (i, insn))
+   break;
  if (find_access (i->defs (), REGNO (new_info.get_avl (
{
  modified_p = true;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_prop-1.c 
b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_prop-1.c
new file mode 100644
index 000..19ea0f14df5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/avl_prop-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32 -fno-schedule-insns 
-fno-schedule-insns2" } */
+
+#include "riscv_vector.h"
+
+void
+foo (void *a, void *b, void *c, size_t n)
+{
+  for (size_t vl; n > 0; n -= vl, a += vl, b += vl * 4, c += vl)
+{
+  vl = __riscv_vsetvl_e8mf4 (n);
+  vint32m1_t vec_b = __riscv_vle32_v_i32m1 (b, vl);
+  vint32m1_t vec_a = __riscv_vadd_vv_i32m1 (vec_b, vec_b, 
__riscv_vsetvlmax_e32m1 ());
+  __riscv_vse32_v_i32m1 (a, vec_a, vl);
+}
+}
+
+/* { dg-final { scan-assembler-times {vsetvli} 3 { target { no-opts "-O0" 
no-opts "-Os" no-opts "-Oz" no-opts "-O1" no-opts "-g" no-opts "-funroll-loops" 
} } } } */
+/* { dg-final { scan-assembler-times 
{vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 1 { target { 
no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-O1" no-opts "-g" no-opts 
"-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times 
{vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { 
no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-O1" no-opts "-g" no-opts 
"-funroll-loops" } } } } */
+/* { dg-final { scan-assembler-times 
{vsetvli\s+[a-x0-9]+,\s*zero,\s*e32,\s*m1,\s*t[au],\s*m[au]} 1 { target { 
no-opts "-O0" no-opts "-Os" no-opts "-Oz" no-opts "-O1" no-opts "-g" no-opts 
"-funroll-loops" } } } } */
-- 
2.36.3

[PATCH] middle-end/110495 - avoid associating constants with (VL) vectors

2023-07-03 Thread Richard Biener via Gcc-patches

When trying to associate (v + INT_MAX) + INT_MAX we are using
the TREE_OVERFLOW bit to check for correctness.  That isn't
working for VECTOR_CSTs and it can't in general when one considers
VL vectors.  It looks like it should work for COMPLEX_CSTs but
I didn't try to single out _Complex int in this change.

The following makes sure that for vectors we use the fallback of
using unsigned arithmetic when associating the above to
v + (INT_MAX + INT_MAX).

Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?

Thanks,
Richard.

PR middle-end/110495
* tree.h (TREE_OVERFLOW): Do not mention VECTOR_CSTs
since we do not set TREE_OVERFLOW on those since the
introduction of VL vectors.
* match.pd (x +- CST +- CST): For VECTOR_CST do not look
at TREE_OVERFLOW to determine validity of association.

* gcc.dg/tree-ssa/addadd-2.c: Amend.
* gcc.dg/tree-ssa/forwprop-27.c: Adjust.
---
 gcc/match.pd| 9 +
 gcc/testsuite/gcc.dg/tree-ssa/addadd-2.c| 1 +
 gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c | 4 +++-
 gcc/tree.h  | 2 +-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index f09583bbcac..d193a572005 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3025,7 +3025,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(with { tree cst = const_binop (outer_op == inner_op
? PLUS_EXPR : MINUS_EXPR,
type, @1, @2); }
-(if (cst && !TREE_OVERFLOW (cst))
+(if (INTEGRAL_TYPE_P (type) && cst && !TREE_OVERFLOW (cst))
  (inner_op @0 { cst; } )
  /* X+INT_MAX+1 is X-INT_MIN.  */
  (if (INTEGRAL_TYPE_P (type) && cst
@@ -3037,7 +3037,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (view_convert (inner_op
(view_convert:utype @0)
(view_convert:utype
-{ drop_tree_overflow (cst); }))
+{ TREE_OVERFLOW (cst)
+  ? drop_tree_overflow (cst) : cst; }))
 
   /* (CST1 - A) +- CST2 -> CST3 - A  */
   (for outer_op (plus minus)
@@ -3049,7 +3050,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
forever if something doesn't simplify into a constant.  */
  (if (!CONSTANT_CLASS_P (@0))
   (minus (outer_op! (view_convert @1) @2) (view_convert @0)))
- (if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
+ (if (!INTEGRAL_TYPE_P (TREE_TYPE (@0))
  || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0)))
   (view_convert (minus (outer_op! @1 (view_convert @2)) @0))
   (if (types_match (type, @0) && !TYPE_OVERFLOW_SANITIZED (type))
@@ -3068,7 +3069,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   forever if something doesn't simplify into a constant.  */
 (if (!CONSTANT_CLASS_P (@0))
  (plus (view_convert @0) (minus! @1 (view_convert @2
-(if (!ANY_INTEGRAL_TYPE_P (TREE_TYPE (@0))
+(if (!INTEGRAL_TYPE_P (TREE_TYPE (@0))
 || TYPE_OVERFLOW_WRAPS (TREE_TYPE (@0)))
  (view_convert (plus @0 (minus! (view_convert @1) @2)))
  (if (types_match (type, @0) && !TYPE_OVERFLOW_SANITIZED (type))
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/addadd-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/addadd-2.c
index 39aa032c9b1..8c05911f473 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/addadd-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/addadd-2.c
@@ -12,4 +12,5 @@ void k(S*x){
   *x = (S)(y + __INT_MAX__);
 }
 
+/* { dg-final { scan-tree-dump "4294967294" "optimized" { target int32plus } } 
} */
 /* { dg-final { scan-tree-dump-not "2147483647" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c 
b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
index 9775a4c6367..6c71a4fc81c 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-27.c
@@ -33,7 +33,9 @@ void i (V *v1, V *v2){
   *v2 = (c1-*v2)+c2;
 }
 
-/* { dg-final { scan-tree-dump-not "\\\+" "forwprop1"} } */
+/* { dg-final { scan-tree-dump-times "\\\+" 1 "forwprop1"} } */
 /* { dg-final { scan-tree-dump "{ 0, 4 }" "forwprop1"} } */
 /* { dg-final { scan-tree-dump "{ 37, -5 }" "forwprop1"} } */
+/* { dg-final { scan-tree-dump "{ 27, 23 }" "forwprop1"} } */
+/* { dg-final { scan-tree-dump "{ 37, 3 }" "forwprop1"} } */
 
diff --git a/gcc/tree.h b/gcc/tree.h
index f11c758afb9..bedbbb0bcc0 100644
--- a/gcc/tree.h
+++ b/gcc/tree.h
@@ -824,7 +824,7 @@ extern void omp_clause_range_check_failed (const_tree, 
const char *, int,
 #define TYPE_REF_CAN_ALIAS_ALL(NODE) \
   (PTR_OR_REF_CHECK (NODE)->base.static_flag)
 
-/* In an INTEGER_CST, REAL_CST, COMPLEX_CST, or VECTOR_CST, this means
+/* In an INTEGER_CST, REAL_CST, or COMPLEX_CST, this means
there was an overflow in folding.  */
 
 #define TREE_OVERFLOW(NODE) (CST_CHECK (NODE)->base.public_flag)
-- 
2.35.3

Re: [PATCH] Fortran: fixes for procedures with ALLOCATABLE,INTENT(OUT) arguments [PR92178]

2023-07-03 Thread Mikael Morin


Hello,

Le 02/07/2023 à 22:38, Harald Anlauf via Fortran a écrit :

Dear all,

the attached patch fixes a long-standing issue with the
order of evaluation of procedure argument expressions and
deallocation of allocatable actual arguments passed to
allocatable dummies with intent(out) attribute.

It is based on an initial patch by Steve, handles issues
pointed out by Tobias, and includes a suggestion by Tobias
to scan the procedure arguments first to decide whether the
creation of temporaries is needed.

There is one unresolved issue left that might be more
general: it appears to affect character arguments (only)
in that quite often there still is no temporary generated.
I haven't found the reason why and would like to defer this,
unless someone has a good suggestion.


No problem, let's fix the easier parts first.


Regtested on x86_64-pc-linux-gnu. OK for mainline?


A few things to double check below.


pr92178.diff

From 609ba636927811cddc74fb815cb18809c7d33565 Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Sun, 2 Jul 2023 22:14:19 +0200
Subject: [PATCH] Fortran: fixes for procedures with ALLOCATABLE,INTENT(OUT)
 arguments [PR92178]

gcc/fortran/ChangeLog:

PR fortran/92178
* trans-expr.cc (gfc_conv_procedure_call): Check procedures for
allocatable dummy arguments with INTENT(OUT) and move deallocation
of actual arguments after evaluation of argument expressions before
the procedure is executed.

gcc/testsuite/ChangeLog:

PR fortran/92178
* gfortran.dg/pr92178.f90: New test.
* gfortran.dg/pr92178_2.f90: New test.

Co-authored-by: Steven G. Kargl 
---
 gcc/fortran/trans-expr.cc   | 52 ++--
 gcc/testsuite/gfortran.dg/pr92178.f90   | 83 +
 gcc/testsuite/gfortran.dg/pr92178_2.f90 | 46 ++
 3 files changed, 177 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/pr92178.f90
 create mode 100644 gcc/testsuite/gfortran.dg/pr92178_2.f90

diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 30946ba3f63..16e8f037cfc 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc

(...)

@@ -6117,6 +6118,33 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
   && UNLIMITED_POLY (sym)
   && comp && (strcmp ("_copy", comp->name) == 0);

+  /* First scan argument list for allocatable actual arguments passed to
+ allocatable dummy arguments with INTENT(OUT).  As the corresponding
+ actual arguments are deallocated before execution of the procedure, we
+ evaluate actual argument expressions to avoid problems with possible
+ dependencies.  */
+  bool force_eval_args = false;
+  gfc_formal_arglist *tmp_formal;
+  for (arg = args, tmp_formal = formal; arg != NULL;
+   arg = arg->next, tmp_formal = tmp_formal ? tmp_formal->next : NULL)
+{
+  e = arg->expr;
+  fsym = tmp_formal ? tmp_formal->sym : NULL;
+  if (e && fsym
+ && e->expr_type == EXPR_VARIABLE
+ && fsym->attr.intent == INTENT_OUT
+ && (fsym->ts.type == BT_CLASS && fsym->attr.class_ok
+ ? CLASS_DATA (fsym)->attr.allocatable
+ : fsym->attr.allocatable)
+ && e->symtree
+ && e->symtree->n.sym
+ && gfc_variable_attr (e, NULL).allocatable)
+   {
+ force_eval_args = true;
+ break;
+   }
+}
+
The function is already big enough, would you mind outlining this to its 
own function?



   /* Evaluate the arguments.  */
   for (arg = args, argc = 0; arg != NULL;
arg = arg->next, formal = formal ? formal->next : NULL, ++argc)
@@ -6680,7 +6708,7 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
  else
tmp = gfc_finish_block ();

- gfc_add_expr_to_block (>pre, tmp);
+ gfc_add_expr_to_block (_blk, tmp);
}

  /* A class array element needs converting back to be a
@@ -6980,7 +7008,7 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
build_empty_stmt (input_location));
  }
if (tmp != NULL_TREE)
- gfc_add_expr_to_block (>pre, tmp);
+ gfc_add_expr_to_block (_blk, tmp);
  }

  tmp = parmse.expr;
@@ -7004,7 +7032,7 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
 void_type_node,
 gfc_conv_expr_present (e->symtree->n.sym),
   tmp, build_empty_stmt (input_location));
- gfc_add_expr_to_block (>pre, tmp);
+ gfc_add_expr_to_block (_blk, tmp);
}
}
}
These look good, but I'm surprised that there is no similar change at 
the 6819 line.

This is the class array actual vs class

Re: [PATCH v1] RISC-V: Fix one typo for emit_mode_set.

2023-07-03 Thread juzhe.zh...@rivai.ai

LGTM



juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-07-03 18:57
To: gcc-patches
CC: juzhe.zhong; jeffreyalaw; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Fix one typo for emit_mode_set.
From: Pan Li 
 
This patch fixes one typo (scaler, which should be scalar) in
emit_mode_set and makes a minor change to how the move is emitted.
 
Signed-off-by: Pan Li 
 
gcc/ChangeLog:
 
* config/riscv/riscv.cc (riscv_emit_mode_set): Fix typo.
---
gcc/config/riscv/riscv.cc | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
 
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e4dc8115e69..7761e946761 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7672,11 +7672,11 @@ riscv_emit_mode_set (int entity, int mode, int 
prev_mode,
 case RISCV_FRM:
   if (mode != FRM_MODE_NONE && mode != prev_mode)
{
-   rtx scaler = gen_reg_rtx (SImode);
+   rtx scalar = gen_reg_rtx (SImode);
  rtx imm = gen_int_mode (mode, SImode);
-   emit_insn (gen_movsi (scaler, imm));
-   emit_insn (gen_fsrm (scaler, scaler));
+   emit_move_insn (scalar, imm);
+   emit_insn (gen_fsrm (scalar, scalar));
}
   break;
 default:
-- 
2.34.1

[PATCH V7] Machine Description: Add LEN_MASK_{GATHER_LOAD, SCATTER_STORE} pattern

2023-07-03 Thread juzhe . zhong

From: Ju-Zhe Zhong 

Hi, Richi and Richard.

Based on the review comments from Richard:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623405.html

I changed the len_mask_gather_load/len_mask_scatter_store operand order to:
{len,bias,mask}

We now add the len and mask operands using add_len_and_mask_args,
which is the same as for partial_load/partial_store.

Now the code is more reasonable and easier to maintain.

This patch adds LEN_MASK_{GATHER_LOAD,SCATTER_STORE} to allow targets to
handle flow control by mask and loop control by length on gather/scatter memory
operations. Consider the following case:

#include <stdint.h>
void
f (uint8_t *restrict a,
   uint8_t *restrict b, int n,
   int base, int step,
   int *restrict cond)
{
  for (int i = 0; i < n; ++i)
{
  if (cond[i])
a[i * step + base] = b[i * step + base];
}
}

We hope RVV can vectorize such a case into the following IR:

loop_len = SELECT_VL
control_mask = comparison
v = LEN_MASK_GATHER_LOAD (.., loop_len, bias, control_mask)
LEN_MASK_SCATTER_STORE (... v, ..., loop_len, bias, control_mask)

This patch doesn't apply these patterns in the vectorizer; it just adds the
patterns and updates the documentation.

I will send a patch which applies these patterns in the vectorizer soon after
this patch is approved.

Ok for trunk?

gcc/ChangeLog:

* doc/md.texi: Add len_mask_gather_load/len_mask_scatter_store.
* internal-fn.cc (expand_scatter_store_optab_fn): Ditto.
(expand_gather_load_optab_fn): Ditto.
(internal_load_fn_p): Ditto.
(internal_store_fn_p): Ditto.
(internal_gather_scatter_fn_p): Ditto.
(internal_fn_len_index): Ditto.
(internal_fn_mask_index): Ditto.
(internal_fn_stored_value_index): Ditto.
* internal-fn.def (LEN_MASK_GATHER_LOAD): Ditto.
(LEN_MASK_SCATTER_STORE): Ditto.
* optabs.def (OPTAB_CD): Ditto.

---
 gcc/doc/md.texi | 17 +
 gcc/internal-fn.cc  | 32 +---
 gcc/internal-fn.def |  8 ++--
 gcc/optabs.def  |  2 ++
 4 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 5e5482265cd..f14dd32b2dc 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5040,6 +5040,15 @@ operand 5.  Bit @var{i} of the mask is set if element 
@var{i}
 of the result should be loaded from memory and clear if element @var{i}
 of the result should be set to zero.
 
+@cindex @code{len_mask_gather_load@var{m}@var{n}} instruction pattern
+@item @samp{len_mask_gather_load@var{m}@var{n}}
+Like @samp{gather_load@var{m}@var{n}}, but takes an extra length operand 
(operand 5),
+a bias operand (operand 6) as well as a mask operand (operand 7).  Similar to 
len_maskload,
+the instruction loads at most (operand 5 + operand 6) elements from memory.
+Bit @var{i} of the mask is set if element @var{i} of the result should
+be loaded from memory and clear if element @var{i} of the result should be 
undefined.
+Mask elements @var{i} with @var{i} > (operand 5 + operand 6) are ignored.
+
 @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
 @item @samp{scatter_store@var{m}@var{n}}
 Store a vector of mode @var{m} into several distinct memory locations.
@@ -5069,6 +5078,14 @@ Like @samp{scatter_store@var{m}@var{n}}, but takes an 
extra mask operand as
 operand 5.  Bit @var{i} of the mask is set if element @var{i}
 of the result should be stored to memory.
 
+@cindex @code{len_mask_scatter_store@var{m}@var{n}} instruction pattern
+@item @samp{len_mask_scatter_store@var{m}@var{n}}
+Like @samp{scatter_store@var{m}@var{n}}, but takes an extra length operand 
(operand 5),
+a bias operand (operand 6) as well as a mask operand (operand 7).  The 
instruction stores
+at most (operand 5 + operand 6) elements of (operand 4) to memory.
+Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be 
stored.
+Mask elements @var{i} with @var{i} > (operand 5 + operand 6) are ignored.
+
 @cindex @code{vec_set@var{m}} instruction pattern
 @item @samp{vec_set@var{m}}
 Set given field in the vector value.  Operand 0 is the vector to modify,
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index c1fcb38b17b..303df102d81 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3507,7 +3507,6 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, 
direct_optab optab)
 {
   internal_fn ifn = gimple_call_internal_fn (stmt);
   int rhs_index = internal_fn_stored_value_index (ifn);
-  int mask_index = internal_fn_mask_index (ifn);
   tree base = gimple_call_arg (stmt, 0);
   tree offset = gimple_call_arg (stmt, 1);
   tree scale = gimple_call_arg (stmt, 2);
@@ -3518,19 +3517,14 @@ expand_scatter_store_optab_fn (internal_fn, gcall 
*stmt, direct_optab optab)
   HOST_WIDE_INT scale_int = tree_to_shwi (scale);
   rtx rhs_rtx = expand_normal (rhs);
 
-  class expand_operand ops[6];
+  class expand_operand ops[8];
   int i = 0;
   create_address_operand ([i++], base_rtx);
   create_input_operand ([i++], offset_rtx,

[PATCH v2] RISC-V: Add support for vector crypto extensions

2023-07-03 Thread Christoph Muellner

From: Christoph Müllner 

This series adds basic support for the vector crypto extensions:
* Zvbb
* Zvbc
* Zvkg
* Zvkned
* Zvkhn[a,b]
* Zvksed
* Zvksh
* Zvkn
* Zvknc
* Zvkng
* Zvks
* Zvksc
* Zvksg
* Zvkt

This patch is based on the v20230620 version of the Vector Cryptography
specification. The specification is frozen and can be found here:
  https://github.com/riscv/riscv-crypto/releases/tag/v20230620

Binutils support has been merged upstream a few days ago.

All extensions come with tests for the feature test macros.

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: Add support for zvbb,
zvbc, zvkg, zvkned, zvknha, zvknhb, zvksed, zvksh, zvkn,
zvknc, zvkng, zvks, zvksc, zvksg, zvkt and the implied subsets.
* config/riscv/arch-canonicalize: Add canonicalization info for
zvkn, zvknc, zvkng, zvks, zvksc, zvksg.
* config/riscv/riscv-opts.h (MASK_ZVBB): New macro.
(MASK_ZVBC): Likewise.
(TARGET_ZVBB): Likewise.
(TARGET_ZVBC): Likewise.
(MASK_ZVKG): Likewise.
(MASK_ZVKNED): Likewise.
(MASK_ZVKNHA): Likewise.
(MASK_ZVKNHB): Likewise.
(MASK_ZVKSED): Likewise.
(MASK_ZVKSH): Likewise.
(MASK_ZVKN): Likewise.
(MASK_ZVKNC): Likewise.
(MASK_ZVKNG): Likewise.
(MASK_ZVKS): Likewise.
(MASK_ZVKSC): Likewise.
(MASK_ZVKSG): Likewise.
(MASK_ZVKT): Likewise.
(TARGET_ZVKG): Likewise.
(TARGET_ZVKNED): Likewise.
(TARGET_ZVKNHA): Likewise.
(TARGET_ZVKNHB): Likewise.
(TARGET_ZVKSED): Likewise.
(TARGET_ZVKSH): Likewise.
(TARGET_ZVKN): Likewise.
(TARGET_ZVKNC): Likewise.
(TARGET_ZVKNG): Likewise.
(TARGET_ZVKS): Likewise.
(TARGET_ZVKSC): Likewise.
(TARGET_ZVKSG): Likewise.
(TARGET_ZVKT): Likewise.
* config/riscv/riscv.opt: Introduction of riscv_zv{b,k}_subext.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/zvbb.c: New test.
* gcc.target/riscv/zvbc.c: New test.
* gcc.target/riscv/zvkg.c: New test.
* gcc.target/riscv/zvkn-1.c: New test.
* gcc.target/riscv/zvkn.c: New test.
* gcc.target/riscv/zvknc-1.c: New test.
* gcc.target/riscv/zvknc-2.c: New test.
* gcc.target/riscv/zvknc.c: New test.
* gcc.target/riscv/zvkned.c: New test.
* gcc.target/riscv/zvkng-1.c: New test.
* gcc.target/riscv/zvkng-2.c: New test.
* gcc.target/riscv/zvkng.c: New test.
* gcc.target/riscv/zvknha.c: New test.
* gcc.target/riscv/zvknhb.c: New test.
* gcc.target/riscv/zvks-1.c: New test.
* gcc.target/riscv/zvks.c: New test.
* gcc.target/riscv/zvksc-1.c: New test.
* gcc.target/riscv/zvksc-2.c: New test.
* gcc.target/riscv/zvksc.c: New test.
* gcc.target/riscv/zvksed.c: New test.
* gcc.target/riscv/zvksg-1.c: New test.
* gcc.target/riscv/zvksg-2.c: New test.
* gcc.target/riscv/zvksg.c: New test.
* gcc.target/riscv/zvksh.c: New test.
* gcc.target/riscv/zvkt.c: New test.

Signed-off-by: Christoph Müllner 
---
Changes for v2:
- Update patch for specification version v20230620

 gcc/common/config/riscv/riscv-common.cc  | 55 
 gcc/config/riscv/arch-canonicalize   |  7 +++
 gcc/config/riscv/riscv-opts.h| 34 +++
 gcc/config/riscv/riscv.opt   |  6 +++
 gcc/testsuite/gcc.target/riscv/zvbb.c| 13 ++
 gcc/testsuite/gcc.target/riscv/zvbc.c| 13 ++
 gcc/testsuite/gcc.target/riscv/zvkg.c| 13 ++
 gcc/testsuite/gcc.target/riscv/zvkn-1.c  | 29 +
 gcc/testsuite/gcc.target/riscv/zvkn.c| 29 +
 gcc/testsuite/gcc.target/riscv/zvknc-1.c | 37 
 gcc/testsuite/gcc.target/riscv/zvknc-2.c | 37 
 gcc/testsuite/gcc.target/riscv/zvknc.c   | 37 
 gcc/testsuite/gcc.target/riscv/zvkned.c  | 13 ++
 gcc/testsuite/gcc.target/riscv/zvkng-1.c | 37 
 gcc/testsuite/gcc.target/riscv/zvkng-2.c | 37 
 gcc/testsuite/gcc.target/riscv/zvkng.c   | 37 
 gcc/testsuite/gcc.target/riscv/zvknha.c  | 13 ++
 gcc/testsuite/gcc.target/riscv/zvknhb.c  | 13 ++
 gcc/testsuite/gcc.target/riscv/zvks-1.c  | 29 +
 gcc/testsuite/gcc.target/riscv/zvks.c| 29 +
 gcc/testsuite/gcc.target/riscv/zvksc-1.c | 37 
 gcc/testsuite/gcc.target/riscv/zvksc-2.c | 37 
 gcc/testsuite/gcc.target/riscv/zvksc.c   | 37 
 gcc/testsuite/gcc.target/riscv/zvksed.c  | 13 ++
 gcc/testsuite/gcc.target/riscv/zvksg-1.c | 37 
 gcc/testsuite/gcc.target/riscv/zvksg-2.c | 37 
 gcc/testsuite/gcc.target/riscv/zvksg.c   | 37 
 gcc/testsuite/gcc.target/riscv/zvksh.c   | 13 ++
 gcc/testsuite/gcc.target/riscv/zvkt.c| 13 ++
 29

Re: [PATCH] gcc-ar: Handle response files properly [PR77576]

2023-07-03 Thread Costas Argyris via Gcc-patches

I should also add that for a rsp file that contains just "--version":

gcc-ar @rsp

fails without the patch (current problem) and successfully prints
the version info with it.

On Sat, 1 Jul 2023 at 22:45, Costas Argyris 
wrote:

> Basically implementing what Andrew said in the PR:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77576
>
> If @file has been passed to gcc-ar, do the following:
>
> 1) Expand it to get an argv without any @files.
> 2) Then apply the plugin modifications to argv.
> 3) Create temporary response file.
> 4) Put the modified argv in the temporary file.
> 5) Call ar with @tmp.
> 6) Delete the temporary response file.
>

[PATCH v1] RISC-V: Fix one typo for emit_mode_set.

2023-07-03 Thread Pan Li via Gcc-patches

From: Pan Li 

This patch fixes one typo (scaler, which should be scalar) in
emit_mode_set and makes a minor change to how the move is emitted.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_emit_mode_set): Fix typo.
---
 gcc/config/riscv/riscv.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index e4dc8115e69..7761e946761 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7672,11 +7672,11 @@ riscv_emit_mode_set (int entity, int mode, int 
prev_mode,
 case RISCV_FRM:
   if (mode != FRM_MODE_NONE && mode != prev_mode)
{
- rtx scaler = gen_reg_rtx (SImode);
+ rtx scalar = gen_reg_rtx (SImode);
  rtx imm = gen_int_mode (mode, SImode);
 
- emit_insn (gen_movsi (scaler, imm));
- emit_insn (gen_fsrm (scaler, scaler));
+ emit_move_insn (scalar, imm);
+ emit_insn (gen_fsrm (scalar, scalar));
}
   break;
 default:
-- 
2.34.1

Re: [PATCH] gimple-isel: Recognize vec_extract pattern.

2023-07-03 Thread Richard Biener via Gcc-patches

On Mon, 3 Jul 2023, Robin Dapp wrote:

> Hi,
> 
> In gimple-isel we already deduce a vec_set pattern from an
> ARRAY_REF(VIEW_CONVERT_EXPR).  This patch does the same for a
> vec_extract.
> 
> The code is largely similar to the vec_set one including
> the addition of a can_vec_extract_var_idx_p function
> in optabs.cc to check if the backend can handle a register
> operand as index.  We already have can_vec_extract in
> optabs-query but that one checks whether we can extract
> specific modes.
> 
> With the introduction of an internal function for vec_extract
> the expander must not FAIL.  For vec_set this has already been
> the case so adjust the documentation accordingly.
> 
> Additionally, clarify the wording of the vector-vector case for
> vec_extract.
> 
> During testing I noticed that the aarch64 simd vec_extract
> expander is the only one that FAILs.  Richard is currently
> testing a patch that tries to remove this.   Bootstrap and
> testsuite was unchanged on x86 and power.
> 
> I was a bit torn whether to add a separate function to recognize
> vec_extract or not and ended up doing it inline with several
> is_extract checks.
> 
> Regards
>  Robin
> 
> gcc/ChangeLog:
> 
>   * doc/md.texi: Document that vec_set and vec_extract must not
>   fail.
>   * gimple-isel.cc (gimple_expand_vec_set_expr): Rename this...
>   (gimple_expand_vec_set_extract_expr): ...to this.
>   (gimple_expand_vec_exprs): Call renamed function.
>   * internal-fn.cc (vec_extract_direct): Add.
>   (expand_vec_extract_optab_fn): New function to expand
>   vec_extract optab.
>   (direct_vec_extract_optab_supported_p): Add.
>   * internal-fn.def (VEC_EXTRACT): Add.
>   * optabs.cc (can_vec_extract_var_idx_p): New function.
>   * optabs.h (can_vec_extract_var_idx_p): Declare.
> ---
>  gcc/doc/md.texi |  7 +++-
>  gcc/gimple-isel.cc  | 85 +
>  gcc/internal-fn.cc  | 39 +
>  gcc/internal-fn.def |  1 +
>  gcc/optabs.cc   | 24 +
>  gcc/optabs.h|  1 +
>  6 files changed, 141 insertions(+), 16 deletions(-)
> 
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 9648fdc846a..c61602fb04d 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5074,6 +5074,8 @@ of the result should be stored to memory.
>  Set given field in the vector value.  Operand 0 is the vector to modify,
>  operand 1 is new value of field and operand 2 specify the field index.
>  
> +This pattern is not allowed to @code{FAIL}.
> +
>  @cindex @code{vec_extract@var{m}@var{n}} instruction pattern
>  @item @samp{vec_extract@var{m}@var{n}}
>  Extract given field from the vector value.  Operand 1 is the vector, operand 
> 2
> @@ -5081,7 +5083,10 @@ specify field index and operand 0 place to store value 
> into.  The
>  @var{n} mode is the mode of the field or vector of fields that should be
>  extracted, should be either element mode of the vector mode @var{m}, or
>  a vector mode with the same element mode and smaller number of elements.
> -If @var{n} is a vector mode, the index is counted in units of that mode.
> +If @var{n} is a vector mode the index is counted in multiples of
> +mode @var{n}.
> +
> +This pattern is not allowed to @code{FAIL}.
>  
>  @cindex @code{vec_init@var{m}@var{n}} instruction pattern
>  @item @samp{vec_init@var{m}@var{n}}
> diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
> index ef688ddb57f..d6c560b63dd 100644
> --- a/gcc/gimple-isel.cc
> +++ b/gcc/gimple-isel.cc
> @@ -42,15 +42,26 @@ along with GCC; see the file COPYING3.  If not see
>  
>  /* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
> internal function based on vector type of selected expansion.
> -   i.e.:
> +
> +   For vec_set:
> +
>   VIEW_CONVERT_EXPR(u)[_1] = i_4(D);
> =>
>   _7 = u;
>   _8 = .VEC_SET (_7, i_4(D), _1);
> - u = _8;  */
> + u = _8;
> +
> +   For vec_extract:
> +
> +  _3 = VIEW_CONVERT_EXPR(vD.2208)[idx_2(D)];
> +   =>
> +  _4 = vD.2208;
> +  _5 = .VEC_EXTRACT (_4, idx_2(D));
> +  _3 = _5;  */
>  
>  static bool
> -gimple_expand_vec_set_expr (struct function *fun, gimple_stmt_iterator *gsi)
> +gimple_expand_vec_set_extract_expr (struct function *fun,
> + gimple_stmt_iterator *gsi)
>  {
>enum tree_code code;
>gcall *new_stmt = NULL;
> @@ -62,45 +73,88 @@ gimple_expand_vec_set_expr (struct function *fun, 
> gimple_stmt_iterator *gsi)
>if (!stmt)
>  return false;
>  
> +  bool is_extract = false;
> +
>tree lhs = gimple_assign_lhs (stmt);
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +  tree val, op0;
>code = TREE_CODE (lhs);
> -  if (code != ARRAY_REF)
> -return false;
> +  if (code == ARRAY_REF)
> +{
> +  /* Assume it is a vec_set.  */
> +  val = rhs;
> +  op0 = TREE_OPERAND (lhs, 0);
> +}
> +  else
> +{
> +  /* It can still be a vec_extract.  */
> +  code = TREE_CODE

[PATCH] gimple-isel: Recognize vec_extract pattern.

2023-07-03 Thread Robin Dapp via Gcc-patches

Hi,

In gimple-isel we already deduce a vec_set pattern from an
ARRAY_REF(VIEW_CONVERT_EXPR).  This patch does the same for a
vec_extract.

The code is largely similar to the vec_set one including
the addition of a can_vec_extract_var_idx_p function
in optabs.cc to check if the backend can handle a register
operand as index.  We already have can_vec_extract in
optabs-query but that one checks whether we can extract
specific modes.

With the introduction of an internal function for vec_extract
the expander must not FAIL.  For vec_set this has already been
the case so adjust the documentation accordingly.

Additionally, clarify the wording of the vector-vector case for
vec_extract.

During testing I noticed that the aarch64 simd vec_extract
expander is the only one that FAILs.  Richard is currently
testing a patch that tries to remove this.  Bootstrap and
testsuite were unchanged on x86 and power.

I was a bit torn whether to add a separate function to recognize
vec_extract or not and ended up doing it inline with several
is_extract checks.

Regards
 Robin

gcc/ChangeLog:

* doc/md.texi: Document that vec_set and vec_extract must not
fail.
* gimple-isel.cc (gimple_expand_vec_set_expr): Rename this...
(gimple_expand_vec_set_extract_expr): ...to this.
(gimple_expand_vec_exprs): Call renamed function.
* internal-fn.cc (vec_extract_direct): Add.
(expand_vec_extract_optab_fn): New function to expand
vec_extract optab.
(direct_vec_extract_optab_supported_p): Add.
* internal-fn.def (VEC_EXTRACT): Add.
* optabs.cc (can_vec_extract_var_idx_p): New function.
* optabs.h (can_vec_extract_var_idx_p): Declare.
---
 gcc/doc/md.texi |  7 +++-
 gcc/gimple-isel.cc  | 85 +
 gcc/internal-fn.cc  | 39 +
 gcc/internal-fn.def |  1 +
 gcc/optabs.cc   | 24 +
 gcc/optabs.h|  1 +
 6 files changed, 141 insertions(+), 16 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 9648fdc846a..c61602fb04d 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5074,6 +5074,8 @@ of the result should be stored to memory.
 Set given field in the vector value.  Operand 0 is the vector to modify,
 operand 1 is new value of field and operand 2 specify the field index.
 
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{vec_extract@var{m}@var{n}} instruction pattern
 @item @samp{vec_extract@var{m}@var{n}}
 Extract given field from the vector value.  Operand 1 is the vector, operand 2
@@ -5081,7 +5083,10 @@ specify field index and operand 0 place to store value 
into.  The
 @var{n} mode is the mode of the field or vector of fields that should be
 extracted, should be either element mode of the vector mode @var{m}, or
 a vector mode with the same element mode and smaller number of elements.
-If @var{n} is a vector mode, the index is counted in units of that mode.
+If @var{n} is a vector mode the index is counted in multiples of
+mode @var{n}.
+
+This pattern is not allowed to @code{FAIL}.
 
 @cindex @code{vec_init@var{m}@var{n}} instruction pattern
 @item @samp{vec_init@var{m}@var{n}}
diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index ef688ddb57f..d6c560b63dd 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
@@ -42,15 +42,26 @@ along with GCC; see the file COPYING3.  If not see
 
 /* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
internal function based on vector type of selected expansion.
-   i.e.:
+
+   For vec_set:
+
  VIEW_CONVERT_EXPR(u)[_1] = i_4(D);
=>
  _7 = u;
  _8 = .VEC_SET (_7, i_4(D), _1);
- u = _8;  */
+ u = _8;
+
+   For vec_extract:
+
+  _3 = VIEW_CONVERT_EXPR(vD.2208)[idx_2(D)];
+   =>
+  _4 = vD.2208;
+  _5 = .VEC_EXTRACT (_4, idx_2(D));
+  _3 = _5;  */
 
 static bool
-gimple_expand_vec_set_expr (struct function *fun, gimple_stmt_iterator *gsi)
+gimple_expand_vec_set_extract_expr (struct function *fun,
+   gimple_stmt_iterator *gsi)
 {
   enum tree_code code;
   gcall *new_stmt = NULL;
@@ -62,45 +73,88 @@ gimple_expand_vec_set_expr (struct function *fun, 
gimple_stmt_iterator *gsi)
   if (!stmt)
 return false;
 
+  bool is_extract = false;
+
   tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  tree val, op0;
   code = TREE_CODE (lhs);
-  if (code != ARRAY_REF)
-return false;
+  if (code == ARRAY_REF)
+{
+  /* Assume it is a vec_set.  */
+  val = rhs;
+  op0 = TREE_OPERAND (lhs, 0);
+}
+  else
+{
+  /* It can still be a vec_extract.  */
+  code = TREE_CODE (rhs);
+  if (code != ARRAY_REF)
+   return false;
+
+  /* Sides are swapped for vec_extract.  */
+  is_extract = true;
+  val = lhs;
+  op0 = TREE_OPERAND (rhs, 0);
+}
 
-  tree val = gimple_assign_rhs1 (stmt);
-  tree op0 = TREE_OPERAND (lhs, 0);
   if

Re: [PATCH 1/2] c++, libstdc++: implement __is_scalar built-in trait

2023-07-03 Thread Ken Matsui via Gcc-patches

Hi,

Here is the benchmark result for is_scalar:

https://github.com/ken-matsui/gcc-benches/blob/main/is_scalar.md#mon-jul--3-022250-am-pdt-2023

Time: -90.6237%
Peak Memory Usage: -78.5155%
Total Memory Usage: -82.1901%

Sincerely,
Ken Matsui

On Mon, Jul 3, 2023 at 2:14 AM Ken Matsui  wrote:
>
> This patch implements built-in trait for std::is_scalar. The existent
> __is_scalar codes were replaced with is_scalar to avoid unintentional
> macro replacement by the new built-in.
>
> gcc/cp/ChangeLog:
>
> * cp-trait.def: Define __is_scalar.
> * constraint.cc (diagnose_trait_expr): Handle CPTK_IS_SCALAR.
> * semantics.cc (trait_expr_value): Likewise.
> (finish_trait_expr): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * g++.dg/ext/has-builtin-1.C: Test existence of __is_scalar.
> * g++.dg/ext/is_scalar.C: New test.
> * g++.dg/tm/pr46567.C: Use is_scalar instead.
> * g++.dg/torture/pr57107.C: Likewise.
>
> libstdc++-v3/ChangeLog:
>
> * include/bits/cpp_type_traits.h (__is_scalar): Rename to ...
> (is_scalar): ... this.
> * include/bits/stl_algobase.h: Use is_scalar instead.
> * include/bits/valarray_array.h: Likewise.
>
> Signed-off-by: Ken Matsui 
> ---
>  gcc/cp/constraint.cc|  3 ++
>  gcc/cp/cp-trait.def |  1 +
>  gcc/cp/semantics.cc |  4 +++
>  gcc/testsuite/g++.dg/ext/has-builtin-1.C|  3 ++
>  gcc/testsuite/g++.dg/ext/is_scalar.C| 31 +
>  gcc/testsuite/g++.dg/tm/pr46567.C   | 10 +++
>  gcc/testsuite/g++.dg/torture/pr57107.C  |  4 +--
>  libstdc++-v3/include/bits/cpp_type_traits.h |  2 +-
>  libstdc++-v3/include/bits/stl_algobase.h|  8 +++---
>  libstdc++-v3/include/bits/valarray_array.h  |  2 +-
>  10 files changed, 55 insertions(+), 13 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/ext/is_scalar.C
>
> diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
> index 8cf0f2d0974..4c27f2a3a62 100644
> --- a/gcc/cp/constraint.cc
> +++ b/gcc/cp/constraint.cc
> @@ -3751,6 +3751,9 @@ diagnose_trait_expr (tree expr, tree args)
>  case CPTK_IS_UNION:
>inform (loc, "  %qT is not a union", t1);
>break;
> +case CPTK_IS_SCALAR:
> +  inform (loc, "  %qT is not a scalar type", t1);
> +  break;
>  case CPTK_IS_AGGREGATE:
>inform (loc, "  %qT is not an aggregate", t1);
>break;
> diff --git a/gcc/cp/cp-trait.def b/gcc/cp/cp-trait.def
> index 8b7fece0cc8..59ae087c457 100644
> --- a/gcc/cp/cp-trait.def
> +++ b/gcc/cp/cp-trait.def
> @@ -82,6 +82,7 @@ DEFTRAIT_EXPR (IS_TRIVIALLY_ASSIGNABLE, 
> "__is_trivially_assignable", 2)
>  DEFTRAIT_EXPR (IS_TRIVIALLY_CONSTRUCTIBLE, "__is_trivially_constructible", 
> -1)
>  DEFTRAIT_EXPR (IS_TRIVIALLY_COPYABLE, "__is_trivially_copyable", 1)
>  DEFTRAIT_EXPR (IS_UNION, "__is_union", 1)
> +DEFTRAIT_EXPR (IS_SCALAR, "__is_scalar", 1)
>  DEFTRAIT_EXPR (REF_CONSTRUCTS_FROM_TEMPORARY, 
> "__reference_constructs_from_temporary", 2)
>  DEFTRAIT_EXPR (REF_CONVERTS_FROM_TEMPORARY, 
> "__reference_converts_from_temporary", 2)
>  /* FIXME Added space to avoid direct usage in GCC 13.  */
> diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
> index 8fb47fd179e..3edc7f23212 100644
> --- a/gcc/cp/semantics.cc
> +++ b/gcc/cp/semantics.cc
> @@ -12118,6 +12118,9 @@ trait_expr_value (cp_trait_kind kind, tree type1, 
> tree type2)
>  case CPTK_IS_UNION:
>return type_code1 == UNION_TYPE;
>
> +case CPTK_IS_SCALAR:
> +  return SCALAR_TYPE_P (type1);
> +
>  case CPTK_IS_ASSIGNABLE:
>return is_xible (MODIFY_EXPR, type1, type2);
>
> @@ -12296,6 +12299,7 @@ finish_trait_expr (location_t loc, cp_trait_kind 
> kind, tree type1, tree type2)
>  case CPTK_IS_ENUM:
>  case CPTK_IS_UNION:
>  case CPTK_IS_SAME:
> +case CPTK_IS_SCALAR:
>break;
>
>  case CPTK_IS_LAYOUT_COMPATIBLE:
> diff --git a/gcc/testsuite/g++.dg/ext/has-builtin-1.C 
> b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
> index f343e153e56..75acbdfb9fc 100644
> --- a/gcc/testsuite/g++.dg/ext/has-builtin-1.C
> +++ b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
> @@ -146,3 +146,6 @@
>  #if !__has_builtin (__remove_cvref)
>  # error "__has_builtin (__remove_cvref) failed"
>  #endif
> +#if !__has_builtin (__is_scalar)
> +# error "__has_builtin (__is_scalar) failed"
> +#endif
> diff --git a/gcc/testsuite/g++.dg/ext/is_scalar.C 
> b/gcc/testsuite/g++.dg/ext/is_scalar.C
> new file mode 100644
> index 000..457fddc52fc
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/ext/is_scalar.C
> @@ -0,0 +1,31 @@
> +// { dg-do compile { target c++11 } }
> +
> +#include   // std::nullptr_t
> +#include 
> +
> +using namespace __gnu_test;
> +
> +#define SA(X) static_assert((X),#X)
> +
> +#define SA_TEST_CATEGORY(TRAIT, TYPE, EXPECT)  \
> +  SA(TRAIT(TYPE) == EXPECT);   \
> +  SA(TRAIT(const TYPE) ==

Re: Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering

2023-07-03 Thread Lehua Ding

Committed, thanks Robin and Jeff.


-- Original --
From:   
 "juzhe.zh...@rivai.ai"

Re: [PATCH V2] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread Robin Dapp via Gcc-patches



> Similar to LEN_MASK_LOAD/STORE, their orders are consistent now after
> this patch.
Ah right, apologies.

Regards
 Robin

Re: [PATCH] RISC-V: Support vfwnmacc/vfwmsac/vfwnmsac combine lowering

2023-07-03 Thread Lehua Ding

Committed, thanks Robin, Kito, and Jeff.


--Original--
From: "juzhe.zh...@rivai.ai"

Re: [PATCH V2] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread Richard Sandiford via Gcc-patches

juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong 
>
> Hi, Richard. I fixed the order as you suggested.
>
> Before this patch, the order is {len,mask,bias}.
>
> Now, after this patch, the order becomes {len,bias,mask}.
>
> Since you said we should not need 'internal_fn_bias_index', the bias index 
> should always be the len index + 1.
> I notice LEN_STORE order is {len,vector,bias}, to make them consistent, I 
> reorder into LEN_STORE {len,bias,vector}.
> Just like MASK_STORE {mask,vector}.
>
> Ok for trunk ?
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md: Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> * config/riscv/riscv-v.cc (expand_load_store): Ditto.
> * doc/md.texi: Ditto.
> * gimple-fold.cc (gimple_fold_partial_load_store_mem_ref): Ditto.
> * internal-fn.cc (len_maskload_direct): Ditto.
> (len_maskstore_direct): Ditto.
> (add_len_and_mask_args): New function.
> (expand_partial_load_optab_fn): Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> (expand_partial_store_optab_fn): Ditto.
> (internal_fn_len_index): New function.
> (internal_fn_mask_index): Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> (internal_fn_stored_value_index): Ditto.
> (internal_len_load_store_bias): Ditto.
> * internal-fn.h (internal_fn_len_index): New function.
> * tree-ssa-dse.cc (initialize_ao_ref_for_dse): Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.

OK, thanks.

Richard

> ---
>  gcc/config/riscv/autovec.md |   8 +-
>  gcc/config/riscv/riscv-v.cc |   2 +-
>  gcc/doc/md.texi |  16 ++--
>  gcc/gimple-fold.cc  |   8 +-
>  gcc/internal-fn.cc  | 156 ++--
>  gcc/internal-fn.h   |   1 +
>  gcc/tree-ssa-dse.cc |  11 +--
>  gcc/tree-vect-stmts.cc  |  11 +--
>  8 files changed, 107 insertions(+), 106 deletions(-)
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 1488f2be1be..4ab0e9f99eb 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -26,8 +26,8 @@
>[(match_operand:V 0 "register_operand")
> (match_operand:V 1 "memory_operand")
> (match_operand 2 "autovec_length_operand")
> -   (match_operand: 3 "vector_mask_operand")
> -   (match_operand 4 "const_0_operand")]
> +   (match_operand 3 "const_0_operand")
> +   (match_operand: 4 "vector_mask_operand")]
>"TARGET_VECTOR"
>  {
>riscv_vector::expand_load_store (operands, true);
> @@ -38,8 +38,8 @@
>[(match_operand:V 0 "memory_operand")
> (match_operand:V 1 "register_operand")
> (match_operand 2 "autovec_length_operand")
> -   (match_operand: 3 "vector_mask_operand")
> -   (match_operand 4 "const_0_operand")]
> +   (match_operand 3 "const_0_operand")
> +   (match_operand: 4 "vector_mask_operand")]
>"TARGET_VECTOR"
>  {
>riscv_vector::expand_load_store (operands, false);
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index adb8d7d36a5..8d5bed7ebe4 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -2777,7 +2777,7 @@ expand_load_store (rtx *ops, bool is_load)
>  {
>poly_int64 value;
>rtx len = ops[2];
> -  rtx mask = ops[3];
> +  rtx mask = ops[4];
>machine_mode mode = GET_MODE (ops[0]);
>  
>if (poly_int_rtx_p (len, ) && known_eq (value, GET_MODE_NUNITS 
> (mode)))
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index cefdee84821..5e5482265cd 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5302,15 +5302,15 @@ This pattern is not allowed to @code{FAIL}.
>  @cindex @code{len_maskload@var{m}@var{n}} instruction pattern
>  @item @samp{len_maskload@var{m}@var{n}}
>  Perform a masked load from the memory location pointed to by operand 1
> -into register operand 0.  (operand 2 + operand 4) elements are loaded from
> +into register operand 0.  (operand 2 + operand 3) elements are loaded from
>  memory and other elements in operand 0 are set to undefined values.
>  This is a combination of len_load and maskload.
>  Operands 0 and 1 have mode @var{m}, which must be a vector mode.  Operand 2
>  has whichever integer mode the target prefers.  A mask is specified in
> -operand 3 which must be of type @var{n}.  The mask has lower precedence than
> +operand 4 which must be of type @var{n}.  The mask has lower precedence than
>  the length and is itself subject to length masking,
> -i.e. only mask indices < (operand 2 + operand 4) are used.
> -Operand 4 conceptually has mode @code{QI}.
> +i.e. only mask indices < (operand 2 + operand 3) are used.
> +Operand 3 conceptually has mode @code{QI}.
>  
>  Operand 2 can be a variable or a constant amount.  Operand 4 specifies a
>  constant bias: it is

Re: Re: [PATCH V2] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread juzhe.zh...@rivai.ai

Take mask_store/MASK_STORE as an example.

In gimple IR: MASK_STORE (ptr, align, mask, v)
In maskstore RTL IR: maskstore (ptr, v, mask)

For LEN_STORE/len_store, after the adjustment:

In gimple IR: LEN_STORE (ptr, align, len, bias, v)
In len_store RTL IR: len_store (ptr, v, len, bias)

Similar to LEN_MASK_LOAD/STORE, their orders are consistent now after this 
patch.

Thanks.


juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-03 17:17
To: juzhe.zhong; gcc-patches
CC: rdapp.gcc; richard.sandiford; rguenther; linkw; krebbel
Subject: Re: [PATCH V2] Middle-end: Change order of 
LEN_MASK_LOAD/LEN_MASK_STORE arguments
Hi Juzhe,
 
when changing the argument order for LEN_LOAD/LEN_STORE, you will also
need to adjust rs6000's and s390's expanders. 
 
Regards
Robin

Re: Re: [PATCH V2] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread juzhe.zh...@rivai.ai

No, we don't need to.

len_load/len_store optab in backend

(define_expand "len_load_v16qi"
  [(match_operand:V16QI 0 "register_operand")
   (match_operand:V16QI 1 "memory_operand")
   (match_operand:QI 2 "register_operand")
   (match_operand:QI 3 "vll_bias_operand")
  ]
  "TARGET_VX && TARGET_64BIT"
{
  rtx mem = adjust_address (operands[1], BLKmode, 0);

  rtx len = gen_reg_rtx (SImode);
  emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
  emit_insn (gen_vllv16qi (operands[0], len, mem));
  DONE;
})

(define_expand "len_store_v16qi"
  [(match_operand:V16QI 0 "memory_operand")
   (match_operand:V16QI 1 "register_operand")
   (match_operand:QI 2 "register_operand")
   (match_operand:QI 3 "vll_bias_operand")
  ]
  "TARGET_VX && TARGET_64BIT"
{
  rtx mem = adjust_address (operands[0], BLKmode, 0);

  rtx len = gen_reg_rtx (SImode);
  emit_move_insn (len, gen_rtx_ZERO_EXTEND (SImode, operands[2]));
  emit_insn (gen_vstlv16qi (operands[1], len, mem));
  DONE;
});;

are already in the correct order {len,bias}. Only the Gimple IR needs to be adjusted.

I have already tested len_load/len_store optab.

Thanks.


juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-03 17:17
To: juzhe.zhong; gcc-patches
CC: rdapp.gcc; richard.sandiford; rguenther; linkw; krebbel
Subject: Re: [PATCH V2] Middle-end: Change order of 
LEN_MASK_LOAD/LEN_MASK_STORE arguments
Hi Juzhe,
 
when changing the argument order for LEN_LOAD/LEN_STORE, you will also
need to adjust rs6000's and s390's expanders. 
 
Regards
Robin

Re: [PATCH V2] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread Robin Dapp via Gcc-patches

Hi Juzhe,

when changing the argument order for LEN_LOAD/LEN_STORE, you will also
need to adjust rs6000's and s390's expanders. 

Regards
 Robin

[PATCH 2/2] libstdc++: use new built-in trait __is_scalar for std::is_scalar

2023-07-03 Thread Ken Matsui via Gcc-patches

This patch gets std::is_scalar to dispatch to the new built-in trait
__is_scalar.

libstdc++-v3/ChangeLog:

* include/std/type_traits (is_scalar): Use __is_scalar built-in
trait.
(is_scalar_v): Likewise.

Signed-off-by: Ken Matsui 
---
 libstdc++-v3/include/std/type_traits | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/libstdc++-v3/include/std/type_traits 
b/libstdc++-v3/include/std/type_traits
index 0e7a9c9c7f3..bc90b2c61ca 100644
--- a/libstdc++-v3/include/std/type_traits
+++ b/libstdc++-v3/include/std/type_traits
@@ -678,11 +678,18 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 struct is_member_pointer;
 
   /// is_scalar
+#if __has_builtin(__is_scalar)
+  template
+struct is_scalar
+: public __bool_constant<__is_scalar(_Tp)>
+{ };
+#else
   template
 struct is_scalar
 : public __or_, is_enum<_Tp>, is_pointer<_Tp>,
is_member_pointer<_Tp>, is_null_pointer<_Tp>>::type
 { };
+#endif
 
   /// is_compound
   template
@@ -3204,8 +3211,15 @@ template 
   inline constexpr bool is_fundamental_v = is_fundamental<_Tp>::value;
 template 
   inline constexpr bool is_object_v = is_object<_Tp>::value;
+
+#if __has_builtin(__is_scalar)
+template 
+  inline constexpr bool is_scalar_v = __is_scalar(_Tp);
+#else
 template 
   inline constexpr bool is_scalar_v = is_scalar<_Tp>::value;
+#endif
+
 template 
   inline constexpr bool is_compound_v = is_compound<_Tp>::value;
 template 
-- 
2.41.0

[PATCH 1/2] c++, libstdc++: implement __is_scalar built-in trait

2023-07-03 Thread Ken Matsui via Gcc-patches

This patch implements a built-in trait for std::is_scalar. The existing
uses of __is_scalar were renamed to is_scalar to avoid unintentional
macro replacement by the new built-in.

gcc/cp/ChangeLog:

* cp-trait.def: Define __is_scalar.
* constraint.cc (diagnose_trait_expr): Handle CPTK_IS_SCALAR.
* semantics.cc (trait_expr_value): Likewise.
(finish_trait_expr): Likewise.

gcc/testsuite/ChangeLog:

* g++.dg/ext/has-builtin-1.C: Test existence of __is_scalar.
* g++.dg/ext/is_scalar.C: New test.
* g++.dg/tm/pr46567.C: Use is_scalar instead.
* g++.dg/torture/pr57107.C: Likewise.

libstdc++-v3/ChangeLog:

* include/bits/cpp_type_traits.h (__is_scalar): Rename to ...
(is_scalar): ... this.
* include/bits/stl_algobase.h: Use is_scalar instead.
* include/bits/valarray_array.h: Likewise.

Signed-off-by: Ken Matsui 
---
 gcc/cp/constraint.cc|  3 ++
 gcc/cp/cp-trait.def |  1 +
 gcc/cp/semantics.cc |  4 +++
 gcc/testsuite/g++.dg/ext/has-builtin-1.C|  3 ++
 gcc/testsuite/g++.dg/ext/is_scalar.C| 31 +
 gcc/testsuite/g++.dg/tm/pr46567.C   | 10 +++
 gcc/testsuite/g++.dg/torture/pr57107.C  |  4 +--
 libstdc++-v3/include/bits/cpp_type_traits.h |  2 +-
 libstdc++-v3/include/bits/stl_algobase.h|  8 +++---
 libstdc++-v3/include/bits/valarray_array.h  |  2 +-
 10 files changed, 55 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_scalar.C

diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index 8cf0f2d0974..4c27f2a3a62 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -3751,6 +3751,9 @@ diagnose_trait_expr (tree expr, tree args)
 case CPTK_IS_UNION:
   inform (loc, "  %qT is not a union", t1);
   break;
+case CPTK_IS_SCALAR:
+  inform (loc, "  %qT is not a scalar type", t1);
+  break;
 case CPTK_IS_AGGREGATE:
   inform (loc, "  %qT is not an aggregate", t1);
   break;
diff --git a/gcc/cp/cp-trait.def b/gcc/cp/cp-trait.def
index 8b7fece0cc8..59ae087c457 100644
--- a/gcc/cp/cp-trait.def
+++ b/gcc/cp/cp-trait.def
@@ -82,6 +82,7 @@ DEFTRAIT_EXPR (IS_TRIVIALLY_ASSIGNABLE, 
"__is_trivially_assignable", 2)
 DEFTRAIT_EXPR (IS_TRIVIALLY_CONSTRUCTIBLE, "__is_trivially_constructible", -1)
 DEFTRAIT_EXPR (IS_TRIVIALLY_COPYABLE, "__is_trivially_copyable", 1)
 DEFTRAIT_EXPR (IS_UNION, "__is_union", 1)
+DEFTRAIT_EXPR (IS_SCALAR, "__is_scalar", 1)
 DEFTRAIT_EXPR (REF_CONSTRUCTS_FROM_TEMPORARY, 
"__reference_constructs_from_temporary", 2)
 DEFTRAIT_EXPR (REF_CONVERTS_FROM_TEMPORARY, 
"__reference_converts_from_temporary", 2)
 /* FIXME Added space to avoid direct usage in GCC 13.  */
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 8fb47fd179e..3edc7f23212 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -12118,6 +12118,9 @@ trait_expr_value (cp_trait_kind kind, tree type1, tree 
type2)
 case CPTK_IS_UNION:
   return type_code1 == UNION_TYPE;
 
+case CPTK_IS_SCALAR:
+  return SCALAR_TYPE_P (type1);
+
 case CPTK_IS_ASSIGNABLE:
   return is_xible (MODIFY_EXPR, type1, type2);
 
@@ -12296,6 +12299,7 @@ finish_trait_expr (location_t loc, cp_trait_kind kind, 
tree type1, tree type2)
 case CPTK_IS_ENUM:
 case CPTK_IS_UNION:
 case CPTK_IS_SAME:
+case CPTK_IS_SCALAR:
   break;
 
 case CPTK_IS_LAYOUT_COMPATIBLE:
diff --git a/gcc/testsuite/g++.dg/ext/has-builtin-1.C 
b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
index f343e153e56..75acbdfb9fc 100644
--- a/gcc/testsuite/g++.dg/ext/has-builtin-1.C
+++ b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
@@ -146,3 +146,6 @@
 #if !__has_builtin (__remove_cvref)
 # error "__has_builtin (__remove_cvref) failed"
 #endif
+#if !__has_builtin (__is_scalar)
+# error "__has_builtin (__is_scalar) failed"
+#endif
diff --git a/gcc/testsuite/g++.dg/ext/is_scalar.C 
b/gcc/testsuite/g++.dg/ext/is_scalar.C
new file mode 100644
index 000..457fddc52fc
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_scalar.C
@@ -0,0 +1,31 @@
+// { dg-do compile { target c++11 } }
+
+#include   // std::nullptr_t
+#include 
+
+using namespace __gnu_test;
+
+#define SA(X) static_assert((X),#X)
+
+#define SA_TEST_CATEGORY(TRAIT, TYPE, EXPECT)  \
+  SA(TRAIT(TYPE) == EXPECT);   \
+  SA(TRAIT(const TYPE) == EXPECT); \
+  SA(TRAIT(volatile TYPE) == EXPECT);  \
+  SA(TRAIT(const volatile TYPE) == EXPECT)
+
+// volatile return type would cause a warning.
+#define SA_FN_TEST_CATEGORY(TRAIT, TYPE, EXPECT)   \
+  SA(TRAIT(TYPE) == EXPECT);   \
+  SA(TRAIT(const TYPE) == EXPECT)
+
+SA_TEST_CATEGORY(__is_scalar, int, true);
+SA_TEST_CATEGORY(__is_scalar, float, true);
+SA_TEST_CATEGORY(__is_scalar, EnumType, true);
+SA_TEST_CATEGORY(__is_scalar, int*, true);

Re: [PATCH 2/2] ifcvt: Allow more operations in multiple set if conversion

2023-07-03 Thread Robin Dapp via Gcc-patches

Hi Manolis,

that looks like a nice enhancement of what's already possible.  The concern
I had some years back already was that this function would eventually
grow and cannibalize on some of what the other functions in ifcvt already
do :)  At some point we really should unify but that's not within the
scope of this patch.

IMHO we're already pretty far towards general "conditional execution"
with conditional increments, selects and so on (and the function is still
called "_noce") and historically the cond_exec functions would have
taken care of that.  To my knowledge though, none of the major backends
implements anything like (cond_exec ...) anymore and relies on bit-twiddling
tricks to generate the conditional instructions.

Have you checked whether cond_exec and others could be adjusted to
handle the conditional instructions you want to see?  They don't perform
full cost comparison though but just count.

I would expect a bit of discussion around that but from a first look
I don't have major concerns.

> -/* Return true iff basic block TEST_BB is comprised of only
> -   (SET (REG) (REG)) insns suitable for conversion to a series
> -   of conditional moves.  Also check that we have more than one set
> -   (other routines can handle a single set better than we would), and
> -   fewer than PARAM_MAX_RTL_IF_CONVERSION_INSNS sets.  While going
> +/* Return true iff basic block TEST_BB is suitable for conversion to a
> +   series of conditional moves.  Also check that we have more than one

Might want to change the "conditional moves" while you're at it.

>  
> -  if (!((REG_P (src) || CONSTANT_P (src))
> - || (GET_CODE (src) == SUBREG && REG_P (SUBREG_REG (src))
> -   && subreg_lowpart_p (src
> +  /* Allow a wide range of operations and let the costing function decide
> +  if the conversion is worth it later.  */
> +  enum rtx_code code = GET_CODE (src);
> +  if (!(CONSTANT_P (src)
> + || code == REG
> + || code == SUBREG
> + || code == ZERO_EXTEND
> + || code == SIGN_EXTEND
> + || code == NOT
> + || code == NEG
> + || code == PLUS
> + || code == MINUS
> + || code == AND
> + || code == IOR
> + || code == MULT
> + || code == ASHIFT
> + || code == ASHIFTRT
> + || code == NE
> + || code == EQ
> + || code == GE
> + || code == GT
> + || code == LE
> + || code == LT
> + || code == GEU
> + || code == GTU
> + || code == LEU
> + || code == LTU
> + || code == COMPARE))

We're potentially checking many more patterns than before.  Maybe it
would make sense to ask the backend whether it has a pattern for
the respective code?

Regards
 Robin

[PATCH 1/2] c++: implement __is_scalar built-in trait

2023-07-03 Thread Ken Matsui via Gcc-patches

This patch implements a built-in trait for std::is_scalar. The existing
uses of __is_scalar were renamed to is_scalar to avoid unintentional
macro replacement by the new built-in.

gcc/cp/ChangeLog:

* cp-trait.def: Define __is_scalar.
* constraint.cc (diagnose_trait_expr): Handle CPTK_IS_SCALAR.
* semantics.cc (trait_expr_value): Likewise.
(finish_trait_expr): Likewise.

gcc/testsuite/ChangeLog:

* g++.dg/ext/has-builtin-1.C: Test existence of __is_scalar.
* g++.dg/ext/is_scalar.C: New test.
* g++.dg/tm/pr46567.C: Use is_scalar instead.
* g++.dg/torture/pr57107.C: Likewise.

libstdc++-v3/ChangeLog:

* include/bits/cpp_type_traits.h (__is_scalar): Rename to ...
(is_scalar): ... this.
* include/bits/stl_algobase.h: Use is_scalar instead.
* include/bits/valarray_array.h: Likewise.

Signed-off-by: Ken Matsui 
---
 gcc/cp/constraint.cc|  3 ++
 gcc/cp/cp-trait.def |  1 +
 gcc/cp/semantics.cc |  4 +++
 gcc/testsuite/g++.dg/ext/has-builtin-1.C|  3 ++
 gcc/testsuite/g++.dg/ext/is_scalar.C| 31 +
 gcc/testsuite/g++.dg/tm/pr46567.C   | 10 +++
 gcc/testsuite/g++.dg/torture/pr57107.C  |  4 +--
 libstdc++-v3/include/bits/cpp_type_traits.h |  2 +-
 libstdc++-v3/include/bits/stl_algobase.h|  8 +++---
 libstdc++-v3/include/bits/valarray_array.h  |  2 +-
 10 files changed, 55 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/ext/is_scalar.C

diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index 8cf0f2d0974..4c27f2a3a62 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -3751,6 +3751,9 @@ diagnose_trait_expr (tree expr, tree args)
 case CPTK_IS_UNION:
   inform (loc, "  %qT is not a union", t1);
   break;
+case CPTK_IS_SCALAR:
+  inform (loc, "  %qT is not a scalar type", t1);
+  break;
 case CPTK_IS_AGGREGATE:
   inform (loc, "  %qT is not an aggregate", t1);
   break;
diff --git a/gcc/cp/cp-trait.def b/gcc/cp/cp-trait.def
index 8b7fece0cc8..59ae087c457 100644
--- a/gcc/cp/cp-trait.def
+++ b/gcc/cp/cp-trait.def
@@ -82,6 +82,7 @@ DEFTRAIT_EXPR (IS_TRIVIALLY_ASSIGNABLE, 
"__is_trivially_assignable", 2)
 DEFTRAIT_EXPR (IS_TRIVIALLY_CONSTRUCTIBLE, "__is_trivially_constructible", -1)
 DEFTRAIT_EXPR (IS_TRIVIALLY_COPYABLE, "__is_trivially_copyable", 1)
 DEFTRAIT_EXPR (IS_UNION, "__is_union", 1)
+DEFTRAIT_EXPR (IS_SCALAR, "__is_scalar", 1)
 DEFTRAIT_EXPR (REF_CONSTRUCTS_FROM_TEMPORARY, 
"__reference_constructs_from_temporary", 2)
 DEFTRAIT_EXPR (REF_CONVERTS_FROM_TEMPORARY, 
"__reference_converts_from_temporary", 2)
 /* FIXME Added space to avoid direct usage in GCC 13.  */
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 8fb47fd179e..3edc7f23212 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -12118,6 +12118,9 @@ trait_expr_value (cp_trait_kind kind, tree type1, tree 
type2)
 case CPTK_IS_UNION:
   return type_code1 == UNION_TYPE;
 
+case CPTK_IS_SCALAR:
+  return SCALAR_TYPE_P (type1);
+
 case CPTK_IS_ASSIGNABLE:
   return is_xible (MODIFY_EXPR, type1, type2);
 
@@ -12296,6 +12299,7 @@ finish_trait_expr (location_t loc, cp_trait_kind kind, 
tree type1, tree type2)
 case CPTK_IS_ENUM:
 case CPTK_IS_UNION:
 case CPTK_IS_SAME:
+case CPTK_IS_SCALAR:
   break;
 
 case CPTK_IS_LAYOUT_COMPATIBLE:
diff --git a/gcc/testsuite/g++.dg/ext/has-builtin-1.C 
b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
index f343e153e56..75acbdfb9fc 100644
--- a/gcc/testsuite/g++.dg/ext/has-builtin-1.C
+++ b/gcc/testsuite/g++.dg/ext/has-builtin-1.C
@@ -146,3 +146,6 @@
 #if !__has_builtin (__remove_cvref)
 # error "__has_builtin (__remove_cvref) failed"
 #endif
+#if !__has_builtin (__is_scalar)
+# error "__has_builtin (__is_scalar) failed"
+#endif
diff --git a/gcc/testsuite/g++.dg/ext/is_scalar.C 
b/gcc/testsuite/g++.dg/ext/is_scalar.C
new file mode 100644
index 000..457fddc52fc
--- /dev/null
+++ b/gcc/testsuite/g++.dg/ext/is_scalar.C
@@ -0,0 +1,31 @@
+// { dg-do compile { target c++11 } }
+
+#include   // std::nullptr_t
+#include 
+
+using namespace __gnu_test;
+
+#define SA(X) static_assert((X),#X)
+
+#define SA_TEST_CATEGORY(TRAIT, TYPE, EXPECT)  \
+  SA(TRAIT(TYPE) == EXPECT);   \
+  SA(TRAIT(const TYPE) == EXPECT); \
+  SA(TRAIT(volatile TYPE) == EXPECT);  \
+  SA(TRAIT(const volatile TYPE) == EXPECT)
+
+// volatile return type would cause a warning.
+#define SA_FN_TEST_CATEGORY(TRAIT, TYPE, EXPECT)   \
+  SA(TRAIT(TYPE) == EXPECT);   \
+  SA(TRAIT(const TYPE) == EXPECT)
+
+SA_TEST_CATEGORY(__is_scalar, int, true);
+SA_TEST_CATEGORY(__is_scalar, float, true);
+SA_TEST_CATEGORY(__is_scalar, EnumType, true);
+SA_TEST_CATEGORY(__is_scalar, int*, true);

Re: Re: [PATCH] RISC-V: Support vfwnmacc/vfwmsac/vfwnmsac combine lowering

2023-07-03 Thread juzhe.zh...@rivai.ai

Thanks kito. 
Lehua will merge it for me.

juzhe.zh...@rivai.ai

From: Kito Cheng
Date: 2023-07-03 17:01
To: Robin Dapp
CC: juzhe.zh...@rivai.ai; jeffreyalaw; gcc-patches; Kito.cheng; palmer; palmer
Subject: Re: [PATCH] RISC-V: Support vfwnmacc/vfwmsac/vfwnmsac combine lowering
I tried this locally: widen-complicate-7.c, widen-complicate-8.c and
widen-complicate-9.c need those bridge patterns, otherwise they fail to
combine, so this is an explicit LGTM from my side.

On Mon, Jul 3, 2023 at 3:48 PM Robin Dapp via Gcc-patches
 wrote:
>
> To reiterate, this is OK from my side.  As discussed in the other
> thread, Jeff would like to have more info on whether a bridge pattern
> is needed at all and I agreed to get back to it in a while.  Until
> then, we can merge this.
>
> Regards
>  Robin
>

[PATCH V2] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread juzhe . zhong

From: Ju-Zhe Zhong 

Hi, Richard. I fixed the order as you suggested.

Before this patch, the order is {len,mask,bias}.

Now, after this patch, the order becomes {len,bias,mask}.

Since you said we should not need 'internal_fn_bias_index', the bias index 
should always be the len index + 1.
I noticed that the LEN_STORE order is {len,vector,bias}; to make them 
consistent, I reordered it into LEN_STORE {len,bias,vector},
just like MASK_STORE {mask,vector}.

Ok for trunk ?

gcc/ChangeLog:

* config/riscv/autovec.md: Change order of 
LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
* config/riscv/riscv-v.cc (expand_load_store): Ditto.
* doc/md.texi: Ditto.
* gimple-fold.cc (gimple_fold_partial_load_store_mem_ref): Ditto.
* internal-fn.cc (len_maskload_direct): Ditto.
(len_maskstore_direct): Ditto.
(add_len_and_mask_args): New function.
(expand_partial_load_optab_fn): Change order of 
LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
(expand_partial_store_optab_fn): Ditto.
(internal_fn_len_index): New function.
(internal_fn_mask_index): Change order of 
LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
(internal_fn_stored_value_index): Ditto.
(internal_len_load_store_bias): Ditto.
* internal-fn.h (internal_fn_len_index): New function.
* tree-ssa-dse.cc (initialize_ao_ref_for_dse): Change order of 
LEN_MASK_LOAD/LEN_MASK_STORE/LEN_LOAD/LEN_STORE arguments.
* tree-vect-stmts.cc (vectorizable_store): Ditto.
(vectorizable_load): Ditto.

---
 gcc/config/riscv/autovec.md |   8 +-
 gcc/config/riscv/riscv-v.cc |   2 +-
 gcc/doc/md.texi |  16 ++--
 gcc/gimple-fold.cc  |   8 +-
 gcc/internal-fn.cc  | 156 ++--
 gcc/internal-fn.h   |   1 +
 gcc/tree-ssa-dse.cc |  11 +--
 gcc/tree-vect-stmts.cc  |  11 +--
 8 files changed, 107 insertions(+), 106 deletions(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 1488f2be1be..4ab0e9f99eb 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -26,8 +26,8 @@
   [(match_operand:V 0 "register_operand")
(match_operand:V 1 "memory_operand")
(match_operand 2 "autovec_length_operand")
-   (match_operand: 3 "vector_mask_operand")
-   (match_operand 4 "const_0_operand")]
+   (match_operand 3 "const_0_operand")
+   (match_operand: 4 "vector_mask_operand")]
   "TARGET_VECTOR"
 {
   riscv_vector::expand_load_store (operands, true);
@@ -38,8 +38,8 @@
   [(match_operand:V 0 "memory_operand")
(match_operand:V 1 "register_operand")
(match_operand 2 "autovec_length_operand")
-   (match_operand: 3 "vector_mask_operand")
-   (match_operand 4 "const_0_operand")]
+   (match_operand 3 "const_0_operand")
+   (match_operand: 4 "vector_mask_operand")]
   "TARGET_VECTOR"
 {
   riscv_vector::expand_load_store (operands, false);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index adb8d7d36a5..8d5bed7ebe4 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -2777,7 +2777,7 @@ expand_load_store (rtx *ops, bool is_load)
 {
   poly_int64 value;
   rtx len = ops[2];
-  rtx mask = ops[3];
+  rtx mask = ops[4];
   machine_mode mode = GET_MODE (ops[0]);
 
   if (poly_int_rtx_p (len, ) && known_eq (value, GET_MODE_NUNITS (mode)))
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index cefdee84821..5e5482265cd 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5302,15 +5302,15 @@ This pattern is not allowed to @code{FAIL}.
 @cindex @code{len_maskload@var{m}@var{n}} instruction pattern
 @item @samp{len_maskload@var{m}@var{n}}
 Perform a masked load from the memory location pointed to by operand 1
-into register operand 0.  (operand 2 + operand 4) elements are loaded from
+into register operand 0.  (operand 2 + operand 3) elements are loaded from
 memory and other elements in operand 0 are set to undefined values.
 This is a combination of len_load and maskload.
 Operands 0 and 1 have mode @var{m}, which must be a vector mode.  Operand 2
 has whichever integer mode the target prefers.  A mask is specified in
-operand 3 which must be of type @var{n}.  The mask has lower precedence than
+operand 4 which must be of type @var{n}.  The mask has lower precedence than
 the length and is itself subject to length masking,
-i.e. only mask indices < (operand 2 + operand 4) are used.
-Operand 4 conceptually has mode @code{QI}.
+i.e. only mask indices < (operand 2 + operand 3) are used.
+Operand 3 conceptually has mode @code{QI}.
 
 Operand 2 can be a variable or a constant amount.  Operand 4 specifies a
 constant bias: it is either a constant 0 or a constant -1.  The predicate on
@@ -5329,14 +5329,14 @@ This pattern is not allowed to @code{FAIL}.
 @cindex @code{len_maskstore@var{m}@var{n}} instruction pattern
 @item @samp{len_maskstore@var{m}@var{n}}
 Perform a masked store from vector

[PATCH] aarch64: Fix vector-to-vector vec_extract

2023-07-03 Thread Richard Sandiford via Gcc-patches

The documentation says:

-
@cindex @code{vec_extract@var{m}@var{n}} instruction pattern
@item @samp{vec_extract@var{m}@var{n}}
Extract given field from the vector value.  [...]  The
@var{n} mode is the mode of the field or vector of fields that should be
extracted, [...]
If @var{n} is a vector mode, the index is counted in units of that mode.
-

However, Robin pointed out that, in practice, the index is counted
in whole multiples of @var{n}.  These are the semantics that x86
and target-independent code follow.

This patch updates the aarch64 pattern to match, which also removes
the FAIL.  I think Robin has patches that update the documentation
and make more use of the de facto semantics.

I haven't found an existing testcase that shows the difference.
We do now use the pattern for:

union u { int32x4_t x; int32x2_t y[2]; };
int32x2_t f(int32x4_t x) { union u u = { x }; return u.y[1]; }

but we were already generating perfect code for it.  Because of that,
it didn't really seem worth adding a specific dump test.

Tested on aarch64-linux-gnu & pushed.

Richard

gcc/
* config/aarch64/aarch64-simd.md (vec_extract): Expect
the index to be 0 or 1.
---
 gcc/config/aarch64/aarch64-simd.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index f1687d92eb2..d9539410147 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -8755,8 +8755,8 @@ (define_expand "vec_extract"
   "TARGET_SIMD"
 {
   int start = INTVAL (operands[2]);
-  if (start != 0 && start != <nunits> / 2)
-FAIL;
+  gcc_assert (start == 0 || start == 1);
+  start *= <nunits> / 2;
   rtx sel = aarch64_gen_stepped_int_parallel (<nunits> / 2, start, 1);
   emit_insn (gen_aarch64_get_half (operands[0], operands[1], sel));
   DONE;
-- 
2.25.1

Re: [PATCH] RISC-V: Support vfwnmacc/vfwmsac/vfwnmsac combine lowering

2023-07-03 Thread Kito Cheng via Gcc-patches

Tried locally: widen-complicate-7.c, widen-complicate-8.c and
widen-complicate-9.c need those bridge patterns, otherwise they will fail to
combine, so here is an explicit LGTM from my side.

On Mon, Jul 3, 2023 at 3:48 PM Robin Dapp via Gcc-patches
 wrote:
>
> To reiterate, this is OK from my side.  As discussed in the other
> thread, Jeff would like to have more info on whether a bridge pattern
> is needed at all and I agreed to get back to it in a while.  Until
> then, we can merge this.
>
> Regards
>  Robin
>

Re: Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering

2023-07-03 Thread juzhe.zh...@rivai.ai

OK. Thanks. Will commit with your cleanup patch.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-03 16:49
To: juzhe.zh...@rivai.ai
CC: rdapp.gcc; jeffreyalaw; gcc-patches; kito.cheng; Kito.cheng; palmer; palmer
Subject: Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering
On 7/3/23 10:45, juzhe.zh...@rivai.ai wrote:
> We can apply it but not sure why the patchwork shows it's rejected.
 
I believe it also failed for me locally because the order of
patterns in autovec-opt.md was somehow different.  The one attached
worked for me though after some minor merge adjustments on my branch.
 
Regards
Robin
 
From 29b12a473a31b2caa64fa2d1d97920a460ced0a2 Mon Sep 17 00:00:00 2001
From: Juzhe-Zhong 
Date: Wed, 28 Jun 2023 12:15:12 +0800
Subject: [PATCH] RISC-V: Support vfwmul.vv combine lowering
 
Consider the following complicated case:
#define TEST_TYPE(TYPE1, TYPE2)\
  __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 ( \
TYPE1 *__restrict dst, TYPE1 *__restrict dst2, TYPE1 *__restrict dst3, \
TYPE1 *__restrict dst4, TYPE2 *__restrict a, TYPE2 *__restrict b,  \
TYPE2 *__restrict a2, TYPE2 *__restrict b2, int n) \
  {\
for (int i = 0; i < n; i++)\
  {\
dst[i] = (TYPE1) a[i] * (TYPE1) b[i];  \
dst2[i] = (TYPE1) a2[i] * (TYPE1) b[i];\
dst3[i] = (TYPE1) a2[i] * (TYPE1) a[i];\
dst4[i] = (TYPE1) a[i] * (TYPE1) b2[i];\
  }\
  }
 
TEST_TYPE (double, float)
 
In such a complicated situation, the combine pass cannot combine the extensions
of both operands at once.
So the combine pass will first try to combine one of the extensions, and
then combine
the other. The combine flow is as follows:
 
Original IR:
(set (reg 0) (float_extend: (reg 1))
(set (reg 3) (float_extend: (reg 2))
(set (reg 4) (mult: (reg 0) (reg 3))
 
First step of combine:
(set (reg 3) (float_extend: (reg 2))
(set (reg 4) (mult: (float_extend: (reg 1) (reg 3))
 
Second step of combine:
(set (reg 4) (mult: (float_extend: (reg 1) (float_extend: (reg 2))
 
So, to enhance the combine optimization, we add a "pseudo vfwmul.wv" RTL
pattern in autovec-opt.md
which is (set (reg 0) (mult (float_extend (reg 1)) (reg 2))).
 
gcc/ChangeLog:
 
* config/riscv/autovec-opt.md 
(@pred_single_widen_mul): Change "@" into "*" in pattern 
name which simplifies build files.
(*pred_single_widen_mul): Ditto.
(*pred_single_widen_mul): New pattern.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/widen/widen-3.c: Add floating-point.
* gcc.target/riscv/rvv/autovec/widen/widen-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen-complicate-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen_run-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen_run-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-3.c: New test.
* gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-7.c: New test.
---
gcc/config/riscv/autovec-opt.md   | 39 +++
.../riscv/rvv/autovec/widen/widen-3.c |  7 +++-
.../riscv/rvv/autovec/widen/widen-7.c |  7 +++-
.../rvv/autovec/widen/widen-complicate-3.c|  7 +++-
.../riscv/rvv/autovec/widen/widen_run-3.c |  5 ++-
.../riscv/rvv/autovec/widen/widen_run-7.c |  5 ++-
.../rvv/autovec/widen/widen_run_zvfh-3.c  | 28 +
.../rvv/autovec/widen/widen_run_zvfh-7.c  | 28 +
8 files changed, 116 insertions(+), 10 deletions(-)
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-3.c
create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-7.c
 
diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index fd9cd27f50a..99b609a99d9 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -406,6 +406,45 @@ (define_insn "*pred_extract_first_sextsi"
   [(set_attr "type" "vimovvx")
(set_attr "mode" "")])
+;; We don't have vfwmul.wv instruction like vfwadd.wv in RVV.
+;; This pattern is an intermediate RTL IR as a pseudo vfwmul.wv to enhance
+;; optimization of instructions combine.
+(define_insn_and_split "*pred_single_widen_mul"
+  [(set (match_operand:VWEXTF 0 "register_operand"  "=,  
")
+   (if_then_else:VWEXTF
+ (unspec:
+   [(match_operand: 1 "vector_mask_operand"   
"vmWc1,vmWc1")
+(match_operand 5 "vector_length_operand"  "   rK,   
rK")
+(match_operand 6 "const_int_operand"

Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering

2023-07-03 Thread Robin Dapp via Gcc-patches

On 7/3/23 10:45, juzhe.zh...@rivai.ai wrote:
> We can apply it but not sure why the patchwork shows it's rejected.

I believe it also failed for me locally because the order of
patterns in autovec-opt.md was somehow different.  The one attached
worked for me though after some minor merge adjustments on my branch.

Regards
 Robin

>From 29b12a473a31b2caa64fa2d1d97920a460ced0a2 Mon Sep 17 00:00:00 2001
From: Juzhe-Zhong 
Date: Wed, 28 Jun 2023 12:15:12 +0800
Subject: [PATCH] RISC-V: Support vfwmul.vv combine lowering

Consider the following complicated case:
#define TEST_TYPE(TYPE1, TYPE2)\
  __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 ( \
TYPE1 *__restrict dst, TYPE1 *__restrict dst2, TYPE1 *__restrict dst3, \
TYPE1 *__restrict dst4, TYPE2 *__restrict a, TYPE2 *__restrict b,  \
TYPE2 *__restrict a2, TYPE2 *__restrict b2, int n) \
  {\
for (int i = 0; i < n; i++)\
  {\
dst[i] = (TYPE1) a[i] * (TYPE1) b[i];  \
dst2[i] = (TYPE1) a2[i] * (TYPE1) b[i];\
dst3[i] = (TYPE1) a2[i] * (TYPE1) a[i];\
dst4[i] = (TYPE1) a[i] * (TYPE1) b2[i];\
  }\
  }

TEST_TYPE (double, float)

In such a complicated situation, the combine pass cannot combine the extensions
of both operands at once.
So the combine pass will first try to combine one of the extensions, and
then combine
the other. The combine flow is as follows:

Original IR:
(set (reg 0) (float_extend: (reg 1))
(set (reg 3) (float_extend: (reg 2))
(set (reg 4) (mult: (reg 0) (reg 3))

First step of combine:
(set (reg 3) (float_extend: (reg 2))
(set (reg 4) (mult: (float_extend: (reg 1) (reg 3))

Second step of combine:
(set (reg 4) (mult: (float_extend: (reg 1) (float_extend: (reg 2))

So, to enhance the combine optimization, we add a "pseudo vfwmul.wv" RTL
pattern in autovec-opt.md
which is (set (reg 0) (mult (float_extend (reg 1)) (reg 2))).

gcc/ChangeLog:

* config/riscv/autovec-opt.md 
(@pred_single_widen_mul): Change "@" into "*" in pattern 
name which simplifies build files.
(*pred_single_widen_mul): Ditto.
(*pred_single_widen_mul): New pattern.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/widen/widen-3.c: Add floating-point.
* gcc.target/riscv/rvv/autovec/widen/widen-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen-complicate-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen_run-3.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen_run-7.c: Ditto.
* gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-3.c: New test.
* gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-7.c: New test.
---
 gcc/config/riscv/autovec-opt.md   | 39 +++
 .../riscv/rvv/autovec/widen/widen-3.c |  7 +++-
 .../riscv/rvv/autovec/widen/widen-7.c |  7 +++-
 .../rvv/autovec/widen/widen-complicate-3.c|  7 +++-
 .../riscv/rvv/autovec/widen/widen_run-3.c |  5 ++-
 .../riscv/rvv/autovec/widen/widen_run-7.c |  5 ++-
 .../rvv/autovec/widen/widen_run_zvfh-3.c  | 28 +
 .../rvv/autovec/widen/widen_run_zvfh-7.c  | 28 +
 8 files changed, 116 insertions(+), 10 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-3.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run_zvfh-7.c

diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index fd9cd27f50a..99b609a99d9 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -406,6 +406,45 @@ (define_insn "*pred_extract_first_sextsi"
   [(set_attr "type" "vimovvx")
(set_attr "mode" "")])
 
+;; We don't have vfwmul.wv instruction like vfwadd.wv in RVV.
+;; This pattern is an intermediate RTL IR as a pseudo vfwmul.wv to enhance
+;; optimization of instructions combine.
+(define_insn_and_split "*pred_single_widen_mul"
+  [(set (match_operand:VWEXTF 0 "register_operand"  "=,  
")
+   (if_then_else:VWEXTF
+ (unspec:
+   [(match_operand: 1 "vector_mask_operand"   
"vmWc1,vmWc1")
+(match_operand 5 "vector_length_operand"  "   rK,   
rK")
+(match_operand 6 "const_int_operand"  "i,
i")
+(match_operand 7 "const_int_operand"  "i,
i")
+(match_operand 8 "const_int_operand"  "i,
i")
+(match_operand 9 "const_int_operand"

Re: [PATCH 0/9] vect: Move costing next to the transform for vect load

2023-07-03 Thread Richard Biener via Gcc-patches

On Mon, Jul 3, 2023 at 5:39 AM Kewen.Lin  wrote:
>
> Hi Richi,
>
> Thanks for your review comments on this and some others!
>
> on 2023/6/30 19:37, Richard Biener wrote:
> > On Tue, Jun 13, 2023 at 4:07 AM Kewen Lin  wrote:
> >>
> >> This patch series follows Richi's suggestion at the link [1],
> > >> which suggests structuring vectorizable_load to make costing
> >> next to the transform, in order to make it easier to keep
> >> costing and the transform in sync.  For now, it's a known
> >> issue that what we cost can be inconsistent with what we
> >> transform, as the case in PR82255 and some other associated
> >> test cases in the patches of this series show.
> >>
> >> Basically this patch series makes costing not call function
> >> vect_model_load_cost any more.  To make the review and
> >> bisection easy, I organized the changes according to the
> >> memory access types of vector load.  For each memory access
> >> type, firstly it follows the handlings in the function
> > >> vect_model_load_cost to avoid missing anything, then refines
> >> further by referring to the transform code, I also checked
> >> them with some typical test cases to verify.  Hope the
> >> subjects of patches are clear enough.
> >>
> >> The whole series can be bootstrapped and regtested
> >> incrementally on:
> >>   - x86_64-redhat-linux
> >>   - aarch64-linux-gnu
> >>   - powerpc64-linux-gnu P7, P8 and P9
> >>   - powerpc64le-linux-gnu P8, P9 and P10
> >>
> >> By considering the current vector test buckets are mainly
> >> tested without cost model, I also verified the whole patch
> >> series was neutral for SPEC2017 int/fp on Power9 at O2,
> >> O3 and Ofast separately.
> >
> > I went through the series now and I like it overall (well, I suggested
> > the change).
> > Looking at the changes I think we want some followup to reduce the
> > mess in the final loop nest.  We already have some VMAT_* cases handled
> > separately, maybe we can split out some more cases.  Maybe we should
>
> At first glance, the simple parts look to be the handlings for
> VMAT_LOAD_STORE_LANES, and VMAT_GATHER_SCATTER (with ifn and emulated).
> It seems a bit straightforward if it's fine to duplicate the nested loop,
> but may need to care about removing some useless code.
>
> > bite the bullet and duplicate that loop nest for the different VMAT_* cases.
> > Maybe we can merge some of the if (!costing_p) checks by clever
> > re-ordering.
>
> I've tried a bit to merge them if possible, like the place to check
> VMAT_CONTIGUOUS, VMAT_CONTIGUOUS_REVERSE and VMAT_CONTIGUOUS_PERMUTE.
> But will keep in mind for the following updates.
>
> > So what
> > this series doesn't improve is overall readability of the code (indent and 
> > our
> > 80 char line limit).
>
> Sorry about that.
>
> >
> > The change also makes it more difficult(?) to separate analysis and 
> > transform
> > though in the end I hope that analysis will actually "code generate" to a 
> > (SLP)
> > data structure so the target will have a chance to see the actual flow of 
> > insns.
> >
> > That said, I'd like to hear from Richard whether he thinks this is a step
> > in the right direction.
> >
> > Are you willing to followup with doing the same re-structuring to
> > vectorizable_store?
>
> Yes, vectorizable_store was also pointed out in your original suggestion [1],
> I planned to update this once this series meets your expectations and gets 
> landed.
>
> >
> > OK from my side with the few comments addressed.  The patch likely needs 
> > refresh
> > after the RVV changes in this area?
>
> Thanks!  Yes, I've updated 2/9 and 3/9 according to your comments, and updated
> 5/9 and 9/9 as they had some conflicts when rebasing.  Re-testing is ongoing,
> do the updated versions look good to you?  Is this series ok for trunk if all 
> the
> test runs go well again as before?

Yes.

Thanks,
Richard.

> BR,
> Kewen

Re: Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering

2023-07-03 Thread juzhe.zh...@rivai.ai

We can apply it but not sure why the patchwork shows it's rejected.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-03 16:44
To: juzhe.zh...@rivai.ai
CC: rdapp.gcc; jeffreyalaw; gcc-patches; kito.cheng; Kito.cheng; palmer; palmer
Subject: Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering
> We failed to merge it since it's been rejected.
> https://patchwork.sourceware.org/project/gcc/patch/20230628041512.188243-1-juzhe.zh...@rivai.ai/
>  
> 
>  
 
Err, who rejected?  Or is this about the patch itself
that doesn't apply cleanly anymore?
 
Regards
Robin

Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering

2023-07-03 Thread Robin Dapp via Gcc-patches

> We failed to merge it since it's been rejected.
> https://patchwork.sourceware.org/project/gcc/patch/20230628041512.188243-1-juzhe.zh...@rivai.ai/
>  
> 
>  

Err, who rejected?  Or is this about the patch itself
that doesn't apply cleanly anymore?

Regards
 Robin

Re: Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering

2023-07-03 Thread juzhe.zh...@rivai.ai

We failed to merge it since it's been rejected.
https://patchwork.sourceware.org/project/gcc/patch/20230628041512.188243-1-juzhe.zh...@rivai.ai/
 




juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-07-03 15:49
To: juzhe.zhong
CC: rdapp.gcc; Jeff Law; gcc-patches; kito.cheng; kito.cheng; palmer; palmer
Subject: Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering
> Thanks. Ok for trunk?
 
OK from my side.  As agreed with Jeff, I'm going to get back to this
and revisit/change if needed in the future.
 
Regards
Robin

RE: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Li, Pan2 via Gcc-patches

Committed, thanks Robin and Juzhe.

Pan

-Original Message-
From: Robin Dapp  
Sent: Monday, July 3, 2023 4:15 PM
To: juzhe.zh...@rivai.ai; Li, Pan2 ; gcc-patches 

Cc: rdapp@gmail.com; jeffreyalaw ; Wang, Yanzhang 
; kito.cheng 
Subject: Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition


> Thanks for fixing it. LGTM.
> I think you can merge it when Robin is ok since this is a simple typo
> fix.

Yes, that's definitely simple enough :)

Regards
 Robin

Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Robin Dapp via Gcc-patches



> Thanks for fixing it. LGTM.
> I think you can merge it when Robin is ok since this is a simple typo
> fix.

Yes, that's definitely simple enough :)

Regards
 Robin

Re: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread juzhe.zh...@rivai.ai

Thanks for fixing it. LGTM.
I think you can merge it when Robin is ok since this is a simple typo fix.

Thanks.


juzhe.zh...@rivai.ai
 
From: pan2.li
Date: 2023-07-03 16:08
To: gcc-patches
CC: juzhe.zhong; jeffreyalaw; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Fix one typo of FRM dynamic definition
From: Pan Li 
 
This patch would like to fix one typo that takes rdn instead of dyn by
mistake.
 
Signed-off-by: Pan Li 
 
gcc/ChangeLog:
 
* config/riscv/vector.md: Fix typo.
---
gcc/config/riscv/vector.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
 
diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index a6174f9483e..2864475b35a 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -491,8 +491,8 @@ (define_attr "frm_mode" "rne,rtz,rdn,rup,rmm,dyn,none"
  (match_test "INTVAL (operands[9]) == riscv_vector::FRM_RMM")
  (const_string "rmm")
-   (match_test "INTVAL (operands[9]) == riscv_vector::FRM_RDN")
-   (const_string "rdn")
+   (match_test "INTVAL (operands[9]) == riscv_vector::FRM_DYN")
+   (const_string "dyn")
]
(const_string "none")
   )
-- 
2.34.1

[PATCH v1] RISC-V: Fix one typo of FRM dynamic definition

2023-07-03 Thread Pan Li via Gcc-patches

From: Pan Li 

This patch would like to fix one typo that takes rdn instead of dyn by
mistake.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/vector.md: Fix typo.
---
 gcc/config/riscv/vector.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index a6174f9483e..2864475b35a 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -491,8 +491,8 @@ (define_attr "frm_mode" "rne,rtz,rdn,rup,rmm,dyn,none"
  (match_test "INTVAL (operands[9]) == riscv_vector::FRM_RMM")
  (const_string "rmm")
 
- (match_test "INTVAL (operands[9]) == riscv_vector::FRM_RDN")
- (const_string "rdn")
+ (match_test "INTVAL (operands[9]) == riscv_vector::FRM_DYN")
+ (const_string "dyn")
]
(const_string "none")
   )
-- 
2.34.1

Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

2023-07-03 Thread Richard Biener via Gcc-patches

On Mon, Jul 3, 2023 at 8:50 AM Tejas Belagod  wrote:
>
> On 6/29/23 6:55 PM, Richard Biener wrote:
> > On Wed, Jun 28, 2023 at 1:26 PM Tejas Belagod  wrote:
> >>
> >>
> >>
> >>
> >>
> >> From: Richard Biener 
> >> Date: Tuesday, June 27, 2023 at 12:58 PM
> >> To: Tejas Belagod 
> >> Cc: gcc-patches@gcc.gnu.org 
> >> Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors
> >>
> >> On Tue, Jun 27, 2023 at 8:30 AM Tejas Belagod  
> >> wrote:
> >>>
> >>>
> >>>
> >>>
> >>>
> >>> From: Richard Biener 
> >>> Date: Monday, June 26, 2023 at 2:23 PM
> >>> To: Tejas Belagod 
> >>> Cc: gcc-patches@gcc.gnu.org 
> >>> Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors
> >>>
> >>> On Mon, Jun 26, 2023 at 8:24 AM Tejas Belagod via Gcc-patches
> >>>  wrote:
> 
>  Hi,
> 
>  Packed Boolean Vectors
>  --
> 
>  I'd like to propose a feature addition to GNU Vector extensions to add 
>  packed
>  boolean vectors (PBV).  This has been discussed in the past here[1] and 
>  a variant has
>  been implemented in Clang recently[2].
> 
>  With predication features being added to vector architectures (SVE, MVE, 
>  AVX),
>  it is a useful feature to have to model predication on targets.  This 
>  could
>  find its use in intrinsics or just used as is as a GNU vector extension 
>  being
>  mapped to underlying target features.  For example, the packed boolean 
>  vector
>  could directly map to a predicate register on SVE.
> 
>  Also, this new packed boolean type GNU extension can be used with SVE 
>  ACLE
>  intrinsics to replace a fixed-length svbool_t.
> 
>  Here are a few options to represent the packed boolean vector type.
> >>>
> >>> The GIMPLE frontend uses a new 'vector_mask' attribute:
> >>>
> >>> typedef int v8si __attribute__((vector_size(8*sizeof(int))));
> >>> typedef v8si v8sib __attribute__((vector_mask));
> >>>
> >>> it gets you a vector type that's the appropriate (dependent on the
> >>> target) vector
> >>> mask type for the vector data type (v8si in this case).
> >>>
> >>>
> >>>
> >>> Thanks Richard.
> >>>
> >>> Having had a quick look at the implementation, it does seem to tick the 
> >>> boxes.
> >>>
> >>> I must admit I haven't dug deep, but if the target hook allows the mask 
> >>> to be
> >>>
> >>> defined in way that is target-friendly (and I don't know how much effort 
> >>> it will
> >>>
> >>> be to migrate the attribute to more front-ends), it should do the job 
> >>> nicely.
> >>>
> >>> Let me go back and dig a bit deeper and get back with questions if any.
> >>
> >>
> >> Let me add that the advantage of this is the compiler doesn't need
> >> to support weird explicitly laid out packed boolean vectors that do
> >> not match what the target supports and the user doesn't need to know
> >> what the target supports (and thus have an #ifdef maze around explicitly
> >> specified layouts).
> >>
> >> Sorry for the delayed response – I spent a day experimenting with 
> >> vector_mask.
> >>
> >>
> >>
> >> Yeah, this is what option 4 in the RFC is trying to achieve – be portable 
> >> enough
> >>
> >> to avoid having to sprinkle the code with ifdefs.
> >>
> >>
> >> It does remove some flexibility though, for example with -mavx512f 
> >> -mavx512vl
> >> you'll get AVX512 style masks for V4SImode data vectors but of course the
> >> target still supports SSE2/AVX2 style masks as well, but those would not be
> >> available as "packed boolean vectors", though they are of course in fact
> >> equal to V4SImode data vectors with -1 or 0 values, so in this particular
> >> case it might not matter.
> >>
> >> That said, the vector_mask attribute will get you V4SImode vectors with
> >> signed boolean elements of 32 bits for V4SImode data vectors with
> >> SSE2/AVX2.
> >>
> >>
> >>
> >> This sounds very much like what the scenario would be with NEON vs SVE. 
> >> Coming to think
> >>
> >> of it, vector_mask resembles option 4 in the proposal with ‘n’ implied by 
> >> the ‘base’ vector type
> >>
> >> and a ‘w’ specified for the type.
> >>
> >>
> >>
> >> Given its current implementation, if vector_mask is exposed to the CFE, 
> >> would there be any
> >>
> >> major challenges wrt implementation or defining behaviour semantics? I 
> >> played around with a
> >>
> >> few examples from the testsuite and wrote some new ones. I mostly tried 
> >> operations that
> >>
> >> the new type would have to support (unary, binary bitwise, initializations 
> >> etc) – with a couple of exceptions
> >>
> >> most of the ops seem to be supported. I also triggered a couple of ICEs in 
> >> some tests involving
> >>
> >> implicit conversions to wider/narrower vector_mask types (will raise 
> >> reports for these). Correct me
> >>
> >> if I’m wrong here, but we’d probably have to support a couple of new ops 
> >> if vector_mask is exposed
> >>
> >> to the CFE – initialization and subscript operations?

Re: [PATCH] Use chain_next on eh_landing_pad_d for GTY (PR middle-end/110510)

2023-07-03 Thread Richard Biener via Gcc-patches

On Sat, Jul 1, 2023 at 11:29 PM Andrew Pinski via Gcc-patches
 wrote:
>
> The backtrace in the bug report suggests that we are running out of
> stack during GC collection, because of a long chain of eh_landing_pad_d.
> This might fix that by adding chain_next onto eh_landing_pad_d's GTY marker.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu with no regressions.

OK.

Richard.

> gcc/ChangeLog:
>
> PR middle-end/110510
> * except.h (struct eh_landing_pad_d): Add chain_next GTY.
> ---
>  gcc/except.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/except.h b/gcc/except.h
> index 378a9e4cb77..173b0f026db 100644
> --- a/gcc/except.h
> +++ b/gcc/except.h
> @@ -66,7 +66,7 @@ enum eh_region_type
>  /* A landing pad for a given exception region.  Any transfer of control
> from the EH runtime to the function happens at a landing pad.  */
>
> -struct GTY(()) eh_landing_pad_d
> +struct GTY((chain_next("%h.next_lp"))) eh_landing_pad_d
>  {
>/* The linked list of all landing pads associated with the region.  */
>struct eh_landing_pad_d *next_lp;
> --
> 2.31.1
>

Re: [PATCH 1/2] Fix PR 110487: invalid signed boolean value

2023-07-03 Thread Richard Biener via Gcc-patches

On Sat, Jul 1, 2023 at 10:23 AM Andrew Pinski via Gcc-patches
 wrote:
>
> This fixes the first part of this bug where `a ? -1 : 0`
> would cause a value of 1 into the signed boolean value.
> It fixes the problem by casting to an integer type of
> the same size/signedness before doing the negative and
> then casting to the type of expression.
>
> OK? Bootstrapped and tested on x86_64.

OK.

Richard.

> gcc/ChangeLog:
>
> * match.pd (a?-1:0): Cast to an integer type
> rather than to the type before the negate.
> (a?0:-1): Likewise.
> ---
>  gcc/match.pd | 22 --
>  1 file changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 45c72e733a5..a0d114f6a16 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -4703,7 +4703,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>  /* a ? -1 : 0 -> -a.  No need to check the TYPE_PRECISION not being 1
> here as the powerof2cst case above will handle that case correctly.  
> */
>  (if (INTEGRAL_TYPE_P (type) && integer_all_onesp (@1))
> - (negate (convert (convert:boolean_type_node @0))
> + (with {
> +   auto prec = TYPE_PRECISION (type);
> +   auto unsign = TYPE_UNSIGNED (type);
> +   tree inttype = build_nonstandard_integer_type (prec, unsign);
> +  }
> +  (convert (negate (convert:inttype (convert:boolean_type_node @0
>(if (integer_zerop (@1))
> (with {
>tree booltrue = constant_boolean_node (true, boolean_type_node);
> @@ -4722,7 +4727,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   /* a ? -1 : 0 -> -(!a).  No need to check the TYPE_PRECISION not being 1
> here as the powerof2cst case above will handle that case correctly.  
> */
>   (if (INTEGRAL_TYPE_P (type) && integer_all_onesp (@2))
> -  (negate (convert (bit_xor (convert:boolean_type_node @0) { booltrue; } 
> 
> +  (with {
> +   auto prec = TYPE_PRECISION (type);
> +   auto unsign = TYPE_UNSIGNED (type);
> +   tree inttype = build_nonstandard_integer_type (prec, unsign);
> +   }
> +   (convert
> +   (negate
> + (convert:inttype
> + (bit_xor (convert:boolean_type_node @0) { booltrue; } )
> +)
> +   )
> +   )
> +  )
> + )
>  )
> )
>)
> --
> 2.31.1
>

Re: [EXTERNAL] Re: [PATCH] Collect both user and kernel events for autofdo tests and autoprofiledbootstrap

2023-07-03 Thread Richard Biener via Gcc-patches

On Sat, Jul 1, 2023 at 12:05 AM Eugene Rozenfeld
 wrote:
>
> I also set /proc/sys/kernel/perf_event_paranoid to 1 instead of the default 2.

Does the perf attempt fail when the privileges are not adjusted and you specify
--all?  I see it adds /uk as flags, when I do

> perf record -e instructions//uk ./a.out

it doesn't complain in any way with

> cat /proc/sys/kernel/kptr_restrict
1
> cat /proc/sys/kernel/perf_event_paranoid
2

so if the 'kernel' side is simply ignored when profiling there
isn't permitted/possible,
then I guess the patch is OK?

Can you confirm?

Thanks,
Richard.

> -Original Message-
> From: Gcc-patches  On 
> Behalf Of Eugene Rozenfeld via Gcc-patches
> Sent: Friday, June 30, 2023 2:44 PM
> To: Sam James ; Richard Biener 
> Cc: gcc-patches@gcc.gnu.org
> Subject: RE: [EXTERNAL] Re: [PATCH] Collect both user and kernel events for 
> autofdo tests and autoprofiledbootstrap
>
> I don't run this with elevated privileges but I set 
> /proc/sys/kernel/kptr_restrict to 0. Setting that does require elevated 
> privileges.
>
> If that's not acceptable, the only fix I can think of is to make that event 
> mapping threshold percentage a parameter to create_gcov and pass something 
> low enough. 80% instead of the current threshold of 95% should work, although 
> it's a bit fragile.
>
> Eugene
>
> -Original Message-
> From: Sam James 
> Sent: Friday, June 30, 2023 1:59 AM
> To: Richard Biener 
> Cc: Eugene Rozenfeld ; gcc-patches@gcc.gnu.org
> Subject: [EXTERNAL] Re: [PATCH] Collect both user and kernel events for 
> autofdo tests and autoprofiledbootstrap
>
> [You don't often get email from s...@gentoo.org. Learn why this is important 
> at https://aka.ms/LearnAboutSenderIdentification ]
>
> Richard Biener via Gcc-patches  writes:
>
> > On Fri, Jun 30, 2023 at 7:28 AM Eugene Rozenfeld via Gcc-patches
> >  wrote:
> >>
> >> When we collect just user events for autofdo with lbr we get some
> >> events where branch sources are kernel addresses and branch targets
> >> are user addresses. Without kernel MMAP events create_gcov can't make
> >> sense of kernel addresses. Currently create_gcov fails if it can't
> >> map at least 95% of events. We sometimes get below this threshold with 
> >> just user events. The change is to collect both user events and kernel 
> >> events.
> >
> > Does this require elevated privileges?  Can we instead "fix" create_gcov 
> > here?
>
> Right, requiring privileges for this is going to be a no-go for a lot of 
> builders. In a distro context, for example, it means we can't consider 
> autofdo at all.

Re: [PATCH] RISC-V: Support vfwmul.vv combine lowering

2023-07-03 Thread Robin Dapp via Gcc-patches

> Thanks. Ok for trunk？

OK from my side.  As agreed with Jeff, I'm going to get back to this
and revisit/change if needed in the future.

Regards
 Robin

Re: [PATCH] RISC-V: Support vfwnmacc/vfwmsac/vfwnmsac combine lowering

2023-07-03 Thread Robin Dapp via Gcc-patches

To reiterate, this is OK from my side.  As discussed in the other
thread, Jeff would like to have more info on whether a bridge pattern
is needed at all and I agreed to get back to it in a while.  Until
then, we can merge this.

Regards
 Robin

Re: [PATCH] Middle-end: Change order of LEN_MASK_LOAD/LEN_MASK_STORE arguments

2023-07-03 Thread Richard Sandiford via Gcc-patches

juzhe.zh...@rivai.ai writes:
> From: Ju-Zhe Zhong 
>
> Hi, Richard and Richi.
>
> According to Richard's review comments:
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623405.html
>
> current len, bias and mask order is not reasonable.
>
> Change {len,mask,bias} into {len,bias,mask}.
>
> This patch is a simple fix patch, ok for trunk ?

Sorry to be awkward, but can you put the patches in the opposite order,
so that this one comes first, and then the gather/scatter patch comes
second?

I don't think we want internal_fn_bias_index.  The point of reordering
the arguments was to put the bias immediately after the length in all
cases.  In other words, the length and the bias form a pair: the bias
only exists if the length does, and the length only exists if the bias
does.

In other respects the patch LGTM.  Thanks for doing this.

Richard

>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md: Change order of 
> LEN_MASK_LOAD/LEN_MASK_STORE arguments.
> * config/riscv/riscv-v.cc (expand_load_store): Ditto.
> * doc/md.texi: Ditto.
> * gimple-fold.cc (gimple_fold_partial_load_store_mem_ref): Ditto.
> * internal-fn.cc (len_maskload_direct): Ditto.
> (len_maskstore_direct): Ditto.
> (add_len_bias_and_mask_args): Ditto.
> (expand_partial_load_optab_fn): Ditto.
> (expand_partial_store_optab_fn): Ditto.
> (internal_fn_mask_index): Ditto.
> (internal_fn_len_index): Ditto.
> (internal_fn_bias_index): Ditto.
> (internal_fn_stored_value_index): Ditto.
> (internal_len_load_store_bias): Ditto.
> * tree-ssa-dse.cc (initialize_ao_ref_for_dse): Ditto.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
>
> ---
>  gcc/config/riscv/autovec.md |   8 +-
>  gcc/config/riscv/riscv-v.cc |   2 +-
>  gcc/doc/md.texi |  16 ++--
>  gcc/gimple-fold.cc  |   7 +-
>  gcc/internal-fn.cc  | 176 +---
>  gcc/tree-ssa-dse.cc |  13 +--
>  gcc/tree-vect-stmts.cc  |   6 +-
>  7 files changed, 90 insertions(+), 138 deletions(-)
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 1488f2be1be..4ab0e9f99eb 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -26,8 +26,8 @@
>[(match_operand:V 0 "register_operand")
> (match_operand:V 1 "memory_operand")
> (match_operand 2 "autovec_length_operand")
> -   (match_operand: 3 "vector_mask_operand")
> -   (match_operand 4 "const_0_operand")]
> +   (match_operand 3 "const_0_operand")
> +   (match_operand: 4 "vector_mask_operand")]
>"TARGET_VECTOR"
>  {
>riscv_vector::expand_load_store (operands, true);
> @@ -38,8 +38,8 @@
>[(match_operand:V 0 "memory_operand")
> (match_operand:V 1 "register_operand")
> (match_operand 2 "autovec_length_operand")
> -   (match_operand: 3 "vector_mask_operand")
> -   (match_operand 4 "const_0_operand")]
> +   (match_operand 3 "const_0_operand")
> +   (match_operand: 4 "vector_mask_operand")]
>"TARGET_VECTOR"
>  {
>riscv_vector::expand_load_store (operands, false);
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index adb8d7d36a5..8d5bed7ebe4 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -2777,7 +2777,7 @@ expand_load_store (rtx *ops, bool is_load)
>  {
>poly_int64 value;
>rtx len = ops[2];
> -  rtx mask = ops[3];
> +  rtx mask = ops[4];
>machine_mode mode = GET_MODE (ops[0]);
>  
>if (poly_int_rtx_p (len, ) && known_eq (value, GET_MODE_NUNITS 
> (mode)))
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index b44d1ba3af9..f14dd32b2dc 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5319,15 +5319,15 @@ This pattern is not allowed to @code{FAIL}.
>  @cindex @code{len_maskload@var{m}@var{n}} instruction pattern
>  @item @samp{len_maskload@var{m}@var{n}}
>  Perform a masked load from the memory location pointed to by operand 1
> -into register operand 0.  (operand 2 + operand 4) elements are loaded from
> +into register operand 0.  (operand 2 + operand 3) elements are loaded from
>  memory and other elements in operand 0 are set to undefined values.
>  This is a combination of len_load and maskload.
>  Operands 0 and 1 have mode @var{m}, which must be a vector mode.  Operand 2
>  has whichever integer mode the target prefers.  A mask is specified in
> -operand 3 which must be of type @var{n}.  The mask has lower precedence than
> +operand 4 which must be of type @var{n}.  The mask has lower precedence than
>  the length and is itself subject to length masking,
> -i.e. only mask indices < (operand 2 + operand 4) are used.
> -Operand 4 conceptually has mode @code{QI}.
> +i.e. only mask indices < (operand 2 + operand 3) are used.
> +Operand 3 conceptually has mode @code{QI}.
>  
>  Operand 2 can be a variable or a constant amount.  Operand 4 specifies a
>

Re: [PATCH] arm: Fix MVE intrinsics support with LTO (PR target/110268)

2023-07-03 Thread Christophe Lyon via Gcc-patches

Ping?


On Mon, 26 Jun 2023 at 17:02, Christophe Lyon 
wrote:

> After the recent MVE intrinsics re-implementation, LTO stopped working
> because the intrinsics would no longer be defined.
>
> The main part of the patch is simple and similar to what we do for
> AArch64:
> - call handle_arm_mve_h() from arm_init_mve_builtins to declare the
>   intrinsics when the compiler is in LTO mode
> - actually implement arm_builtin_decl for MVE.
>
> It was just a bit tricky to handle __ARM_MVE_PRESERVE_USER_NAMESPACE:
> its value in the user code cannot be guessed at LTO time, so we always
> have to assume that it was not defined.  This led to a few fixes in the
> way we register MVE builtins as placeholders or not.  Without this
> patch, we would just omit some versions of the intrinsics when
> __ARM_MVE_PRESERVE_USER_NAMESPACE is true. In fact, like for the C/C++
> placeholders, we need to always keep entries for all of them to ensure
> that we have a consistent numbering scheme.
>
> 2023-06-26  Christophe Lyon   
>
> PR target/110268
> gcc/
> * config/arm/arm-builtins.cc (arm_init_mve_builtins): Handle LTO.
> (arm_builtin_decl): Handle MVE builtins.
> * config/arm/arm-mve-builtins.cc (builtin_decl): New function.
> (add_unique_function): Fix handling of
> __ARM_MVE_PRESERVE_USER_NAMESPACE.
> (add_overloaded_function): Likewise.
> * config/arm/arm-protos.h (builtin_decl): New declaration.
>
> gcc/testsuite/
> * gcc.target/arm/pr110268-1.c: New test.
> * gcc.target/arm/pr110268-2.c: New test.
> ---
>  gcc/config/arm/arm-builtins.cc| 11 +++-
>  gcc/config/arm/arm-mve-builtins.cc| 61 ---
>  gcc/config/arm/arm-protos.h   |  1 +
>  gcc/testsuite/gcc.target/arm/pr110268-1.c | 11 
>  gcc/testsuite/gcc.target/arm/pr110268-2.c | 22 
>  5 files changed, 76 insertions(+), 30 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/pr110268-1.c
>  create mode 100644 gcc/testsuite/gcc.target/arm/pr110268-2.c
>
> diff --git a/gcc/config/arm/arm-builtins.cc
> b/gcc/config/arm/arm-builtins.cc
> index 36365e40a5b..fca7dcaf565 100644
> --- a/gcc/config/arm/arm-builtins.cc
> +++ b/gcc/config/arm/arm-builtins.cc
> @@ -1918,6 +1918,15 @@ arm_init_mve_builtins (void)
>arm_builtin_datum *d = _builtin_data[i];
>arm_init_builtin (fcode, d, "__builtin_mve");
>  }
> +
> +  if (in_lto_p)
> +{
> +  arm_mve::handle_arm_mve_types_h ();
> +  /* Under LTO, we cannot know whether
> +__ARM_MVE_PRESERVE_USER_NAMESPACE was defined, so assume it
> +was not.  */
> +  arm_mve::handle_arm_mve_h (false);
> +}
>  }
>
>  /* Set up all the NEON builtins, even builtins for instructions that are
> not
> @@ -2723,7 +2732,7 @@ arm_builtin_decl (unsigned code, bool initialize_p
> ATTRIBUTE_UNUSED)
>  case ARM_BUILTIN_GENERAL:
>return arm_general_builtin_decl (subcode);
>  case ARM_BUILTIN_MVE:
> -  return error_mark_node;
> +  return arm_mve::builtin_decl (subcode);
>  default:
>gcc_unreachable ();
>  }
> diff --git a/gcc/config/arm/arm-mve-builtins.cc
> b/gcc/config/arm/arm-mve-builtins.cc
> index 7033e41a571..e9a12f27411 100644
> --- a/gcc/config/arm/arm-mve-builtins.cc
> +++ b/gcc/config/arm/arm-mve-builtins.cc
> @@ -493,6 +493,16 @@ handle_arm_mve_h (bool preserve_user_namespace)
>  preserve_user_namespace);
>  }
>
> +/* Return the function decl with SVE function subcode CODE, or
> error_mark_node
> +   if no such function exists.  */
> +tree
> +builtin_decl (unsigned int code)
> +{
> +  if (code >= vec_safe_length (registered_functions))
> +return error_mark_node;
> +  return (*registered_functions)[code]->decl;
> +}
> +
>  /* Return true if CANDIDATE is equivalent to MODEL_TYPE for overloading
> purposes.  */
>  static bool
> @@ -849,7 +859,6 @@ function_builder::add_function (const
> function_instance ,
>  ? integer_zero_node
>  : simulate_builtin_function_decl (input_location, name, fntype,
>   code, NULL, attrs);
> -
>registered_function  = *ggc_alloc  ();
>rfn.instance = instance;
>rfn.decl = decl;
> @@ -889,15 +898,12 @@ function_builder::add_unique_function (const
> function_instance ,
>gcc_assert (!*rfn_slot);
>*rfn_slot = 
>
> -  /* Also add the non-prefixed non-overloaded function, if the user
> namespace
> - does not need to be preserved.  */
> -  if (!preserve_user_namespace)
> -{
> -  char *noprefix_name = get_name (instance, false, false);
> -  tree attrs = get_attributes (instance);
> -  add_function (instance, noprefix_name, fntype, attrs,
> requires_float,
> -   false, false);
> -}
> +  /* Also add the non-prefixed non-overloaded function, as placeholder
> + if the user namespace does not need to be preserved.  */
> +  char

[PATCH] tree-optimization/110506 - ICE in pattern recog with TYPE_PRECISION

2023-07-03 Thread Richard Biener via Gcc-patches

The following re-orders checks to make sure we check TYPE_PRECISION
on an integral type.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

PR tree-optimization/110506
* tree-vect-patterns.cc (vect_recog_rotate_pattern): Re-order
TYPE_PRECISION access with INTEGRAL_TYPE_P check.

* gcc.dg/pr110506-2.c: New testcase.
---
 gcc/testsuite/gcc.dg/pr110506-2.c | 18 ++
 gcc/tree-vect-patterns.cc |  4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr110506-2.c

diff --git a/gcc/testsuite/gcc.dg/pr110506-2.c 
b/gcc/testsuite/gcc.dg/pr110506-2.c
new file mode 100644
index 000..aabca0fa156
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr110506-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+typedef unsigned uint32_t;
+typedef uint32_t uint32x4 __attribute__((vector_size(16)));
+typedef struct {
+  uint32x4 b, d;
+} prng_t;
+prng_t prng_rand_128_r_x;
+int main_flags;
+int main() {
+  uint32_t ref_crc[] = {7, 3};
+  uint32x4 e = (prng_rand_128_r_x.b << 27) + (prng_rand_128_r_x.b >> 32 - 27);
+  prng_rand_128_r_x.d = e;
+  if (ref_crc[main_flags])
+__builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 25f7f8e1e44..de20e9d59cb 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3726,8 +3726,8 @@ vect_recog_rotate_pattern (vec_info *vinfo,
 return NULL;
 
   if (TREE_CODE (oprnd0) != SSA_NAME
-  || TYPE_PRECISION (TREE_TYPE (lhs)) != TYPE_PRECISION (type)
-  || !INTEGRAL_TYPE_P (type))
+  || !INTEGRAL_TYPE_P (type)
+  || TYPE_PRECISION (TREE_TYPE (lhs)) != TYPE_PRECISION (type))
 return NULL;
 
   stmt_vec_info def_stmt_info;
-- 
2.35.3

[PATCH] tree-optimization/110506 - bogus non-zero mask in CCP for vector types

2023-07-03 Thread Richard Biener via Gcc-patches

get_value_for_expr was blindly using TYPE_PRECISION to produce
a mask for vector typed entities which the new tree checking now
catches.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

PR tree-optimization/110506
* tree-ssa-ccp.cc (get_value_for_expr): Check for integral
type before relying on TYPE_PRECISION to produce a nonzero mask.

* gcc.dg/pr110506.c: New testcase.
---
 gcc/testsuite/gcc.dg/pr110506.c | 24 
 gcc/tree-ssa-ccp.cc |  1 +
 2 files changed, 25 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/pr110506.c

diff --git a/gcc/testsuite/gcc.dg/pr110506.c b/gcc/testsuite/gcc.dg/pr110506.c
new file mode 100644
index 000..10dbd4e6ce6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr110506.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+struct {
+  long *sp;
+  long *csp;
+} neko_interp_loop_vm;
+int neko_interp_loop_vm_2;
+void neko_interp_loop()
+{
+  void *pc[] = {&, &, &,
+  &, &};
+  long *sp, *csp = neko_interp_loop_vm.csp;
+LabelAccGlobal:
+  neko_interp_loop_vm.sp = sp;
+  neko_interp_loop_vm.csp = csp;
+  goto * 0;
+LabelTailCall:
+  csp = sp -= neko_interp_loop_vm_2;
+LabelMakeArray2:
+LabelPhysCompare:
+LabelLoop:
+  goto * 0;
+}
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 26d5e445abd..0d0f02a8442 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -682,6 +682,7 @@ get_value_for_expr (tree expr, bool for_bits_p)
 }
 
   if (val.lattice_val == VARYING
+  && INTEGRAL_TYPE_P (TREE_TYPE (expr))
   && TYPE_UNSIGNED (TREE_TYPE (expr)))
 val.mask = wi::zext (val.mask, TYPE_PRECISION (TREE_TYPE (expr)));
 
-- 
2.35.3

Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

2023-07-03 Thread Tejas Belagod via Gcc-patches


On 6/29/23 6:55 PM, Richard Biener wrote:

On Wed, Jun 28, 2023 at 1:26 PM Tejas Belagod  wrote:






From: Richard Biener 
Date: Tuesday, June 27, 2023 at 12:58 PM
To: Tejas Belagod 
Cc: gcc-patches@gcc.gnu.org 
Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

On Tue, Jun 27, 2023 at 8:30 AM Tejas Belagod  wrote:






From: Richard Biener 
Date: Monday, June 26, 2023 at 2:23 PM
To: Tejas Belagod 
Cc: gcc-patches@gcc.gnu.org 
Subject: Re: [RFC] GNU Vector Extension -- Packed Boolean Vectors

On Mon, Jun 26, 2023 at 8:24 AM Tejas Belagod via Gcc-patches
 wrote:


Hi,

Packed Boolean Vectors
--

I'd like to propose a feature addition to GNU Vector extensions to add packed
boolean vectors (PBV).  This has been discussed in the past here[1] and a 
variant has
been implemented in Clang recently[2].

With predication features being added to vector architectures (SVE, MVE, AVX),
it is a useful feature to have to model predication on targets.  This could
find its use in intrinsics or just used as is as a GNU vector extension being
mapped to underlying target features.  For example, the packed boolean vector
could directly map to a predicate register on SVE.

Also, this new packed boolean type GNU extension can be used with SVE ACLE
intrinsics to replace a fixed-length svbool_t.

Here are a few options to represent the packed boolean vector type.


The GIMPLE frontend uses a new 'vector_mask' attribute:

typedef int v8si __attribute__((vector_size(8*sizeof(int))));
typedef v8si v8sib __attribute__((vector_mask));

it gets you a vector type that's the appropriate (dependent on the
target) vector
mask type for the vector data type (v8si in this case).



Thanks Richard.

Having had a quick look at the implementation, it does seem to tick the boxes.

I must admit I haven't dug deep, but if the target hook allows the mask to be

defined in way that is target-friendly (and I don't know how much effort it will

be to migrate the attribute to more front-ends), it should do the job nicely.

Let me go back and dig a bit deeper and get back with questions if any.



Let me add that the advantage of this is the compiler doesn't need
to support weird explicitly laid out packed boolean vectors that do
not match what the target supports and the user doesn't need to know
what the target supports (and thus have an #ifdef maze around explicitly
specified layouts).

Sorry for the delayed response – I spent a day experimenting with vector_mask.



Yeah, this is what option 4 in the RFC is trying to achieve – be portable enough

to avoid having to sprinkle the code with ifdefs.


It does remove some flexibility though, for example with -mavx512f -mavx512vl
you'll get AVX512 style masks for V4SImode data vectors but of course the
target still supports SSE2/AVX2 style masks as well, but those would not be
available as "packed boolean vectors", though they are of course in fact
equal to V4SImode data vectors with -1 or 0 values, so in this particular
case it might not matter.

That said, the vector_mask attribute will get you V4SImode vectors with
signed boolean elements of 32 bits for V4SImode data vectors with
SSE2/AVX2.



This sounds very much like what the scenario would be with NEON vs SVE. Coming 
to think

of it, vector_mask resembles option 4 in the proposal with ‘n’ implied by the 
‘base’ vector type

and a ‘w’ specified for the type.



Given its current implementation, if vector_mask is exposed to the CFE, would 
there be any

major challenges wrt implementation or defining behaviour semantics? I played 
around with a

few examples from the testsuite and wrote some new ones. I mostly tried 
operations that

the new type would have to support (unary, binary bitwise, initializations etc) 
– with a couple of exceptions

most of the ops seem to be supported. I also triggered a couple of ICEs in some 
tests involving

implicit conversions to wider/narrower vector_mask types (will raise reports 
for these). Correct me

if I’m wrong here, but we’d probably have to support a couple of new ops if 
vector_mask is exposed

to the CFE – initialization and subscript operations?


Yes, either that or restrict how the mask vectors can be used, thus
properly diagnose improper
uses. 


Indeed.

 A question would be for example how to write common mask test

operations like
if (any (mask)) or if (all (mask)). 


I see 2 options here. New builtins could support new types - they'd 
provide a target independent way to test any and all conditions. Another 
would be to let the target use its intrinsics to do them in the most 
efficient way possible (which the builtins would get lowered down to 
anyway).



 Likewise writing merge operations

- do those as

  a = a | (mask ? b : 0);

thus use ternary ?: for this?  


Yes, like now, the ternary could just translate to

  {mask[0] ? b[0] : 0, mask[1] ? b[1] : 0, ... }

One thing to flesh out is the semantics. Should we allow this operation 
as long as the

[PATCH, rs6000] Extract the element in dword0 by mfvsrd and shift/mask [PR110331]

2023-07-03 Thread HAO CHEN GUI via Gcc-patches

Hi,
  This patch implements the vector element extraction by mfvsrd and
shift/mask when the element is in dword0 of the vector. Originally,
it generates vsplat/mfvsrd on P8 and li/vextract on P9. Since mfvsrd
has lower latency than vextract and rldicl has lower latency than
vsplat, the new sequence is beneficial. In particular, the shift/mask
is not needed when the element is the first element of dword0. So it saves
another rldicl when it returns a sign extend value.

  This patch is based on previous one.
https://gcc.gnu.org/pipermail/gcc-patches/2023-June/622101.html

  Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.

Thanks
Gui Haochen


ChangeLog
rs6000: Extract the element in dword0 by mfvsrd and shift/mask

gcc/
PR target/110331
* config/rs6000/rs6000-protos.h (rs6000_vsx_element_in_dword0_p):
Declare.
(rs6000_vsx_extract_element_from_dword0): Declare.
* config/rs6000/rs6000.cc (rs6000_vsx_element_in_dword0_p): New
function to judge if an element is in dword0 of a vector.
(rs6000_vsx_extract_element_from_dword0): Extract an element from
dword0 by mfvsrd and lshiftrt and mask.
* config/rs6000/rs6000.md (*rotl3_mask): Rename to...
(rotl3_mask): ...this
* config/rs6000/vsx.md (vsx_extract_): Add a comment.
(split pattern for p9 vector extract): Call
rs6000_vsx_extract_element_from_dword0 if the element is in dword0.
(*vsx_extract__di_p9): Exclude the elements in dword0 which
are processed by *vsx_extract__zero_extend for both p8 and p9.
(*vsx_extract__zero_extend): Zero extend pattern for vector
extract on the element of dword0.
(*vsx_extract__p8): Call rs6000_vsx_extract_element_from_dword0
when the extracted element is in dword0.  Refined the pattern and
remove reload_completed from split condition.

gcc/testsuite/
PR target/110331
* gcc.target/powerpc/fold-vec-extract-char.p8.c: Set the extracted
elements in dword1.
* gcc.target/powerpc/fold-vec-extract-char.p9.c: Likewise.
* gcc.target/powerpc/fold-vec-extract-int.p8.c: Likewise.
* gcc.target/powerpc/fold-vec-extract-int.p9.c: Likewise.
* gcc.target/powerpc/fold-vec-extract-short.p8.c: Likewise.
* gcc.target/powerpc/fold-vec-extract-short.p9.c: Likewise.
* gcc.target/powerpc/p9-extract-1.c: Likewise.
* gcc.target/powerpc/pr110331-p8.c: New.
* gcc.target/powerpc/pr110331-p9.c: New.
* gcc.target/powerpc/pr110331.h: New.

patch.diff
diff --git a/gcc/config/rs6000/rs6000-protos.h 
b/gcc/config/rs6000/rs6000-protos.h
index f70118ea40f..ccef280122b 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -161,6 +161,8 @@ extern bool rs6000_function_pcrel_p (struct function *);
 extern bool rs6000_pcrel_p (void);
 extern bool rs6000_fndecl_pcrel_p (const_tree);
 extern void rs6000_output_addr_vec_elt (FILE *, int);
+extern bool rs6000_vsx_element_in_dword0_p (rtx, enum machine_mode);
+extern void rs6000_vsx_extract_element_from_dword0 (rtx, rtx, rtx, bool);

 /* Different PowerPC instruction formats that are used by GCC.  There are
various other instruction formats used by the PowerPC hardware, but these
diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 07c3a3d15ac..fad01d6b5dd 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -29098,6 +29098,74 @@ rs6000_opaque_type_invalid_use_p (gimple *stmt)
   return false;
 }

+/* Return true when the element is in dword0 of a vector.  Exclude word
+   element 1 of VS4SI as the word can be extracted by mfvsrwz directly.  */
+
+bool
+rs6000_vsx_element_in_dword0_p (rtx op, enum machine_mode mode)
+{
+  gcc_assert (CONST_INT_P (op));
+  gcc_assert (mode == V16QImode || mode == V8HImode || mode == V4SImode);
+
+  int units = GET_MODE_NUNITS (mode);
+  int elt = INTVAL (op);
+  elt = BYTES_BIG_ENDIAN ? units - 1 - elt : elt;
+
+  if (elt > units / 2
+  || (elt == units / 2 && mode != V4SImode))
+return true;
+  else
+return false;
+}
+
+/* Extract element from dword0 by mfvsrd and lshiftrt and mask.  Extend_p
+   indicates if zero extend is needed or not.  */
+
+void
+rs6000_vsx_extract_element_from_dword0 (rtx dest, rtx src, rtx element,
+   bool extend_p)
+{
+  enum machine_mode mode = GET_MODE (src);
+  gcc_assert (rs6000_vsx_element_in_dword0_p (element, mode));
+
+  enum machine_mode dest_mode = GET_MODE (dest);
+  enum machine_mode inner_mode = GET_MODE_INNER (mode);
+  int units = GET_MODE_NUNITS (mode);
+  int elt = INTVAL (element);
+  elt = BYTES_BIG_ENDIAN ? units - 1 - elt : elt;
+  int value, shift;
+  unsigned int mask;
+
+  rtx vec_tmp = gen_lowpart (V2DImode, src);
+  rtx tmp1 = can_create_pseudo_p ()
+? gen_reg_rtx (DImode)
+: simplify_gen_subreg (DImode, dest,

98 matches

Mail list logo