[PATCH] gimple-fold: Use bitwise vector types rather than barely supported huge integral types in memcpy etc. folding [PR113988]

2024-02-27 Thread Jakub Jelinek
Hi!

The following patch changes the memcpy etc. folding to use bitwise vector
types rather than huge INTEGER_TYPEs for copying of > MAX_FIXED_MODE_SIZE
lengths.  The problem with the huge INTEGER_TYPEs is that they aren't
supported very much, usually there are just optabs to handle moves of them,
perhaps misaligned moves and that is it, so they pose problems e.g. to
BITINT_TYPE lowering.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-02-28  Jakub Jelinek  

PR tree-optimization/113988
* stor-layout.h (bitwise_mode_for_size): Declare.
* stor-layout.cc (bitwise_mode_for_size): New function.
* gimple-fold.cc (gimple_fold_builtin_memory_op): Use it.
Use bitwise_type_for_mode instead of build_nonstandard_integer_type.
Use BITS_PER_UNIT instead of 8.

* gcc.dg/bitint-91.c: New test.

--- gcc/stor-layout.h.jj2024-01-03 11:51:28.103778767 +0100
+++ gcc/stor-layout.h   2024-02-27 12:32:17.716535079 +0100
@@ -102,6 +102,8 @@ extern opt_machine_mode mode_for_size_tr
 
 extern tree bitwise_type_for_mode (machine_mode);
 
+extern opt_machine_mode bitwise_mode_for_size (poly_uint64);
+
 /* Given a VAR_DECL, PARM_DECL or RESULT_DECL, clears the results of
a previous call to layout_decl and calls it again.  */
 extern void relayout_decl (tree);
--- gcc/stor-layout.cc.jj   2024-01-17 13:53:13.160176498 +0100
+++ gcc/stor-layout.cc  2024-02-27 12:27:20.876647298 +0100
@@ -476,6 +476,32 @@ bitwise_type_for_mode (machine_mode mode
   return inner_type;
 }
 
+/* Find a mode that can be used for efficient bitwise operations on SIZE
+   bits, if one exists.  */
+
+opt_machine_mode
+bitwise_mode_for_size (poly_uint64 size)
+{
+  if (known_le (size, (unsigned int) MAX_FIXED_MODE_SIZE))
+return mode_for_size (size, MODE_INT, true);
+
+  machine_mode mode, ret = VOIDmode;
+  FOR_EACH_MODE_FROM (mode, MIN_MODE_VECTOR_INT)
+if (known_eq (GET_MODE_BITSIZE (mode), size)
+   && (ret == VOIDmode || GET_MODE_INNER (mode) == QImode)
+   && have_regs_of_mode[mode]
+   && targetm.vector_mode_supported_p (mode))
+  {
+   if (GET_MODE_INNER (mode) == QImode)
+ return mode;
+   else if (ret == VOIDmode)
+ ret = mode;
+  }
+  if (ret != VOIDmode)
+return ret;
+  return opt_machine_mode ();
+}
+
 /* Find a mode that is suitable for representing a vector with NUNITS
elements of mode INNERMODE, if one exists.  The returned mode can be
either an integer mode or a vector mode.  */
--- gcc/gimple-fold.cc.jj   2024-02-20 10:25:26.297760979 +0100
+++ gcc/gimple-fold.cc  2024-02-27 12:42:38.338925573 +0100
@@ -995,9 +995,12 @@ gimple_fold_builtin_memory_op (gimple_st
if (warning != OPT_Wrestrict)
  return false;
 
- scalar_int_mode mode;
- if (int_mode_for_size (ilen * 8, 0).exists (&mode)
- && GET_MODE_SIZE (mode) * BITS_PER_UNIT == ilen * 8
+ scalar_int_mode imode;
+ machine_mode mode;
+ if (int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
+ && bitwise_mode_for_size (ilen
+   * BITS_PER_UNIT).exists (&mode)
+ && known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT)
  /* If the destination pointer is not aligned we must be able
 to emit an unaligned store.  */
  && (dest_align >= GET_MODE_ALIGNMENT (mode)
@@ -1005,7 +1008,7 @@ gimple_fold_builtin_memory_op (gimple_st
  || (optab_handler (movmisalign_optab, mode)
  != CODE_FOR_nothing)))
{
- tree type = build_nonstandard_integer_type (ilen * 8, 1);
+ tree type = bitwise_type_for_mode (mode);
  tree srctype = type;
  tree desttype = type;
  if (src_align < GET_MODE_ALIGNMENT (mode))
--- gcc/testsuite/gcc.dg/bitint-91.c.jj 2024-02-27 12:08:15.230481756 +0100
+++ gcc/testsuite/gcc.dg/bitint-91.c2024-02-27 12:08:15.230481756 +0100
@@ -0,0 +1,38 @@
+/* PR tree-optimization/113988 */
+/* { dg-do compile { target bitint } } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-mavx512f" { target i?86-*-* x86_64-*-* } } */
+
+int i;
+
+#if __BITINT_MAXWIDTH__ >= 256
+void
+foo (void *p, _BitInt(256) x)
+{
+  __builtin_memcpy (p, &x, sizeof x);
+}
+
+_BitInt(256)
+bar (void *p, _BitInt(256) x)
+{
+  _BitInt(246) y = x + 1;
+  __builtin_memcpy (p, &y, sizeof y);
+  return x;
+}
+#endif
+
+#if __BITINT_MAXWIDTH__ >= 512
+void
+baz (void *p, _BitInt(512) x)
+{
+  __builtin_memcpy (p, &x, sizeof x);
+}
+
+_BitInt(512)
+qux (void *p, _BitInt(512) x)
+{
+  _BitInt(512) y = x + 1;
+  __builtin_memcpy (p, &y, sizeof y);
+  return x;
+}
+#endif

Jakub



Re: [PATCH] developer option: -fdump-generic-nodes; initial incorporation

2024-02-27 Thread Richard Biener
On Tue, Feb 27, 2024 at 10:20 PM Robert Dubner  wrote:
>
> Richard,
>
> Thank you very much for your comments.
>
> When I set out to create the capability, I had a "specification" in mind.
>
> I didn't have a clue how to create a GENERIC tree that could be fed to the
> middle end in a way that would successfully result in an executable.  And I
> needed to be able to do that in order to proceed with the project of
> creating a COBOL front end.
>
> So, I came up with the idea of using GCC to compile simple programs, and to
> hook into the compiler to examine the trees fed to the middle end, and to
> display those trees in the human-readable format I needed to understand
> them.  And that's what I did.
>
> My first incarnation generated pure text files, and I used that to get
> going.
>
> After a while I realized that when I used the output file, I was spending a
> lot of time searching through the text files.  And I had the brainstorm!
> Hyperlinks!  HTML files!  We have the technology!  So, I created the .HTML
> files as well.
>
> I found this useful to the point of necessity in order to learn how to
> generate the GENERIC trees.  I believe it would be equally useful to the
> next developer who, for whatever reason, needs to understand, on a "You need
> to learn the alphabet before you can learn how to read" level, what the
> middle end requires from a GENERIC tree generated by a front end.
>
> But I've never used it on a complex program. I've used it only to learn how
> to create the GENERIC nodes for very particular things, and so I would use
> the -fdump-generic-nodes feature on a very simple C program that
> demonstrated, in isolation, the feature I needed.  Once I figured it out, I
> would create front end C routines or macros that used the tree.h/tree.cc
> features to build those GENERIC trees, and then I would move on.
>
> I decided to offer it up here, in order to learn how to create patches
> and to get
> to know the people and the process, as well as from the desire to share it.
> And instantly I got the "How about a machine-readable format?" comments.
> Which are reasonable.  So, because it wasn't hard, I hacked at the existing
> code to create a JSON output.  (But I remind you that up until now, nobody
> seems to have needed a JSON representation.)
>
> And your observation that the human readable representation could be made
> from the JSON representation is totally accurate.
>
> But that wasn't my specification.  My specification was "A tool so that a
> human being can examine a simple GENERIC tree to learn how it's done."
>
> But it seems to me that we are now moving into the realm of a new
> specification.
>
> Said another way:  To go from "A human readable representation of a simple
> GENERIC tree" to "A machine readable JSON representation of an arbitrarily
> complex GENERIC tree, from which a human readable representation can be
> created" means, in effect, starting over on a different project that I don't
> need.  I already *have* a project that I am working on -- the COBOL front
> end.
>
> The complexity of GENERIC trees is, in my experienced opinion, an obstacle
> for the creation of front ends.  The GCC Internals document has a lot of
> information, but to go from it to a front end is like using the maintenance
> manual for an F16 fighter to try to learn to fly the aircraft.
>
> The program "main(){}" generates a tree with over seventy nodes.  I see no
> way to document why that's true; it's all arbitrary in the sense that "this
> is how GCC works".  -fdump-generic-nodes made it possible for me to figure
> out how those nodes are connected and, thus, how to create a new front end.
> I figure that other developers might find it useful, as well.
>
> I guess I am saying that I am not, at this time, able to work on a whole
> different tool.  I think what I have done so far does something useful that
> doesn't seem to otherwise exist in GCC.
>
> I suppose the question for you is, "Is it useful enough?"
>
> I won't be offended if the answer is "No" and I hope you won't be offended
> by my not having the bandwidth to address your very thoughtful and valid
> observations about how it could be better.

No offense taken - I did realize how useful this was to you (and specifically
the hyper-linking looked even very useful to me!).  I often lament the lack
of domain-specific visualization tools for the various data structures GCC
has - having something for GENERIC would be very welcome.

We have for example ways to dump graphviz .dot format graphs of the CFG
and some other data structures and do that natively, not via JSON indirection.

Incidentally this looks like something fit for a Google Summer of Code project.
Ideally it would hook into print-tree.cc providing an alternate
structured output.
It currently prints in the style

 
unit-size 
align:16 warn_if_not_align:0 symtab:0 alias-set -1
canonical-type 0x7702b540 precision:16 min  max >
QI
size 

RE: [PATCH v2] RISC-V: Introduce gcc option mrvv-vector-bits for RVV

2024-02-27 Thread Li, Pan2
Oh, I see, that indicates that simply converting this option value to 
riscv_vector_chunks is not good enough here.

I previously thought the term zvl* indicated the minimal vector length (somewhat 
similar to the concept of scalable),
which is mentioned in the RVV 1.0 spec if my memory is correct.

It looks like march=zvl* + mrvv-vector-bits=zvl means exactly that VLEN, e.g. 128 
bits. I will update it in 
v3 accordingly for the different semantics here.

Pan

-Original Message-
From: Kito Cheng  
Sent: Wednesday, February 28, 2024 2:17 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; Wang, Yanzhang 
; rdapp@gmail.com; jeffreya...@gmail.com
Subject: Re: [PATCH v2] RISC-V: Introduce gcc option mrvv-vector-bits for RVV

Keep SCALABLE, since it has different semantics with ZVL:

-mrvv-vector-bits=scalable means zvl*b specifies the minimal VLEN
-mrvv-vector-bits=zvl means zvl*b specifies the exact VLEN

What's the difference exactly?

-mrvv-vector-bits=scalable with zvl128b can run on any machine with VLEN >= 128
-mrvv-vector-bits=zvl with zvl128b can *only* run on a machine with
VLEN == 128

Sizeof vint32m1_t is VLEN under -mrvv-vector-bits=scalable with zvl128b
Sizeof vint32m1_t is 128 under -mrvv-vector-bits=zvl with zvl128b

Give more practical example on codegen with clang:
https://godbolt.org/z/vhdnGvK37

So it is more like an alias of
--param=riscv-autovec-preference=fixed-vlmax/--param=riscv-autovec-preference=scalable.

On Wed, Feb 28, 2024 at 12:17 PM  wrote:
>
> From: Pan Li 
>
> This patch would like to introduce one new gcc option for RVV. To
> appoint the bits size of one RVV vector register. Valid arguments to
> '-mrvv-vector-bits=' are:
>
> * zvl
>
> The zvl will pick up the zvl*b from the march option. For example,
> the mrvv-vector-bits will be 1024 when march=rv64gcv_zvl1024b.
>
> The below test are passed for this patch.
>
> * The riscv fully regression test.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-opts.h (enum rvv_vector_bits_enum): New enum for
> different RVV vector bits.
> * config/riscv/riscv.cc (riscv_convert_vector_bits): New func to
> get the RVV vector bits, with given min_vlen.
> (riscv_convert_vector_chunks): Combine the mrvv-vector-bits
> option with min_vlen to RVV vector chunks.
> (riscv_override_options_internal): Update comments and rename the
> vector chunks.
> * config/riscv/riscv.opt: Add option mrvv-vector-bits.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/rvv-vector-bits-1.c: New test.
> * gcc.target/riscv/rvv/base/rvv-vector-bits-2.c: New test.
> * gcc.target/riscv/rvv/base/rvv-vector-bits-3.c: New test.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/config/riscv/riscv-opts.h |  7 +
>  gcc/config/riscv/riscv.cc | 31 +++
>  gcc/config/riscv/riscv.opt| 11 +++
>  .../riscv/rvv/base/rvv-vector-bits-1.c|  7 +
>  .../riscv/rvv/base/rvv-vector-bits-2.c|  7 +
>  .../riscv/rvv/base/rvv-vector-bits-3.c| 25 +++
>  6 files changed, 82 insertions(+), 6 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-1.c
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-3.c
>
> diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
> index 4edddbadc37..0162e00515b 100644
> --- a/gcc/config/riscv/riscv-opts.h
> +++ b/gcc/config/riscv/riscv-opts.h
> @@ -129,6 +129,13 @@ enum vsetvl_strategy_enum {
>VSETVL_OPT_NO_FUSION,
>  };
>
> +/* RVV vector bits for option -mrvv-vector-bits
> +   zvl indicates take the bits of zvl*b provided by march as vector bits.
> + */
> +enum rvv_vector_bits_enum {
> +  RVV_VECTOR_BITS_ZVL,
> +};
> +
>  #define TARGET_ZICOND_LIKE (TARGET_ZICOND || (TARGET_XVENTANACONDOPS && 
> TARGET_64BIT))
>
>  /* Bit of riscv_zvl_flags will set contintuly, N-1 bit will set if N-bit is
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 5e984ee2a55..d18e5226bce 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -8801,13 +8801,32 @@ riscv_init_machine_status (void)
>return ggc_cleared_alloc<machine_function> ();
>  }
>
> -/* Return the VLEN value associated with -march.
> +static int
> +riscv_convert_vector_bits (int min_vlen)
> +{
> +  int rvv_bits = 0;
> +
> +  switch (rvv_vector_bits)
> +{
> +  case RVV_VECTOR_BITS_ZVL:
> +   rvv_bits = min_vlen;
> +   break;
> +  default:
> +   gcc_unreachable ();
> +}
> +
> +  return rvv_bits;
> +}
> +
> +/* Return the VLEN value associated with -march and -mrvv-vector-bits.
> TODO: So far we only support length-agnostic value. */
>  static poly_uint16
> -riscv_convert_vector_bits (struct gcc_options *opts)
> +riscv_convert_vector_chunks (struct gcc_options *opts)
>  {

Re: [PATCH] Fortran - Error compiling PDT Type-bound Procedures [PR82943/86148/86268]

2024-02-27 Thread Alexander Westbrooks
Harald,

Jerry helped me figure out my editor settings so that I could fix
whitespace and formatting issues in my code. With my editor configured
correctly, I saw that my code was not conforming to coding standards
as I previously thought it was. I have fixed those things and updated
my patch. Thank you for your patience.

Let me know if this is okay to push to the trunk.

Thanks,

Alexander Westbrooks

On Sun, Feb 25, 2024 at 2:40 PM Alexander Westbrooks
 wrote:
>
> Harald,
>
> Thank you for reviewing my code. I've been doing research and debugging to 
> investigate the error thrown by Intel and NAG for the deferred parameter in 
> the dummy variable declaration. I found where the problem was and added the 
> fix as part of my patch. I've attached the patch as a file, which also 
> includes your feedback and suggested fixes. I've updated the test case 
> pdt_37.f03 to check for the POINTER or ALLOCATABLE error as you suggested.
>
> All regression tests pass, including the new ones, after including the fix 
> for the POINTER or ALLOCATABLE error for CLASS declarations of PDTs when 
> deferred length parameters are used. This was tested on WSL 2, with Ubuntu 
> 20.04 distro.
>
> Is this okay to push to the trunk?
>
> Thanks,
>
> Alexander Westbrooks
>
>
> On Sun, Feb 11, 2024 at 2:11 PM Harald Anlauf  wrote:
>>
>> Hi Alex,
>>
>> I've been unable to apply your patch to my local trunk, likely due to
>> whitespace issues my newsreader handles differently from your site.
>> I see it inline instead of attached.
>>
>> A few general remarks:
>>
>> Please follow the general recommendation regarding style if possible,
>> see https://www.gnu.org/prep/standards/standards.html#Formatting
>> regarding formatting/whitespace use (5.1) and comments (5.2)
>>
>> Also, when an error message text spans multiple lines, please place the
>> whitespace at the end of a line, not at the beginning of the new one:
>>
>> > +  if ( resolve_bindings_derived->attr.pdt_template &&
>> > +   !gfc_pdt_is_instance_of(resolve_bindings_derived,
>> > +   CLASS_DATA(me_arg)->ts.u.derived))
>> > +{
>> > +  gfc_error ("Argument %qs of %qs with PASS(%s) at %L must be of"
>> > +" the parametric derived-type %qs", me_arg->name, proc->name,
>>
>>gfc_error ("Argument %qs of %qs with PASS(%s) at %L must be of "
>>   "the parametric derived-type %qs", me_arg->name,
>> proc->name,
>>
>> > +me_arg->name, &where, resolve_bindings_derived->name);
>> > +  goto error;
>> > +}
>>
>> The following change is almost unreadable: the lengthy comment is split
>> over three parts and almost hides the code.  Couldn't this be combined
>> into one comment before the function?
>>
>> > diff --git a/gcc/fortran/symbol.cc b/gcc/fortran/symbol.cc
>> > index fddf68f8398..11f4bac0415 100644
>> > --- a/gcc/fortran/symbol.cc
>> > +++ b/gcc/fortran/symbol.cc
>> > @@ -5172,6 +5172,35 @@ gfc_type_is_extension_of (gfc_symbol *t1, gfc_symbol
>> > *t2)
>> > return gfc_compare_derived_types (t1, t2);
>> >   }
>> >
>> > +/* Check if a parameterized derived type t2 is an instance of a PDT
>> > template t1 */
>> > +
>> > +bool
>> > +gfc_pdt_is_instance_of(gfc_symbol *t1, gfc_symbol *t2)
>> > +{
>> > +  if ( !t1->attr.pdt_template || !t2->attr.pdt_type )
>> > +return false;
>> > +
>> > +  /*
>> > +in decl.cc, gfc_get_pdt_instance, a pdt instance is given a 3
>> > character prefix "Pdt", followed
>> > +by an underscore list of the kind parameters, up to a maximum of 8.
>> > +
>> > +So to check if a PDT Type corresponds to the template, extract the
>> > core derive_type name,
>> > +and then see if it is type compatible by name...
>> > +
>> > +For example:
>> > +
>> > +Pdtf_2_2 -> extract out the 'f' -> see if the derived type 'f' is
>> > compatible with symbol t1
>> > +  */
>> > +
>> > +  // Starting at index 3 of the string in order to skip past the 'Pdt'
>> > prefix
>> > +  // Also, here the length of the template name is used in order to avoid
>> > the
>> > +  // kind parameter suffixes that are placed at the end of PDT instance
>> > names.
>> > +  if ( !(strncmp(&(t2->name[3]), t1->name, strlen(t1->name)) == 0) )
>> > +return false;
>> > +
>> > +  return true;
>> > +}
>> > +
>> >
>> >   /* Check if two typespecs are type compatible (F03:5.1.1.2):
>> >  If ts1 is nonpolymorphic, ts2 must be the same type.
>>
>> The following testcase tests for errors.  I tried Intel and NAG on it
>> after commenting out the 'contains' section of the type declaration.
>> Both complained about subroutine deferred_len_param, e.g.
>>
>> Intel:
>> A colon may only be used as a type parameter value in the declaration of
>> an object that has the POINTER or ALLOCATABLE attribute.   [THIS]
>>  class(param_deriv_type(:)), intent(inout) :: this
>>
>> NAG:
>> Entity THIS of type PARAM_DERIV_TYPE(A=:) has a deferred length type
>> parameter but is not a data pointer or allocatable
>>
>> Do we 

Re: [PATCH v2] RISC-V: Introduce gcc option mrvv-vector-bits for RVV

2024-02-27 Thread Kito Cheng
Keep SCALABLE, since it has different semantics with ZVL:

-mrvv-vector-bits=scalable means zvl*b specifies the minimal VLEN
-mrvv-vector-bits=zvl means zvl*b specifies the exact VLEN

What's the difference exactly?

-mrvv-vector-bits=scalable with zvl128b can run on any machine with VLEN >= 128
-mrvv-vector-bits=zvl with zvl128b can *only* run on a machine with
VLEN == 128

Sizeof vint32m1_t is VLEN under -mrvv-vector-bits=scalable with zvl128b
Sizeof vint32m1_t is 128 under -mrvv-vector-bits=zvl with zvl128b

Give more practical example on codegen with clang:
https://godbolt.org/z/vhdnGvK37

So it is more like an alias of
--param=riscv-autovec-preference=fixed-vlmax/--param=riscv-autovec-preference=scalable.

On Wed, Feb 28, 2024 at 12:17 PM  wrote:
>
> From: Pan Li 
>
> This patch would like to introduce one new gcc option for RVV. To
> appoint the bits size of one RVV vector register. Valid arguments to
> '-mrvv-vector-bits=' are:
>
> * zvl
>
> The zvl will pick up the zvl*b from the march option. For example,
> the mrvv-vector-bits will be 1024 when march=rv64gcv_zvl1024b.
>
> The below test are passed for this patch.
>
> * The riscv fully regression test.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-opts.h (enum rvv_vector_bits_enum): New enum for
> different RVV vector bits.
> * config/riscv/riscv.cc (riscv_convert_vector_bits): New func to
> get the RVV vector bits, with given min_vlen.
> (riscv_convert_vector_chunks): Combine the mrvv-vector-bits
> option with min_vlen to RVV vector chunks.
> (riscv_override_options_internal): Update comments and rename the
> vector chunks.
> * config/riscv/riscv.opt: Add option mrvv-vector-bits.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/base/rvv-vector-bits-1.c: New test.
> * gcc.target/riscv/rvv/base/rvv-vector-bits-2.c: New test.
> * gcc.target/riscv/rvv/base/rvv-vector-bits-3.c: New test.
>
> Signed-off-by: Pan Li 
> ---
>  gcc/config/riscv/riscv-opts.h |  7 +
>  gcc/config/riscv/riscv.cc | 31 +++
>  gcc/config/riscv/riscv.opt| 11 +++
>  .../riscv/rvv/base/rvv-vector-bits-1.c|  7 +
>  .../riscv/rvv/base/rvv-vector-bits-2.c|  7 +
>  .../riscv/rvv/base/rvv-vector-bits-3.c| 25 +++
>  6 files changed, 82 insertions(+), 6 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-1.c
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-2.c
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-3.c
>
> diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
> index 4edddbadc37..0162e00515b 100644
> --- a/gcc/config/riscv/riscv-opts.h
> +++ b/gcc/config/riscv/riscv-opts.h
> @@ -129,6 +129,13 @@ enum vsetvl_strategy_enum {
>VSETVL_OPT_NO_FUSION,
>  };
>
> +/* RVV vector bits for option -mrvv-vector-bits
> +   zvl indicates take the bits of zvl*b provided by march as vector bits.
> + */
> +enum rvv_vector_bits_enum {
> +  RVV_VECTOR_BITS_ZVL,
> +};
> +
>  #define TARGET_ZICOND_LIKE (TARGET_ZICOND || (TARGET_XVENTANACONDOPS && 
> TARGET_64BIT))
>
>  /* Bit of riscv_zvl_flags will set contintuly, N-1 bit will set if N-bit is
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 5e984ee2a55..d18e5226bce 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -8801,13 +8801,32 @@ riscv_init_machine_status (void)
>return ggc_cleared_alloc<machine_function> ();
>  }
>
> -/* Return the VLEN value associated with -march.
> +static int
> +riscv_convert_vector_bits (int min_vlen)
> +{
> +  int rvv_bits = 0;
> +
> +  switch (rvv_vector_bits)
> +{
> +  case RVV_VECTOR_BITS_ZVL:
> +   rvv_bits = min_vlen;
> +   break;
> +  default:
> +   gcc_unreachable ();
> +}
> +
> +  return rvv_bits;
> +}
> +
> +/* Return the VLEN value associated with -march and -mrvv-vector-bits.
> TODO: So far we only support length-agnostic value. */
>  static poly_uint16
> -riscv_convert_vector_bits (struct gcc_options *opts)
> +riscv_convert_vector_chunks (struct gcc_options *opts)
>  {
>int chunk_num;
>int min_vlen = TARGET_MIN_VLEN_OPTS (opts);
> +  int rvv_bits = riscv_convert_vector_bits (min_vlen);
> +
>if (min_vlen > 32)
>  {
>/* When targetting minimum VLEN > 32, we should use 64-bit chunk size.
> @@ -8826,7 +8845,7 @@ riscv_convert_vector_bits (struct gcc_options *opts)
>- TARGET_MIN_VLEN = 2048bit: [256,256]
>- TARGET_MIN_VLEN = 4096bit: [512,512]
>FIXME: We currently DON'T support TARGET_MIN_VLEN > 4096bit.  */
> -  chunk_num = min_vlen / 64;
> +  chunk_num = rvv_bits / 64;
>  }
>else
>  {
> @@ -8848,7 +8867,7 @@ riscv_convert_vector_bits (struct gcc_options *opts)
>if (TARGET_VECTOR_OPTS_P (opts))
>  {
>if 

RE: [PATCH v2] DSE: Bugfix ICE after allow vector type in get_stored_val

2024-02-27 Thread Li, Pan2
>   if (!targetm.modes_tieable_p (src_int_mode, src_mode))
> return NULL_RTX;
>   if (!targetm.modes_tieable_p (int_mode, mode))
> return NULL_RTX;

Yes, it will return NULL_RTX in the first if, given src_int_mode is E_DImode 
while src_mode is 
E_V2SFmode and mode is E_V4QImode. extract_low_bits converts the modes 
E_V2SFmode/E_V4QImode 
to E_DImode/E_SImode in advance, before the tieable check, validate_subreg and 
gen_lowpart.

Not sure if my understanding is correct, but it looks like extract_low_bits cannot 
take care of vector modes 
up to a point, because vector modes are always untieable to their int modes, and 
it then returns NULL_RTX.

Pan

-Original Message-
From: Li, Pan2 
Sent: Wednesday, February 28, 2024 9:41 AM
To: Jeff Law ; gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; 
Wang, Yanzhang ; rdapp@gmail.com; Liu, Hongtao 

Subject: RE: [PATCH v2] DSE: Bugfix ICE after allow vector type in 
get_stored_val

> Pan, can you confirm what path we take through extract_low_bits?

Thanks Jeff for comments, will have a try soon and keep you posted.

Pan

-Original Message-
From: Jeff Law  
Sent: Tuesday, February 27, 2024 11:03 PM
To: Li, Pan2 ; gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; 
Wang, Yanzhang ; rdapp@gmail.com; Liu, Hongtao 

Subject: Re: [PATCH v2] DSE: Bugfix ICE after allow vector type in 
get_stored_val



On 2/26/24 07:22, pan2...@intel.com wrote:
> From: Pan Li 
> 
> We allowed vector type for get_stored_val when read is less than or
> equal to store in previous.  Unfortunately, we missed to adjust the
> validate_subreg part accordingly.  When the vector type's size is
> less than vector register, it will be considered as invalid in the
> validate_subreg.
> 
> Considering that validate_subreg is kind of a can of worms and we are
> in stage 4, we will fix the issue from the DSE side, and make sure
> the subreg is valid for both the read_mode and store_mode before
> performing the real gen_lowpart.
> 
> The below test are passed for this patch:
> 
> * The x86 bootstrap test.
> * The x86 regression test.
> * The riscv regression test.
> * The aarch64 regression test.
> 
> gcc/ChangeLog:
> 
>   * dse.cc (get_stored_val): Add validate_subreg check before
>   perform the gen_lowpart for rtl.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/tree-ssa/ssa-fre-44.c: Add compile option to trigger
>   the ICE.
>   * gcc.target/riscv/rvv/base/bug-6.c: New test.
> 
> Signed-off-by: Pan Li 
> ---
>   gcc/dse.cc|  4 +++-
>   gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c|  2 +-
>   .../gcc.target/riscv/rvv/base/bug-6.c | 22 +++
>   3 files changed, 26 insertions(+), 2 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
> 
> diff --git a/gcc/dse.cc b/gcc/dse.cc
> index edc7a1dfecf..1596da91da0 100644
> --- a/gcc/dse.cc
> +++ b/gcc/dse.cc
> @@ -1946,7 +1946,9 @@ get_stored_val (store_info *store_info, machine_mode 
> read_mode,
>copy_rtx (store_info->const_rhs));
> else if (VECTOR_MODE_P (read_mode) && VECTOR_MODE_P (store_mode)
>   && known_le (GET_MODE_BITSIZE (read_mode), GET_MODE_BITSIZE 
> (store_mode))
> -&& targetm.modes_tieable_p (read_mode, store_mode))
> +&& targetm.modes_tieable_p (read_mode, store_mode)
> +&& validate_subreg (read_mode, store_mode, copy_rtx (store_info->rhs),
> + subreg_lowpart_offset (read_mode, store_mode)))
>   read_reg = gen_lowpart (read_mode, copy_rtx (store_info->rhs));
> else
>   read_reg = extract_low_bits (read_mode, store_mode,

So we're just changing whether or not we call gen_lowpart directly or go 
through extract_low_bits, which may in turn generate subreg, call 
gen_lowpart itself and a few other things.

I'm guessing that extract_low_bits is going to return NULL in this case 
via this code (specifically the second test).

>   if (!targetm.modes_tieable_p (src_int_mode, src_mode))
> return NULL_RTX;
>   if (!targetm.modes_tieable_p (int_mode, mode))
> return NULL_RTX;


Pan, can you confirm what path we take through extract_low_bits?

One might argue that we should just call into extract_low_bits 
unconditionally since it'll ultimately call gen_lowpart when it safely 
can.  The downside is that's a bigger change than I'd like at this stage 
in our development cycle.

I wouldn't be surprised if other direct uses of gen_lowpart have similar 
problems.





> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> index f79b4c142ae..624a00a4f32 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> @@ -1,5 +1,5 @@
>   /* { dg-do compile } */
> -/* { dg-options "-O -fdump-tree-fre1" } */
> +/* { dg-options "-O -fdump-tree-fre1 -O3 

[PATCH v2] RISC-V: Introduce gcc option mrvv-vector-bits for RVV

2024-02-27 Thread pan2 . li
From: Pan Li 

This patch would like to introduce one new gcc option for RVV. To
appoint the bits size of one RVV vector register. Valid arguments to
'-mrvv-vector-bits=' are:

* zvl

The zvl will pick up the zvl*b from the march option. For example,
the mrvv-vector-bits will be 1024 when march=rv64gcv_zvl1024b.

The below test are passed for this patch.

* The riscv fully regression test.

gcc/ChangeLog:

* config/riscv/riscv-opts.h (enum rvv_vector_bits_enum): New enum for
different RVV vector bits.
* config/riscv/riscv.cc (riscv_convert_vector_bits): New func to
get the RVV vector bits, with given min_vlen.
(riscv_convert_vector_chunks): Combine the mrvv-vector-bits
option with min_vlen to RVV vector chunks.
(riscv_override_options_internal): Update comments and rename the
vector chunks.
* config/riscv/riscv.opt: Add option mrvv-vector-bits.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/base/rvv-vector-bits-1.c: New test.
* gcc.target/riscv/rvv/base/rvv-vector-bits-2.c: New test.
* gcc.target/riscv/rvv/base/rvv-vector-bits-3.c: New test.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/riscv-opts.h |  7 +
 gcc/config/riscv/riscv.cc | 31 +++
 gcc/config/riscv/riscv.opt| 11 +++
 .../riscv/rvv/base/rvv-vector-bits-1.c|  7 +
 .../riscv/rvv/base/rvv-vector-bits-2.c|  7 +
 .../riscv/rvv/base/rvv-vector-bits-3.c| 25 +++
 6 files changed, 82 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-2.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/rvv-vector-bits-3.c

diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h
index 4edddbadc37..0162e00515b 100644
--- a/gcc/config/riscv/riscv-opts.h
+++ b/gcc/config/riscv/riscv-opts.h
@@ -129,6 +129,13 @@ enum vsetvl_strategy_enum {
   VSETVL_OPT_NO_FUSION,
 };
 
+/* RVV vector bits for option -mrvv-vector-bits
+   zvl indicates take the bits of zvl*b provided by march as vector bits.
+ */
+enum rvv_vector_bits_enum {
+  RVV_VECTOR_BITS_ZVL,
+};
+
 #define TARGET_ZICOND_LIKE (TARGET_ZICOND || (TARGET_XVENTANACONDOPS && 
TARGET_64BIT))
 
 /* Bit of riscv_zvl_flags will set contintuly, N-1 bit will set if N-bit is
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 5e984ee2a55..d18e5226bce 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -8801,13 +8801,32 @@ riscv_init_machine_status (void)
   return ggc_cleared_alloc<machine_function> ();
 }
 
-/* Return the VLEN value associated with -march.
+static int
+riscv_convert_vector_bits (int min_vlen)
+{
+  int rvv_bits = 0;
+
+  switch (rvv_vector_bits)
+{
+  case RVV_VECTOR_BITS_ZVL:
+   rvv_bits = min_vlen;
+   break;
+  default:
+   gcc_unreachable ();
+}
+
+  return rvv_bits;
+}
+
+/* Return the VLEN value associated with -march and -mrvv-vector-bits.
TODO: So far we only support length-agnostic value. */
 static poly_uint16
-riscv_convert_vector_bits (struct gcc_options *opts)
+riscv_convert_vector_chunks (struct gcc_options *opts)
 {
   int chunk_num;
   int min_vlen = TARGET_MIN_VLEN_OPTS (opts);
+  int rvv_bits = riscv_convert_vector_bits (min_vlen);
+
   if (min_vlen > 32)
 {
   /* When targetting minimum VLEN > 32, we should use 64-bit chunk size.
@@ -8826,7 +8845,7 @@ riscv_convert_vector_bits (struct gcc_options *opts)
   - TARGET_MIN_VLEN = 2048bit: [256,256]
   - TARGET_MIN_VLEN = 4096bit: [512,512]
   FIXME: We currently DON'T support TARGET_MIN_VLEN > 4096bit.  */
-  chunk_num = min_vlen / 64;
+  chunk_num = rvv_bits / 64;
 }
   else
 {
@@ -8848,7 +8867,7 @@ riscv_convert_vector_bits (struct gcc_options *opts)
   if (TARGET_VECTOR_OPTS_P (opts))
 {
   if (opts->x_riscv_autovec_preference == RVV_FIXED_VLMAX)
-   return (int) min_vlen / (riscv_bytes_per_vector_chunk * 8);
+   return (int) rvv_bits / (riscv_bytes_per_vector_chunk * 8);
   else
return poly_uint16 (chunk_num, chunk_num);
 }
@@ -8920,8 +8939,8 @@ riscv_override_options_internal (struct gcc_options *opts)
   if (TARGET_VECTOR && TARGET_BIG_ENDIAN)
 sorry ("Current RISC-V GCC does not support RVV in big-endian mode");
 
-  /* Convert -march to a chunks count.  */
-  riscv_vector_chunks = riscv_convert_vector_bits (opts);
+  /* Convert -march and -mrvv-vector-bits to a chunks count.  */
+  riscv_vector_chunks = riscv_convert_vector_chunks (opts);
 }
 
 /* Implement TARGET_OPTION_OVERRIDE.  */
diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt
index 20685c42aed..42ea8efd05d 100644
--- a/gcc/config/riscv/riscv.opt
+++ b/gcc/config/riscv/riscv.opt
@@ -607,3 +607,14 @@ Enum(stringop_strategy) String(vector) 

[PATCH v3] c++/modules: Support lambdas attached to more places in modules [PR111710]

2024-02-27 Thread Nathaniel Shead
On Tue, Feb 27, 2024 at 11:59:46AM -0500, Patrick Palka wrote:
> On Fri, 16 Feb 2024, Nathaniel Shead wrote:
> 
> > On Tue, Feb 13, 2024 at 07:52:01PM -0500, Jason Merrill wrote:
> > > On 2/10/24 17:57, Nathaniel Shead wrote:
> > > > The fix for PR107398 weakened the restrictions that lambdas must belong
> > > > to namespace scope. However this was not sufficient: we also need to
> > > > allow lambdas keyed to FIELD_DECLs or PARM_DECLs.
> > > 
> > > I wonder about keying such lambdas to the class and function, 
> > > respectively,
> > > rather than specifically to the field or parameter, but I suppose it 
> > > doesn't
> > > matter.
> > 
> > I did some more testing and realised my testcase didn't properly
> > exercise whether I'd properly deduplicated or not, and an improved
> > testcase proved that actually keying to the field rather than the class
> > did cause issues. (Parameter vs. function doesn't seem to have mattered
> > however.)
> > 
> > Here's an updated patch that fixes this, and includes the changes for
> > lambdas in base classes that I'd had as a separate patch earlier. I've
> > also added some concepts testcases just in case.
> > 
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?
> > 
> > -- >8 --
> > 
> > The fix for PR107398 weakened the restrictions that lambdas must belong
> > to namespace scope. However this was not sufficient: we also need to
> > allow lambdas attached to FIELD_DECLs, PARM_DECLs, and TYPE_DECLs.
> > 
> > For field decls we key the lambda to its class rather than the field
> > itself. This avoids some errors with deduplicating fields.
> > 
> > Additionally, by [basic.link] p15.2 a lambda defined anywhere in a
> > class-specifier should not be TU-local, which includes base-class
> > declarations, so ensure that lambdas declared there are keyed
> > appropriately as well.
> > 
> > Because this now requires 'DECL_MODULE_KEYED_DECLS_P' to be checked on a
> > fairly large number of different kinds of DECLs, and that in general
> > it's safe to just get 'false' as a result of a check on an unexpected
> > DECL type, this patch also removes the tree checking from the accessor.
> > 
> > Finally, to handle deduplicating templated lambda fields, we need to
> > ensure that we can determine that two lambdas from different field decls
> > match. The modules code does not attempt to deduplicate expression
> > nodes, which causes issues as the LAMBDA_EXPRs are then considered to be
> > different. However, rather than checking the LAMBDA_EXPR directly we can
> > instead check its type: the generated RECORD_TYPE for a LAMBDA_EXPR must
> > also be unique, and /is/ deduplicated on import, so we can just check
> > for that instead.
> 
> We probably should be deduplicating LAMBDA_EXPR on stream-in, perhaps
> something like
> 
> diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
> index e8eabb1f6f9..1b2ba2e0fa8 100644
> --- a/gcc/cp/module.cc
> +++ b/gcc/cp/module.cc
> @@ -9183,6 +9183,13 @@ trees_in::tree_value ()
>return NULL_TREE;
>  }
>  
> +  if (TREE_CODE (t) == LAMBDA_EXPR
> +  && CLASSTYPE_LAMBDA_EXPR (TREE_TYPE (t)))
> +{
> +  existing = CLASSTYPE_LAMBDA_EXPR (TREE_TYPE (t));
> +  back_refs[~tag] = existing;
> +}
> +
>dump (dumper::TREE) && dump ("Read tree:%d %C:%N", tag, TREE_CODE (t), t);
>  
>if (TREE_CODE (existing) == INTEGER_CST && !TREE_OVERFLOW (existing))
> 
> would suffice?  If not we probably need to take inspiration from the
> TREE_BINFO streaming, and handle LAMBDA_EXPR similarly..
> 

Ah yup, right, that makes more sense. Your suggestion seems to work,
thanks! Here's an updated patch.

Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

-- >8 --

The fix for PR107398 weakened the restrictions that lambdas must belong
to namespace scope. However this was not sufficient: we also need to
allow lambdas attached to FIELD_DECLs, PARM_DECLs, and TYPE_DECLs.

For field decls we key the lambda to its class rather than the field
itself. This avoids some errors with deduplicating fields.

Additionally, by [basic.link] p15.2 a lambda defined anywhere in a
class-specifier should not be TU-local, which includes base-class
declarations, so ensure that lambdas declared there are keyed
appropriately as well.

Because this now requires 'DECL_MODULE_KEYED_DECLS_P' to be checked on a
fairly large number of different kinds of DECLs, and that in general
it's safe to just get 'false' as a result of a check on an unexpected
DECL type, this patch also removes the tree checking from the accessor.

Finally, to handle deduplicating templated lambda fields, we need to
ensure that we can determine that two lambdas from different field decls
match, so we ensure that we deduplicate LAMBDA_EXPRs on stream in.

PR c++/111710

gcc/cp/ChangeLog:

* cp-tree.h (DECL_MODULE_KEYED_DECLS_P): Remove tree checking.
(struct lang_decl_base): Update comments and fix whitespace.
* module.cc 

Re: [PATCH] Fortran testsuite: fix invalid Fortran in testcase

2024-02-27 Thread Jerry D

On 2/27/24 1:00 PM, Harald Anlauf wrote:

Dear all,

the attached patch fixes invalid Fortran in testcase
gfortran.dg/pr101026.f, which might prohibit progress
in fixing pr111781.  (Note that the testcase was for a
tree-optimizer issue, not the Fortran frontend.)

OK for mainline?

Will commit within 24h unless there are comments.

Thanks,
Harald



OK, simple.


Re: [PATCH v1 08/13] aarch64: Add Cygwin and MinGW environments for AArch64

2024-02-27 Thread NightStrike
> -Original Message-
> Friday, February 23, 2024 6:16 PM
> Richard Sandiford wrote:
>
> > +
> > +#undef TARGET_SEH
> > +#define TARGET_SEH 0
> > +
> > +#define SSE_REGNO_P(N) 0
> > +#define GENERAL_REGNO_P(N) 0
>
> Could you add a comment to explain how these two macros are consumed?
> What is the effect of saying that everything is neither a general register 
> nor an SSE register?
>
> > +#define SEH_MAX_FRAME_SIZE 0

On Tue, Feb 27, 2024 at 4:17 PM Evgeny Karpov
 wrote:
>
> SEH is not implemented yet and needs to be disabled in mingw/winnt.cc. 
> Disabling every SEH function that uses references to these macros might 
> trigger significant refactoring, and to avoid this, required macros are 
> defined with 0. It is needed only for compilation. A comment with an 
> explanation will be added.

This is all the more reason why you need to show the full testsuite
run for x86/x64 mingw as per my previous message.  We do use SEH by
default on x64.

Also, this is a friendly reminder that the GCC list is a bottom
posting list, not top posting, which is the default for your email
client.  I fixed this email for you.


RE: [PATCH v2] DSE: Bugfix ICE after allow vector type in get_stored_val

2024-02-27 Thread Li, Pan2
> Pan, can you confirm what path we take through extract_low_bits?

Thanks Jeff for comments, will have a try soon and keep you posted.

Pan

-Original Message-
From: Jeff Law  
Sent: Tuesday, February 27, 2024 11:03 PM
To: Li, Pan2 ; gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; richard.guent...@gmail.com; 
Wang, Yanzhang ; rdapp@gmail.com; Liu, Hongtao 

Subject: Re: [PATCH v2] DSE: Bugfix ICE after allow vector type in 
get_stored_val



On 2/26/24 07:22, pan2...@intel.com wrote:
> From: Pan Li 
> 
> We allowed vector type for get_stored_val when read is less than or
> equal to store in previous.  Unfortunately, we missed to adjust the
> validate_subreg part accordingly.  When the vector type's size is
> less than vector register, it will be considered as invalid in the
> validate_subreg.
> 
> Considering that validate_subreg is a can of worms and we are in
> stage 4, we will fix the issue from the DSE side, and make sure
> the subreg is valid for both the read_mode and store_mode before
> performing the real gen_lowpart.
> 
> The below test are passed for this patch:
> 
> * The x86 bootstrap test.
> * The x86 regression test.
> * The riscv regression test.
> * The aarch64 regression test.
> 
> gcc/ChangeLog:
> 
>   * dse.cc (get_stored_val): Add validate_subreg check before
>   perform the gen_lowpart for rtl.
> 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.dg/tree-ssa/ssa-fre-44.c: Add compile option to trigger
>   the ICE.
>   * gcc.target/riscv/rvv/base/bug-6.c: New test.
> 
> Signed-off-by: Pan Li 
> ---
>   gcc/dse.cc|  4 +++-
>   gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c|  2 +-
>   .../gcc.target/riscv/rvv/base/bug-6.c | 22 +++
>   3 files changed, 26 insertions(+), 2 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c
> 
> diff --git a/gcc/dse.cc b/gcc/dse.cc
> index edc7a1dfecf..1596da91da0 100644
> --- a/gcc/dse.cc
> +++ b/gcc/dse.cc
> @@ -1946,7 +1946,9 @@ get_stored_val (store_info *store_info, machine_mode 
> read_mode,
>copy_rtx (store_info->const_rhs));
> else if (VECTOR_MODE_P (read_mode) && VECTOR_MODE_P (store_mode)
>   && known_le (GET_MODE_BITSIZE (read_mode), GET_MODE_BITSIZE 
> (store_mode))
> -&& targetm.modes_tieable_p (read_mode, store_mode))
> +&& targetm.modes_tieable_p (read_mode, store_mode)
> +&& validate_subreg (read_mode, store_mode, copy_rtx (store_info->rhs),
> + subreg_lowpart_offset (read_mode, store_mode)))
>   read_reg = gen_lowpart (read_mode, copy_rtx (store_info->rhs));
> else
>   read_reg = extract_low_bits (read_mode, store_mode,

So we're just changing whether or not we call gen_lowpart directly or go 
through extract_low_bits, which may in turn generate subreg, call 
gen_lowpart itself and a few other things.

I'm guessing that extract_low_bits is going to return NULL in this case 
via this code (specifically the second test).

>   if (!targetm.modes_tieable_p (src_int_mode, src_mode))
> return NULL_RTX;
>   if (!targetm.modes_tieable_p (int_mode, mode))
> return NULL_RTX;


Pan, can you confirm what path we take through extract_low_bits?

One might argue that we should just call into extract_low_bits 
unconditionally since it'll ultimately call gen_lowpart when it safely 
can.  The downside is that's a bigger change than I'd like at this stage 
in our development cycle.

I wouldn't be surprised if other direct uses of gen_lowpart have similar 
problems.





> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> index f79b4c142ae..624a00a4f32 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
> @@ -1,5 +1,5 @@
>   /* { dg-do compile } */
> -/* { dg-options "-O -fdump-tree-fre1" } */
> +/* { dg-options "-O -fdump-tree-fre1 -O3 -ftree-vectorize" } */
>   
>   struct A { float x, y; };
>   struct B { struct A u; };
So this may compromise the original intent of this test.  What I would 
suggest instead is to create a new test with the dg-do & dg-options you 
want with a #include "ssa-fre-44.c".

So to move forward.  Let's confirm the path we take through 
extract_low_bits matches expectations and fixup the testsuite change.

Jeff


[PATCH v2] C/C++: add hints for strerror

2024-02-27 Thread Oskari Pirhonen
Add proper hints for implicit declaration of strerror.

The results could be confusing depending on the other included headers.
These example messages are from compiling a trivial program to print the
string for an errno value. It only includes stdio.h (cstdio for C++).

Before:
$ /tmp/gcc-master/bin/gcc test.c -o test_c
test.c: In function ‘main’:
test.c:4:20: warning: implicit declaration of function ‘strerror’; did you mean 
‘perror’? [-Wimplicit-function-declaration]
4 | printf("%s\n", strerror(0));
  |^~~~
  |perror

$ /tmp/gcc-master/bin/g++ test.cpp -o test_cpp
test.cpp: In function ‘int main()’:
test.cpp:4:20: error: ‘strerror’ was not declared in this scope; did you mean 
‘stderr’?
4 | printf("%s\n", strerror(0));
  |^~~~
  |stderr

After:
$ /tmp/gcc-known-headers/bin/gcc test.c -o test_c
test.c: In function ‘main’:
test.c:4:20: warning: implicit declaration of function ‘strerror’ 
[-Wimplicit-function-declaration]
4 | printf("%s\n", strerror(0));
  |^~~~
test.c:2:1: note: ‘strerror’ is defined in header ‘<string.h>’; this is 
probably fixable by adding ‘#include <string.h>’
1 | #include <stdio.h>
  +++ |+#include <string.h>
2 |

$ /tmp/gcc-known-headers/bin/g++ test.cpp -o test_cpp
test.cpp: In function ‘int main()’:
test.cpp:4:20: error: ‘strerror’ was not declared in this scope
4 | printf("%s\n", strerror(0));
  |^~~~
test.cpp:2:1: note: ‘strerror’ is defined in header ‘<cstring>’; this is 
probably fixable by adding ‘#include <cstring>’
1 | #include <cstdio>
  +++ |+#include <cstring>
2 |

gcc/c-family/ChangeLog:

* known-headers.cc (get_stdlib_header_for_name): Add strerror.

gcc/testsuite/ChangeLog:

* g++.dg/spellcheck-stdlib.C: Add check for strerror.
* gcc.dg/spellcheck-stdlib-2.c: New test.

Signed-off-by: Oskari Pirhonen 
---
v2:
- check for error instead of warning in gcc.dg/spellcheck-stdlib-2.c
- from linaro ci notification email

 gcc/c-family/known-headers.cc  | 1 +
 gcc/testsuite/g++.dg/spellcheck-stdlib.C   | 2 ++
 gcc/testsuite/gcc.dg/spellcheck-stdlib-2.c | 8 
 3 files changed, 11 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/spellcheck-stdlib-2.c

diff --git a/gcc/c-family/known-headers.cc b/gcc/c-family/known-headers.cc
index dbc42eacde1..871fd714eb5 100644
--- a/gcc/c-family/known-headers.cc
+++ b/gcc/c-family/known-headers.cc
@@ -182,6 +182,7 @@ get_stdlib_header_for_name (const char *name, enum stdlib 
lib)
 {"strchr", {"", ""} },
 {"strcmp", {"", ""} },
 {"strcpy", {"", ""} },
+{"strerror", {"", ""} },
 {"strlen", {"", ""} },
 {"strncat", {"", ""} },
 {"strncmp", {"", ""} },
diff --git a/gcc/testsuite/g++.dg/spellcheck-stdlib.C 
b/gcc/testsuite/g++.dg/spellcheck-stdlib.C
index fd0f3a9b8c9..33718b8034e 100644
--- a/gcc/testsuite/g++.dg/spellcheck-stdlib.C
+++ b/gcc/testsuite/g++.dg/spellcheck-stdlib.C
@@ -104,6 +104,8 @@ void test_cstring (char *dest, char *src)
   // { dg-message "'#include '" "" { target *-*-* } .-1 }
   strcpy(dest, "test"); // { dg-error "was not declared" }
   // { dg-message "'#include '" "" { target *-*-* } .-1 }
+  strerror(0); // { dg-error "was not declared" }
+  // { dg-message "'#include '" "" { target *-*-* } .-1 }
   strlen("test"); // { dg-error "was not declared" }
   // { dg-message "'#include '" "" { target *-*-* } .-1 }
   strncat(dest, "test", 3); // { dg-error "was not declared" }
diff --git a/gcc/testsuite/gcc.dg/spellcheck-stdlib-2.c 
b/gcc/testsuite/gcc.dg/spellcheck-stdlib-2.c
new file mode 100644
index 000..4762e2ddbbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/spellcheck-stdlib-2.c
@@ -0,0 +1,8 @@
+/* { dg-options "-Wimplicit-function-declaration" } */
+
+/* Missing .  */
+void test_string_h (void)
+{
+  strerror (0); /* { dg-error "implicit declaration of function 'strerror'" } 
*/
+  /* { dg-message "'strerror' is defined in header ''" "" { target 
*-*-* } .-1 } */
+}
-- 
2.43.0



Re: [PATCH v1 08/13] aarch64: Add Cygwin and MinGW environments for AArch64

2024-02-27 Thread Andrew Pinski
On Tue, Feb 27, 2024 at 1:18 PM Evgeny Karpov
 wrote:
>
> SEH is not implemented yet and needs to be disabled in mingw/winnt.cc. 
> Disabling every SEH function that uses references to these macros might 
> trigger significant refactoring, and to avoid this, required macros are 
> defined with 0. It is needed only for compilation. A comment with an 
> explanation will be added.

What does this mean with respect to C++ exceptions? Are you using SJLJ
exception support, or the DWARF unwinding ones without SEH support?
I am not sure if SJLJ exceptions are well tested any more in GCC either.

Also I have a question if you ran the full GCC/G++ testsuites and what
were the results?
If you did run it, did you use a cross compiler or the native
compiler? Did you do a bootstrap (GCC uses C++ but no exceptions
though)?
If you run using a cross compiler, did you use ssh or some other route
to run the applications?

Thanks,
Andrew Pinski

>
> It looks like IL32P64 works. It has been tested on OpenSSL, OpenBLAS, 
> libjpeg-turbo, and FFmpeg packages. No issues have been detected with it.
>
> Correct, stack checking cannot be explicitly disabled by the user. It will be 
> interesting to know cases when it is needed. GCC uses stack probing only when 
> the stack size is exceeded; size optimization is not an option then.
>
> Regards,
> Evgeny
>
>
> -Original Message-
> Friday, February 23, 2024 6:16 PM
> Richard Sandiford wrote:
>
> > +
> > +#undef TARGET_SEH
> > +#define TARGET_SEH 0
> > +
> > +#define SSE_REGNO_P(N) 0
> > +#define GENERAL_REGNO_P(N) 0
>
> Could you add a comment to explain how these two macros are consumed?
> What is the effect of saying that everything is neither a general register 
> nor an SSE register?
>
> > +#define SEH_MAX_FRAME_SIZE 0
>
>
> > +/* Windows64 continues to use a 32-bit long type.  */ #undef
> > +LONG_TYPE_SIZE #define LONG_TYPE_SIZE 32
>
> Just curious: this is AFAIK the first IL32P64 ABI for AArch64.
> Do things Just Work, including for things like arm_neon.h and other ACLE 
> header files?  I'm pleasantly surprised if so :)
>
> I suppose this is more of a generic mingw/cygwin question, but does this mean 
> that stack checking cannot be explicitly disabled by a user who "knows what 
> they are doing"?
>
> Thanks,
> Richard
>


Rejects ASSOCIATE and a complex part%ref when target is a function

2024-02-27 Thread Steve Kargl
All,

Consider,

! { dg-do run }
program foo
   implicit none
   real y
   associate (x => log(cmplx(-1,0)))
  y = x%im
  if (int(100*y)-314 /= 0) stop 1
   end associate
end program

% gfcx -c a.f90
a.f90:6:13:

6 |   y = x%im
  | 1
Error: Symbol 'x' at (1) has no IMPLICIT type


'x' has the type of the selector, which is COMPLEX.
I have created the following bug report

https://gcc.gnu.org/pipermail/gcc-bugs/2024-February/855452.html

and attached a patch that fixes the problem.  The patch has been
regression tested against x86_64-*-freebsd.  Please commit.

-- 
Steve


Re: [PATCH] RISC-V: Update test expectancies with recent scheduler change

2024-02-27 Thread Palmer Dabbelt

On Tue, 27 Feb 2024 15:53:19 PST (-0800), jeffreya...@gmail.com wrote:



On 2/27/24 15:56, 钟居哲 wrote:

 >> I don't think it's that simple.  On some uarchs vsetvls are nearly free

while on others they can be fairly expensive.  It's not clear (to me)
yet if one approach or the other is going to be the more common.


That's uarch dependent which is not the stuff I am talking about.
What's I want to say is that this patch breaks those testcases I added 
for VSETVL PASS testing.

And those testcases are uarch independent.
No, uarch impacts things like latency, which in turn impacts scheduling, 
which in turn impacts vsetvl generation/optimization.


Ya, and I think that's just what's expected for this sort of approach.  
Edwin and I were working through that possibility in the office earlier, 
but we didn't have the code up.  So I figured I'd just go through one in 
more detail to see if what we were talking about was sane.  Grabbing 
some arbitrary function in the changed set:


   void
   test_vbool1_then_vbool64(int8_t * restrict in, int8_t * restrict out) {
   vbool1_t v1 = *(vbool1_t*)in;
   vbool64_t v2 = *(vbool64_t*)in;
   
   *(vbool1_t*)(out + 100) = v1;

   *(vbool64_t*)(out + 200) = v2;
   }

we currently get (from generic-ooo)

   test_vbool1_then_vbool64:
   vsetvli a4,zero,e8,m8,ta,ma
   vlm.v   v2,0(a0)
   vsetvli a5,zero,e8,mf8,ta,ma
   vlm.v   v1,0(a0)
   addi    a3,a1,100
   vsetvli a4,zero,e8,m8,ta,ma
   addi    a1,a1,200
   vsm.v   v2,0(a3)
   vsetvli a5,zero,e8,mf8,ta,ma
   vsm.v   v1,0(a1)
   ret

but we could generate correct code with 2, 3, or 4 vsetvli instructions 
depending on how things are scheduled.  For example, with 
-fno-schedule-insns I happen to get 3


   test_vbool1_then_vbool64:
   vsetvli a5,zero,e8,mf8,ta,ma
   vlm.v   v1,0(a0)
   vsetvli a4,zero,e8,m8,ta,ma
   vlm.v   v2,0(a0)
   addi    a3,a1,100
   addi    a1,a1,200
   vsm.v   v2,0(a3)
   vsetvli a5,zero,e8,mf8,ta,ma
   vsm.v   v1,0(a1)
   ret

because the load/store with the same vcfg end up scheduled back-to-back.  
I don't see any reason why something along the lines of


   test_vbool1_then_vbool64:
   vsetvli a4,zero,e8,m8,ta,ma
   vlm.v   v2,0(a0)
   addi    a3,a1,100
   vsm.v   v2,0(a3)
   vsetvli a5,zero,e8,mf8,ta,ma
   vlm.v   v1,0(a0)
   addi    a1,a1,200
   vsm.v   v1,0(a1)
   ret

wouldn't be correct (though I just reordered the loads/stores and then 
removed the redundant vsetvlis, so I might have some address calculation 
wrong in there).  The validity of removing a vsetvli depends on how the 
dependant instructions get scheduled, which is very much under the 
control of the pipeline model -- it's entirely possible the code with 
more vsetvlis is faster, if vsetvli is cheap and scheduling ends up 
hiding latency better.


So IMO it's completely reasonable to have vsetvli count ranges for a 
test like this.  I haven't looked at the others in any detail, but I 
remember seeing similar things elsewhere last time I was poking around 
these tests.  We should probably double-check all these and write some 
comments, just to make sure we're not missing any bugs, but I'd bet 
there's a bunch of valid testsuite changes.


Like we talked about in the call this morning we should probably make 
the tests more precise, but that's a larger effort.  After working 
through this I'm thinking it's a bit higher priority, though, as in this 
case the bounds are so wide we're not even really testing the pass any 
more.




jeff


Re: ping: [PATCH] libcpp: Fix __has_include_next ICE in the last directory of the path [PR80755]

2024-02-27 Thread Lewis Hyatt
Hello-

https://gcc.gnu.org/pipermail/gcc-patches/2023-December/641247.html

There was a request on the PR
(https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80755#c5) for me to ping
this again, so I am complying :). Might anyone have a minute to take a
look please? Thanks...


-Lewis


On Thu, Jan 11, 2024 at 7:34 AM Lewis Hyatt  wrote:
>
> Can I please ping this one? Thanks...
> https://gcc.gnu.org/pipermail/gcc-patches/2023-December/641247.html
>
> -Lewis
>
> On Thu, Dec 21, 2023 at 7:37 AM Lewis Hyatt  wrote:
> >
> > Hello-
> >
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80755
> >
> > Here is a short fix for the ICE in libcpp noted in the PR. Bootstrap +
> > regtest all languages on x86-64 Linux. Is it OK please? Thanks!
> >
> > -Lewis
> >
> > -- >8 --
> >
> > In libcpp/files.cc, the function _cpp_has_header(), which implements
> > __has_include and __has_include_next, does not check for a NULL return value
> > from search_path_head(), leading to an ICE tripping an assert when
> > _cpp_find_file() tries to use it. Fix it by checking for that case and
> > silently returning false instead.
> >
> > As suggested by the PR author, it is easiest to make a testcase by using
> > the -idirafter option. To enable that, also modify the dg-additional-options
> > testsuite procedure to make the global $srcdir available, since -idirafter
> > requires the full path.
> >
> > libcpp/ChangeLog:
> >
> > PR preprocessor/80755
> > * files.cc (search_path_head): Add SUPPRESS_DIAGNOSTIC argument
> > defaulting to false.
> > (_cpp_has_header): Silently return false if the search path has been
> > exhausted, rather than issuing a diagnostic and then hitting an
> > assert.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * lib/gcc-defs.exp (dg-additional-options): Make $srcdir usable in a
> > dg-additional-options directive.
> > * c-c++-common/cpp/has-include-next-2-dir/has-include-next-2.h: New 
> > test.
> > * c-c++-common/cpp/has-include-next-2.c: New test.
> > ---
> >  libcpp/files.cc  | 12 
> >  .../cpp/has-include-next-2-dir/has-include-next-2.h  |  3 +++
> >  gcc/testsuite/c-c++-common/cpp/has-include-next-2.c  |  4 
> >  gcc/testsuite/lib/gcc-defs.exp   |  1 +
> >  4 files changed, 16 insertions(+), 4 deletions(-)
> >  create mode 100644 
> > gcc/testsuite/c-c++-common/cpp/has-include-next-2-dir/has-include-next-2.h
> >  create mode 100644 gcc/testsuite/c-c++-common/cpp/has-include-next-2.c
> >
> > diff --git a/libcpp/files.cc b/libcpp/files.cc
> > index 27301d79fa4..aaab4b13a6a 100644
> > --- a/libcpp/files.cc
> > +++ b/libcpp/files.cc
> > @@ -181,7 +181,8 @@ static bool read_file_guts (cpp_reader *pfile, 
> > _cpp_file *file,
> >  static bool read_file (cpp_reader *pfile, _cpp_file *file,
> >location_t loc);
> >  static struct cpp_dir *search_path_head (cpp_reader *, const char *fname,
> > -int angle_brackets, enum include_type);
> > +int angle_brackets, enum 
> > include_type,
> > +bool suppress_diagnostic = false);
> >  static const char *dir_name_of_file (_cpp_file *file);
> >  static void open_file_failed (cpp_reader *pfile, _cpp_file *file, int,
> >   location_t);
> > @@ -1041,7 +1042,7 @@ _cpp_mark_file_once_only (cpp_reader *pfile, 
> > _cpp_file *file)
> > nothing left in the path, returns NULL.  */
> >  static struct cpp_dir *
> >  search_path_head (cpp_reader *pfile, const char *fname, int angle_brackets,
> > - enum include_type type)
> > + enum include_type type, bool suppress_diagnostic)
> >  {
> >cpp_dir *dir;
> >_cpp_file *file;
> > @@ -1070,7 +1071,7 @@ search_path_head (cpp_reader *pfile, const char 
> > *fname, int angle_brackets,
> >  return make_cpp_dir (pfile, dir_name_of_file (file),
> >  pfile->buffer ? pfile->buffer->sysp : 0);
> >
> > -  if (dir == NULL)
> > +  if (dir == NULL && !suppress_diagnostic)
> >  cpp_error (pfile, CPP_DL_ERROR,
> >"no include path in which to search for %s", fname);
> >
> > @@ -2164,7 +2165,10 @@ bool
> >  _cpp_has_header (cpp_reader *pfile, const char *fname, int angle_brackets,
> >  enum include_type type)
> >  {
> > -  cpp_dir *start_dir = search_path_head (pfile, fname, angle_brackets, 
> > type);
> > +  cpp_dir *start_dir = search_path_head (pfile, fname, angle_brackets, 
> > type,
> > +/* suppress_diagnostic = */ true);
> > +  if (!start_dir)
> > +return false;
> >_cpp_file *file = _cpp_find_file (pfile, fname, start_dir, 
> > angle_brackets,
> > _cpp_FFK_HAS_INCLUDE, 0);
> >return file->err_no != ENOENT;
> > diff --git 
> > 

Re: [PATCH] RISC-V: Update test expectancies with recent scheduler change

2024-02-27 Thread Jeff Law




On 2/27/24 15:56, 钟居哲 wrote:

 >> I don't think it's that simple.  On some uarchs vsetvls are nearly free

while on others they can be fairly expensive.  It's not clear (to me)
yet if one approach or the other is going to be the more common.


That's uarch dependent which is not the stuff I am talking about.
What's I want to say is that this patch breaks those testcases I added 
for VSETVL PASS testing.

And those testcases are uarch independent.
No, uarch impacts things like latency, which in turn impacts scheduling, 
which in turn impacts vsetvl generation/optimization.


jeff



Re: Re: [PATCH] RISC-V: Update test expectancies with recent scheduler change

2024-02-27 Thread 钟居哲
>> I don't think it's that simple.  On some uarchs vsetvls are nearly free
>>while on others they can be fairly expensive.  It's not clear (to me)
>>yet if one approach or the other is going to be the more common.

That's uarch dependent which is not the stuff I am talking about.
What I want to say is that this patch breaks those testcases I added for 
VSETVL PASS testing.
And those testcases are uarch independent.



juzhe.zh...@rivai.ai
 
From: Jeff Law
Date: 2024-02-27 23:22
To: juzhe.zh...@rivai.ai; Robin Dapp; Edwin Lu; gcc-patches
CC: gnu-toolchain; pan2.li
Subject: Re: [PATCH] RISC-V: Update test expectancies with recent scheduler 
change
 
 
On 2/26/24 18:21, juzhe.zh...@rivai.ai wrote:
> If the scheduling model increases the vsetvls, we shouldn't set it as 
> default scheduling model
I don't think it's that simple.  On some uarchs vsetvls are nearly free 
while on others they can be fairly expensive.  It's not clear (to me) 
yet if one approach or the other is going to be the more common.
 
jeff
 
 


Re: [PATCH V2] rs6000: Don't allow immediate value in the vsx_splat pattern [PR113950]

2024-02-27 Thread Peter Bergner
On 2/27/24 6:40 AM, Segher Boessenkool wrote:
> On Tue, Feb 27, 2024 at 02:02:38AM +0530, jeevitha wrote:
>> There is no immediate value splatting instruction in Power. Currently, those
>> values need to be stored in a register or memory. To address this issue, I
>> have updated the predicate for the second operand in vsx_splat to
>> splat_input_operand and corrected the assignment of op1 to operands[1].
>> These changes ensure that operand1 is stored in a register.
> 
> input_operand allows a lot of things that splat_input_operand does not,
> not just immediate operands.  NAK.
> 
> (For example, *all* memory is okay for input_operand, always).
> 
> I'm not saying we do not want to restrict these things, but a commit
> that doesn't discuss this at all is not okay.  Sorry.

So it seems you're not NAKing the use of splat_input_operand, but
just that it needs more explanation in the git log entry, correct?

Yes, input_operand accepts a lot more things than splat_input_operand
does, but the multiple define_insns this define_expand feeds, uses
gpc_reg_operand, memory_operand and splat_input_operand for their
operands[1] operand (splat_input_operand accepts reg and mem too),
so it seems to match better what the patterns will be accepting and
I always thought that using predicates that more accurately reflect
what the define_insns expect/accept lead to better code gen.

Mike, was it just an oversight to not use splat_input_operand for the
vsx_splat_ expander or was input_operand a conscious decision?

If input_operand was used purposely, then we can just fall back to
the s/op1/operands[1]/ change which we already know fixes the bug.


Peter




[pushed] analyzer: use correct format code for string literal indices [PR110483, PR111802]

2024-02-27 Thread David Malcolm
On e.g. gcc211 the use of "%li" with unsigned HOST_WIDE_INT led to this warning:
../../src/gcc/analyzer/access-diagram.cc: In member function ‘void 
ana::string_literal_spatial_item::add_column_for_byte(text_art::table&, const 
ana::bit_to_table_map&, text_art::style_manager&, ana::byte_offset_t, 
ana::byte_offset_t, int, int) const’:
../../src/gcc/analyzer/access-diagram.cc:1909:40: warning: format ‘%li’ expects 
argument of type ‘long int’, but argument 3 has type ‘long long unsigned int’ 
[-Wformat=]
  byte_idx_within_string.ulow ()));
^
and to all values being erroneously printed as "0".

Fixed thusly.

Successfully bootstrapped & regtested on x86_64-pc-linux-gnu.
Verified that this fixes the analyzer.exp=out-of-bounds-diagram*.c tests
on gcc211 (sparc-sun-solaris2.11).
Pushed to trunk as r14-9199-g939439a90f234f.

gcc/analyzer/ChangeLog:
PR analyzer/110483
PR analyzer/111802
* access-diagram.cc
(string_literal_spatial_item::add_column_for_byte): Use %wu for
printing unsigned HOST_WIDE_INT.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/access-diagram.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/analyzer/access-diagram.cc b/gcc/analyzer/access-diagram.cc
index 9555ee823931..24d203f9325a 100644
--- a/gcc/analyzer/access-diagram.cc
+++ b/gcc/analyzer/access-diagram.cc
@@ -1905,7 +1905,7 @@ private:
const table::rect_t idx_table_rect
  = btm.get_table_rect (&m_string_reg, bytes, byte_idx_table_y, 1);
t.set_cell_span (idx_table_rect,
-fmt_styled_string (sm, "[%li]",
+fmt_styled_string (sm, "[%wu]",
byte_idx_within_string.ulow ()));
   }
 
-- 
2.26.3



Re: [wwwdocs] gcc-14/changes.html + projects/gomp/: OpenMP + OpenACC update

2024-02-27 Thread Gerald Pfeifer
On Tue, 27 Feb 2024, Tobias Burnus wrote:
> Minor update for older and more recent changes.

>   supported for stack variables in C and Fortran, including the OpenMP 5.1
> -  align modifier. For Fortran, OpenMP allocators can now be
> +  align modifier. In C and C++, the map clause 
> now
> +  accepts lvalue expressions. For Fortran, OpenMP allocators can now be

I would omit the comma after "C and C++".

+  acc_memcpy_to_device_async,
+  acc_memcyp_from_device and
+  acc_memcyp_from_device_async.

Oxford comma (i.e., comma before "and")?

Looks good to me (with or without my recommendations).

Thanks,
Gerald


Re: [PATCH] RISC-V: Add initial cost handling for segment loads/stores.

2024-02-27 Thread Robin Dapp
> This patch looks odd to me.
> I don't see memrefs in the trunk code.

It's on top of the vle/vse offset handling patch from
a while back that I haven't committed yet.

> Also, I prefer list all cost in cost tune info for NF = 2 ~ 8 like ARM SVE 
> does:
I don't mind having separate costs for each but I figured they
scale anyway with the number of vectors already.  Attached v2
is more similar to aarch64.

Regards
 Robin

Subject: [PATCH v2] RISC-V: Add initial cost handling for segment
 loads/stores.

This patch makes segment loads and stores more expensive.  It adds
segment_permute_2 (as well as 4 and 8) cost fields to the common vector
costs and adds handling to adjust_stmt_cost.

gcc/ChangeLog:

* config/riscv/riscv-protos.h (struct common_vector_cost): Add
segment_permute cost.
* config/riscv/riscv-vector-costs.cc (costs::adjust_stmt_cost):
Handle segment loads/stores.
* config/riscv/riscv.cc: Initialize segment_permute_[248] to 1.
---
 gcc/config/riscv/riscv-protos.h|   5 +
 gcc/config/riscv/riscv-vector-costs.cc | 139 +
 gcc/config/riscv/riscv.cc  |   6 ++
 3 files changed, 108 insertions(+), 42 deletions(-)

diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 80efdf2b7e5..9b737aca1a3 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -218,6 +218,11 @@ struct common_vector_cost
   const int gather_load_cost;
   const int scatter_store_cost;
 
+  /* Segment load/store permute cost.  */
+  const int segment_permute_2;
+  const int segment_permute_4;
+  const int segment_permute_8;
+
   /* Cost of a vector-to-scalar operation.  */
   const int vec_to_scalar_cost;
 
diff --git a/gcc/config/riscv/riscv-vector-costs.cc 
b/gcc/config/riscv/riscv-vector-costs.cc
index adf9c197df5..c8178d71101 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1043,6 +1043,25 @@ costs::better_main_loop_than_p (const vector_costs 
*uncast_other) const
   return vector_costs::better_main_loop_than_p (other);
 }
 
+/* Returns the group size i.e. the number of vectors to be loaded by a
+   segmented load/store instruction.  Return 0 if it is no segmented
+   load/store.  */
+static int
+segment_loadstore_group_size (enum vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info)
+{
+  if (stmt_info
+  && (kind == vector_load || kind == vector_store)
+  && STMT_VINFO_DATA_REF (stmt_info))
+{
+  stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+  if (stmt_info
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
+   return DR_GROUP_SIZE (stmt_info);
+}
+  return 0;
+}
+
 /* Adjust vectorization cost after calling riscv_builtin_vectorization_cost.
For some statement, we would like to further fine-grain tweak the cost on
top of riscv_builtin_vectorization_cost handling which doesn't have any
@@ -1067,55 +1086,91 @@ costs::adjust_stmt_cost (enum vect_cost_for_stmt kind, 
loop_vec_info loop,
 case vector_load:
 case vector_store:
{
- /* Unit-stride vector loads and stores do not have offset addressing
-as opposed to scalar loads and stores.
-If the address depends on a variable we need an additional
-add/sub for each load/store in the worst case.  */
- if (stmt_info && stmt_info->stmt)
+ if (stmt_info && stmt_info->stmt && STMT_VINFO_DATA_REF (stmt_info))
{
- data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
- class loop *father = stmt_info->stmt->bb->loop_father;
- if (!loop && father && !father->inner && father->superloops)
+ /* Segment loads and stores.  When the group size is > 1
+the vectorizer will add a vector load/store statement for
+each vector in the group.  Here we additionally add permute
+costs for each.  */
+ /* TODO: Indexed and ordered/unordered cost.  */
+ int group_size = segment_loadstore_group_size (kind, stmt_info);
+ if (group_size > 1)
+   {
+ switch (group_size)
+   {
+   case 2:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+   stmt_cost += costs->vla->segment_permute_2;
+ else
+   stmt_cost += costs->vls->segment_permute_2;
+ break;
+   case 4:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+   stmt_cost += costs->vla->segment_permute_4;
+ else
+   stmt_cost += costs->vls->segment_permute_4;
+ break;
+   case 8:
+ if (riscv_v_ext_vector_mode_p (loop->vector_mode))
+   

Re: [PATCH v2 4/5] bpf: implementation of func_info in .BTF.ext.

2024-02-27 Thread David Faust
Hi Cupertino,

On 2/27/24 11:04, Cupertino Miranda wrote:
> Kernel verifier complains in some particular cases for missing func_info
> implementation in .BTF.ext. This patch implements it.
> 
> Strings are cached locally in coreout.cc to avoid adding duplicated
> strings in the string list. This string deduplication should eventually
> be moved to the CTFC functions such that this happens widely.
> 
> With this implementation, the CO-RE relocations information was also
> simplified and integrated with the FuncInfo structures.
>

I have just a couple small comments inline in the patch below, but they
are very minor and only suggestions/nits.

The ChangeLog has the same past/present tense issue as the other patches
in the series, but apart from that I see no issues. Great work! Thanks
for implementing this.

Patch is OK with the ChangeLog fixed up, and the inline nits - if 
you agree.
Thanks!
 
> gcc/Changelog:
> 
>   PR target/113453
>   * config/bpf/bpf.cc (bpf_function_prologue): Defined target
>   hook.
>   * config/bpf/coreout.cc (brf_ext_info_section)
>   (btf_ext_info): Moved from coreout.h
>   (btf_ext_funcinfo, btf_ext_lineinfo): Added struct.
>   (bpf_core_reloc): Renamed to btf_ext_core_reloc.
>   (btf_ext): Added static variable.
>   (btfext_info_sec_find_or_add, SEARCH_NODE_AND_RETURN)
>   (bpf_create_or_find_funcinfo, bpt_create_core_reloc)
>   (btf_ext_add_string, btf_funcinfo_type_callback)
>   (btf_add_func_info_for, btf_validate_funcinfo)
>   (btf_ext_info_len, output_btfext_func_info): Added function.
>   (output_btfext_header, bpf_core_reloc_add)
>   (output_btfext_core_relocs, btf_ext_init, btf_ext_output):
>   Changed to support new structs.
>   * config/bpf/coreout.h (btf_ext_funcinfo, btf_ext_lineinfo):
>   Moved and changed in coreout.cc.
>   (btf_add_func_info_for, btf_ext_add_string): Added prototypes.
> 
> gcc/testsuite/ChangeLog:
>   PR target/113453
>   * gcc.target/bpf/btfext-funcinfo-nocore.c: Added.
>   * gcc.target/bpf/btfext-funcinfo.c: Added.
>   * gcc.target/bpf/core-attr-5.c: Fixed regexp.
>   * gcc.target/bpf/core-attr-6.c: Fixed regexp.
>   * gcc.target/bpf/core-builtin-fieldinfo-offset-1.c: Fixed regexp.
>   * gcc.target/bpf/core-section-1.c: Fixed regexp
> ---
>  gcc/config/bpf/bpf.cc |  12 +
>  gcc/config/bpf/coreout.cc | 518 +-
>  gcc/config/bpf/coreout.h  |  20 +-
>  .../gcc.target/bpf/btfext-funcinfo-nocore.c   |  42 ++
>  .../gcc.target/bpf/btfext-funcinfo.c  |  46 ++
>  gcc/testsuite/gcc.target/bpf/core-attr-5.c|   9 +-
>  gcc/testsuite/gcc.target/bpf/core-attr-6.c|   6 +-
>  .../bpf/core-builtin-fieldinfo-offset-1.c |  13 +-
>  gcc/testsuite/gcc.target/bpf/core-section-1.c |   2 +-
>  9 files changed, 506 insertions(+), 162 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/bpf/btfext-funcinfo-nocore.c
>  create mode 100644 gcc/testsuite/gcc.target/bpf/btfext-funcinfo.c
> 
> diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc
> index 4318b26b9cda..ea47e3a8dbfb 100644
> --- a/gcc/config/bpf/bpf.cc
> +++ b/gcc/config/bpf/bpf.cc
> @@ -385,6 +385,18 @@ bpf_compute_frame_layout (void)
>  #undef TARGET_COMPUTE_FRAME_LAYOUT
>  #define TARGET_COMPUTE_FRAME_LAYOUT bpf_compute_frame_layout
>  
> +/* Defined to initialize data for func_info region in .BTF.ext section.  */
> +
> +static void
> +bpf_function_prologue (FILE *f ATTRIBUTE_UNUSED)
> +{
> +  if (btf_debuginfo_p ())
> +btf_add_func_info_for (cfun->decl, current_function_func_begin_label);
> +}
> +
> +#undef TARGET_ASM_FUNCTION_PROLOGUE
> +#define TARGET_ASM_FUNCTION_PROLOGUE bpf_function_prologue
> +
>  /* Expand to the instructions in a function prologue.  This function
> is called when expanding the 'prologue' pattern in bpf.md.  */
>  
> diff --git a/gcc/config/bpf/coreout.cc b/gcc/config/bpf/coreout.cc
> index 2f06ec2a0f29..31b2abc3151b 100644
> --- a/gcc/config/bpf/coreout.cc
> +++ b/gcc/config/bpf/coreout.cc
> @@ -31,6 +31,7 @@
>  #include "btf.h"
>  #include "rtl.h"
>  #include "tree-pretty-print.h"
> +#include "cgraph.h"
>  
>  #include "coreout.h"
>  
> @@ -95,64 +96,193 @@
> result, a single .BTF.ext section can contain CO-RE relocations for 
> multiple
> programs in distinct sections.  */
>  
> -/* Internal representation of a BPF CO-RE relocation record.  */
> +/* BTF.ext debug info section.  */
> +static GTY (()) section * btf_ext_info_section;
> +
> +#ifndef BTF_EXT_INFO_SECTION_NAME
> +#define BTF_EXT_INFO_SECTION_NAME ".BTF.ext"
> +#endif
> +#define BTF_EXT_INFO_SECTION_FLAGS (SECTION_DEBUG)
> +
> +#ifndef BTF_EXT_INFO_SECTION_LABEL
> +#define BTF_EXT_INFO_SECTION_LABEL "Lbtfext"
> +#endif
> +
> +#define MAX_BTF_EXT_LABEL_BYTES 40
> +static char btf_ext_info_section_label[MAX_BTF_EXT_LABEL_BYTES];
> +
> +/* A funcinfo record, in the .BTF.ext funcinfo section.  */

[PATCH] Add myself to write after approval and DCO

2024-02-27 Thread Fangrui Song
From: Fangrui Song 

ChangeLog:

* MAINTAINERS: Add myself.

Signed-off-by: Fangrui Song 
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 986e8d0a725..b01fab16061 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -669,6 +669,7 @@ Edward Smith-Rowland

 Anatoly Sokolov
 Michael Sokolov

 Jayant Sonar   
+Fangrui Song   
 Richard Stallman   
 Basile Starynkevitch   
 Jakub Staszak  
@@ -779,6 +780,7 @@ Bill Schmidt

 Nathaniel Shead

 Nathan Sidwell 
 Edward Smith-Rowland   
+Fangrui Song   
 Petter Tomner  
 Martin Uecker  
 Jonathan Wakely
-- 
2.44.0.rc1.240.g4c46232300-goog



RE: [PATCH] developer option: -fdump-generic-nodes; initial incorporation

2024-02-27 Thread Robert Dubner
Richard,

Thank you very much for your comments.

When I set out to create the capability, I had a "specification" in mind.

I didn't have a clue how to create a GENERIC tree that could be fed to the 
middle end in a way that would successfully result in an executable.  And I 
needed to be able to do that in order to proceed with the project of 
creating a COBOL front end.

So, I came up with the idea of using GCC to compile simple programs, and to 
hook into the compiler to examine the trees fed to the middle end, and to 
display those trees in the human-readable format I needed to understand 
them.  And that's what I did.

My first incarnation generated pure text files, and I used that to get 
going.

After a while I realized that when I used the output file, I was spending a 
lot of time searching through the text files.  And I had the brainstorm! 
Hyperlinks!  HTML files!  We have the technology!  So, I created the .HTML 
files as well.

I found this useful to the point of necessity in order to learn how to 
generate the GENERIC trees.  I believe it would be equally useful to the 
next developer who, for whatever reason, needs to understand, on a "You need 
to learn the alphabet before you can learn how to read" level, what the 
middle end requires from a GENERIC tree generated by a front end.

But I've never used it on a complex program. I've used it only to learn how 
to create the GENERIC nodes for very particular things, and so I would use 
the -fdump-generic-nodes feature on a very simple C program that 
demonstrated, in isolation, the feature I needed.  Once I figured it out, I 
would create front end C routines or macros that used the tree.h/tree.cc 
features to build those GENERIC trees, and then I would move on.

I decided to offer it up here, in order to learn how to create patches
and to get
to know the people and the process, as well as from the desire to share it. 
And instantly I got the "How about a machine-readable format?" comments. 
Which are reasonable.  So, because it wasn't hard, I hacked at the existing 
code to create a JSON output.  (But I remind you that up until now, nobody 
seems to have needed a JSON representation.)

And your observation that the human readable representation could be made 
from the JSON representation is totally accurate.

But that wasn't my specification.  My specification was "A tool so that a 
human being can examine a simple GENERIC tree to learn how it's done."

But it seems to me that we are now moving into the realm of a new 
specification.

Said another way:  To go from "A human readable representation of a simple 
GENERIC tree" to "A machine readable JSON representation of an arbitrarily 
complex GENERIC tree, from which a human readable representation can be 
created" means, in effect, starting over on a different project that I don't 
need.  I already *have* a project that I am working on -- the COBOL front 
end.

The complexity of GENERIC trees is, in my experienced opinion, an obstacle 
for the creation of front ends.  The GCC Internals document has a lot of 
information, but to go from it to a front end is like using the maintenance 
manual for an F16 fighter to try to learn to fly the aircraft.

The program "main(){}" generates a tree with over seventy nodes.  I see no 
way to document why that's true; it's all arbitrary in the sense that "this 
is how GCC works".  -fdump-generic-nodes made it possible for me to figure 
out how those nodes are connected and, thus, how to create a new front end. 
I figure that other developers might find it useful, as well.

I guess I am saying that I am not, at this time, able to work on a whole 
different tool.  I think what I have done so far does something useful that 
doesn't seem to otherwise exist in GCC.

I suppose the question for you is, "Is it useful enough?"

I won't be offended if the answer is "No" and I hope you won't be offended 
by my not having the bandwidth to address your very thoughtful and valid 
observations about how it could be better.

-Original Message-
From: Richard Biener 
Sent: Tuesday, February 27, 2024 04:11
To: Robert Dubner 
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [PATCH] developer option: -fdump-generic-nodes; initial 
incorporation

On Thu, Feb 22, 2024 at 5:46 PM Robert Dubner  wrote:
>
> As part of an effort to learn how create a GENERIC tree in order to
> implement a
> COBOL front end, I created the dump_generic_nodes(), which accepts a
> function_decl at the point it is provided to the middle end.  The routine
> generates three files.  One is ASCII, the second is HTML; they contain the
> tree
> in a human-readable form.  The third is JSON.
>
> This commit modifies common.opt to accept the -fdump-generic-nodes
> command-line
> option, creates the dump-generic-nodes.cc and .h files to implement it,
> and
> inserts a call to the dump_generic_nodes() function near the top of
> gimplify_function_tree() in gcc/gimplify.cc

While I think that's good and probably 

[PATCH v1 08/13] aarch64: Add Cygwin and MinGW environments for AArch64

2024-02-27 Thread Evgeny Karpov
SEH is not implemented yet and needs to be disabled in mingw/winnt.cc. 
Disabling every SEH function that uses references to these macros might trigger 
significant refactoring, and to avoid this, required macros are defined with 0. 
It is needed only for compilation. A comment with an explanation will be added.

It looks like IL32P64 works. It has been tested on OpenSSL, OpenBLAS, 
libjpeg-turbo, and FFmpeg packages. No issues have been detected with it.

Correct, stack checking cannot be explicitly disabled by the user. It will be 
interesting to know cases when it is needed. GCC uses stack probing only when 
the stack size is exceeded; size optimization is not an option then.

Regards,
Evgeny


-Original Message-
Friday, February 23, 2024 6:16 PM 
Richard Sandiford wrote:
 
> +
> +#undef TARGET_SEH
> +#define TARGET_SEH 0
> +
> +#define SSE_REGNO_P(N) 0
> +#define GENERAL_REGNO_P(N) 0

Could you add a comment to explain how these two macros are consumed?
What is the effect of saying that everything is neither a general register nor 
an SSE register?

> +#define SEH_MAX_FRAME_SIZE 0


> +/* Windows64 continues to use a 32-bit long type.  */
> +#undef LONG_TYPE_SIZE
> +#define LONG_TYPE_SIZE 32

Just curious: this is AFAIK the first IL32P64 ABI for AArch64.
Do things Just Work, including for things like arm_neon.h and other ACLE header 
files?  I'm pleasantly surprised if so :)

I suppose this is more of a generic mingw/cygwin question, but does this mean 
that stack checking cannot be explicitly disabled by a user who "knows what 
they are doing"?

Thanks,
Richard



[PATCH] Fortran testsuite: fix invalid Fortran in testcase

2024-02-27 Thread Harald Anlauf
Dear all,

the attached patch fixes invalid Fortran in testcase
gfortran.dg/pr101026.f, which might prohibit progress
in fixing pr111781.  (Note that the testcase was for a
tree-optimizer issue, not the Fortran frontend.)

OK for mainline?

Will commit within 24h unless there are comments.

Thanks,
Harald

From 75724b6b42a1c46383d8e6deedbfb8d2ebd0fa12 Mon Sep 17 00:00:00 2001
From: Harald Anlauf 
Date: Tue, 27 Feb 2024 21:51:53 +0100
Subject: [PATCH] Fortran testsuite: fix invalid Fortran in testcase

gcc/testsuite/ChangeLog:

	* gfortran.dg/pr101026.f: Let variables used in specification
	expression be passed as dummy arguments
---
 gcc/testsuite/gfortran.dg/pr101026.f | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gfortran.dg/pr101026.f b/gcc/testsuite/gfortran.dg/pr101026.f
index 9576d8802ca..e05e21c898a 100644
--- a/gcc/testsuite/gfortran.dg/pr101026.f
+++ b/gcc/testsuite/gfortran.dg/pr101026.f
@@ -1,6 +1,6 @@
 ! { dg-do compile }
 ! { dg-options "-Ofast -frounding-math" }
-  SUBROUTINE PASSB4 (CC,CH)
+  SUBROUTINE PASSB4 (CC,CH,IDO,L1)
   DIMENSION CC(IDO,4,L1), CH(IDO,L1,*)
  DO 103 I=2,IDO,2
 TI4 = CC0-CC(I,4,K)
--
2.35.3



[PATCH] c++: auto(x) partial substitution [PR110025, PR114138]

2024-02-27 Thread Patrick Palka
Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look
OK for trunk and perhaps 13?

-- >8 --

In r12-6773-g09845ad7569bac we gave CTAD placeholders a level of 0 and
ensured we never replaced them via tsubst.  It turns out that autos
representing an explicit cast need the same treatment and for the same
reason: such autos appear in an expression context and so their level
gets easily messed up after partial substitution, leading to premature
replacement via an incidental tsubst instead of via do_auto_deduction.

This patch fixes this by extending the r12-6773 approach to auto(x) and
auto{x}.

PR c++/110025
PR c++/114138

gcc/cp/ChangeLog:

* cp-tree.h (make_cast_auto): Declare.
* parser.cc (cp_parser_functional_cast): Replace a parsed auto
with a level-less one via make_cast_auto.
* pt.cc (find_parameter_packs_r): Don't treat level-less auto
as a type parameter pack.
(tsubst) <case TEMPLATE_TYPE_PARM>: Generalized CTAD placeholder
handling to all level-less autos.
(make_cast_auto): Define.
(do_auto_deduction): Handle deduction of a level-less non-CTAD
auto.

gcc/testsuite/ChangeLog:

* g++.dg/cpp23/auto-fncast16.C: New test.
* g++.dg/cpp23/auto-fncast17.C: New test.
* g++.dg/cpp23/auto-fncast18.C: New test.
---
 gcc/cp/cp-tree.h   |  1 +
 gcc/cp/parser.cc   | 11 
 gcc/cp/pt.cc   | 31 +-
 gcc/testsuite/g++.dg/cpp23/auto-fncast16.C | 12 
 gcc/testsuite/g++.dg/cpp23/auto-fncast17.C | 15 +
 gcc/testsuite/g++.dg/cpp23/auto-fncast18.C | 71 ++
 6 files changed, 138 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp23/auto-fncast16.C
 create mode 100644 gcc/testsuite/g++.dg/cpp23/auto-fncast17.C
 create mode 100644 gcc/testsuite/g++.dg/cpp23/auto-fncast18.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 04c3aa6cd91..6f1da1c7bad 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -7476,6 +7476,7 @@ extern tree make_decltype_auto(void);
 extern tree make_constrained_auto  (tree, tree);
 extern tree make_constrained_decltype_auto (tree, tree);
 extern tree make_template_placeholder  (tree);
+extern tree make_cast_auto (void);
 extern bool template_placeholder_p (tree);
 extern bool ctad_template_p(tree);
 extern bool unparenthesized_id_or_class_member_access_p (tree);
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index 3ee9d49fb8e..1e518e6ef51 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -33314,6 +33314,17 @@ cp_parser_functional_cast (cp_parser* parser, tree 
type)
   if (!type)
 type = error_mark_node;
 
+  if (TREE_CODE (type) == TYPE_DECL
+  && is_auto (TREE_TYPE (type)))
+type = TREE_TYPE (type);
+
+  if (is_auto (type)
+  && !AUTO_IS_DECLTYPE (type)
+  && !PLACEHOLDER_TYPE_CONSTRAINTS (type)
+  && !CLASS_PLACEHOLDER_TEMPLATE (type))
+/* auto(x) and auto{x} are represented using a level-less auto.  */
+type = make_cast_auto ();
+
   if (cp_lexer_next_token_is (parser->lexer, CPP_OPEN_BRACE))
 {
   cp_lexer_set_source_position (parser->lexer);
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 2803824d11e..620fe5cdbfa 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -3921,7 +3921,8 @@ find_parameter_packs_r (tree *tp, int *walk_subtrees, 
void* data)
 parameter pack (14.6.3), or the type-specifier-seq of a type-id that
 is a pack expansion, the invented template parameter is a template
 parameter pack.  */
-  if (ppd->type_pack_expansion_p && is_auto (t))
+  if (ppd->type_pack_expansion_p && is_auto (t)
+ && TEMPLATE_TYPE_LEVEL (t) != 0)
TEMPLATE_TYPE_PARAMETER_PACK (t) = true;
   if (TEMPLATE_TYPE_PARAMETER_PACK (t))
 parameter_pack_p = true;
@@ -16297,9 +16298,14 @@ tsubst (tree t, tree args, tsubst_flags_t complain, 
tree in_decl)
   }
 
 case TEMPLATE_TYPE_PARM:
-  if (template_placeholder_p (t))
+  if (TEMPLATE_TYPE_LEVEL (t) == 0)
{
+ /* Level-less auto must be replaced via do_auto_deduction.  */
+ gcc_checking_assert (is_auto (t));
  tree tmpl = CLASS_PLACEHOLDER_TEMPLATE (t);
+ if (!tmpl)
+   return t;
+
  tmpl = tsubst_expr (tmpl, args, complain, in_decl);
  if (TREE_CODE (tmpl) == TEMPLATE_TEMPLATE_PARM)
tmpl = TEMPLATE_TEMPLATE_PARM_TEMPLATE_DECL (tmpl);
@@ -29311,6 +29317,17 @@ template_placeholder_p (tree t)
   return is_auto (t) && CLASS_PLACEHOLDER_TEMPLATE (t);
 }
 
+/* Return an auto for an explicit cast, e.g. auto(x) or auto{x}.
+   Like CTAD placeholders, these have level 0 so that they're not
+   accidentally replaced via tsubst, and are always directly resolved
+   via do_auto_deduction.  */
+
+tree
+make_cast_auto ()
+{
+  return make_auto_1 

[PATCH v1 00/13] Add aarch64-w64-mingw32 target

2024-02-27 Thread Evgeny Karpov
Richard, thank you for the initial review! Hopefully, the required actions have 
been addressed, which should improve the patch quality in v2.

Regards,
Evgeny

-Original Message-
Thursday, February 22, 2024 2:40 PM 
Richard Earnshaw (lists) wrote:

Thanks for posting this.

I've only read quickly through this patch series and responded where I think 
some action is obviously required.  That doesn't necessarily mean the other 
patches are perfect, though, just that nothing immediately caught my attention.

R.



Re: [PATCH] RISC-V: add option -m(no-)autovec-segment

2024-02-27 Thread Greg McGary

On 2/27/24 8:25 AM, Jeff Law wrote:




On 2/25/24 21:53, Greg McGary wrote:

Add option -m(no-)autovec-segment to enable/disable autovectorizer
from emitting vector segment load/store instructions. This is useful for
performance experiments.

gcc/ChangeLog:
* config/riscv/autovec.md (vec_mask_len_load_lanes, 
vec_mask_len_store_lanes):

  Predicate with TARGET_VECTOR_AUTOVEC_SEGMENT
* gcc/config/riscv/riscv-opts.h (TARGET_VECTOR_AUTOVEC_SEGMENT): 
New macro.

* gcc/config/riscv/riscv.opt (-m(no-)autovec-segment): New option.
* gcc/tree-vect-stmts.cc (gcc/tree-vect-stmts.cc): Prevent 
divide-by-zero.

* testsuite/gcc.target/riscv/rvv/autovec/struct/*_noseg*.c,
testsuite/gcc.target/riscv/rvv/autovec/no-segment.c: New tests.
I don't mind having options to do this kind of selection (we've done 
similar things internally for other RVV features).  But I don't think 
now is the time to be introducing this stuff.  We're in stage4 of the 
development cycle after all.



No problemo. Will you take the simple bugfix?

  gcc/tree-vect-stmts.cc (gcc/tree-vect-stmts.cc): Prevent divide-by-zero.
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc

index 1dbe1115da4..6303d82d959 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11521,7 +11521,8 @@ vectorizable_load (vec_info *vinfo,
 - (vec_num * j + i) * nunits);
/* remain should now be > 0 and < nunits.  */
unsigned num;
-   if (constant_multiple_p (nunits, remain, &num))
+   if (known_gt (remain, 0)
+   && constant_multiple_p (nunits, remain, &num))
  {
tree ptype;
new_vtype


I am unaware of a testcase that triggers it without disabling segmented 
load,

so LMK if you are cool with the fix without a test case.

G



Re: [PATCH v2 5/5] bpf: renamed coreout.* files to btfext-out.*.

2024-02-27 Thread David Faust
On 2/27/24 11:04, Cupertino Miranda wrote:
> gcc/ChangeLog:
> 
>   * config.gcc (target_gtfiles): Changes coreout to btfext-out.
>   (extra_objs): Changes coreout to btfext-out.
>   * config/bpf/coreout.cc: Renamed to btfext-out.cc.
>   * config/bpf/btfext-out.cc: Added.
>   * config/bpf/coreout.h: Renamed to btfext-out.h.
>   * config/bpf/btfext-out.h: Added.
>   * config/bpf/core-builtins.cc: Changes include.
>   * config/bpf/core-builtins.h: Changes include.
>   * config/bpf/t-bpf: Renamed file.
This last entry is confusing, sounds like t-bpf is renamed, which it
isn't. I'd suggest to just say "accommodate renamed files" or so.

Similar to prior patches, there is a mix of present and past tenses here.
Please stick with the present.

Changes -> Change
Added -> Add  (or just "New.")
Renamed -> Rename.

OK with those changes.
Thanks.

> ---
>  gcc/config.gcc   | 4 ++--
>  gcc/config/bpf/{coreout.cc => btfext-out.cc} | 4 ++--
>  gcc/config/bpf/{coreout.h => btfext-out.h}   | 2 +-
>  gcc/config/bpf/core-builtins.cc  | 2 +-
>  gcc/config/bpf/core-builtins.h   | 2 +-
>  gcc/config/bpf/t-bpf | 4 ++--
>  6 files changed, 9 insertions(+), 9 deletions(-)
>  rename gcc/config/bpf/{coreout.cc => btfext-out.cc} (99%)
>  rename gcc/config/bpf/{coreout.h => btfext-out.h} (98%)
> 
> diff --git a/gcc/config.gcc b/gcc/config.gcc
> index a0f9c6723083..1ca033d75b66 100644
> --- a/gcc/config.gcc
> +++ b/gcc/config.gcc
> @@ -1653,8 +1653,8 @@ bpf-*-*)
>  tmake_file="${tmake_file} bpf/t-bpf"
>  use_collect2=no
>  use_gcc_stdint=provide
> -extra_objs="coreout.o core-builtins.o"
> -target_gtfiles="$target_gtfiles \$(srcdir)/config/bpf/coreout.cc 
> \$(srcdir)/config/bpf/core-builtins.cc"
> +extra_objs="btfext-out.o core-builtins.o"
> +target_gtfiles="$target_gtfiles \$(srcdir)/config/bpf/btfext-out.cc 
> \$(srcdir)/config/bpf/core-builtins.cc"
>  ;;
>  cris-*-elf | cris-*-none)
>   tm_file="elfos.h newlib-stdint.h ${tm_file}"
> diff --git a/gcc/config/bpf/coreout.cc b/gcc/config/bpf/btfext-out.cc
> similarity index 99%
> rename from gcc/config/bpf/coreout.cc
> rename to gcc/config/bpf/btfext-out.cc
> index 31b2abc3151b..4281cca83e13 100644
> --- a/gcc/config/bpf/coreout.cc
> +++ b/gcc/config/bpf/btfext-out.cc
> @@ -33,7 +33,7 @@
>  #include "tree-pretty-print.h"
>  #include "cgraph.h"
>  
> -#include "coreout.h"
> +#include "btfext-out.h"
>  
>  /* This file contains data structures and routines for construction and 
> output
> of BPF Compile Once - Run Everywhere (BPF CO-RE) information.
> @@ -618,4 +618,4 @@ btf_ext_output (void)
>dw2_asm_output_data (4, 0, "Required padding by libbpf structs");
>  }
>  
> -#include "gt-coreout.h"
> +#include "gt-btfext-out.h"
> diff --git a/gcc/config/bpf/coreout.h b/gcc/config/bpf/btfext-out.h
> similarity index 98%
> rename from gcc/config/bpf/coreout.h
> rename to gcc/config/bpf/btfext-out.h
> index 1c26b9274739..b36309475c97 100644
> --- a/gcc/config/bpf/coreout.h
> +++ b/gcc/config/bpf/btfext-out.h
> @@ -1,4 +1,4 @@
> -/* coreout.h - Declarations and definitions related to
> +/* btfext-out.h - Declarations and definitions related to
> BPF Compile Once - Run Everywhere (CO-RE) support.
> Copyright (C) 2021-2024 Free Software Foundation, Inc.
>  
> diff --git a/gcc/config/bpf/core-builtins.cc b/gcc/config/bpf/core-builtins.cc
> index aa75fd68cae6..8d8c54c1fb3d 100644
> --- a/gcc/config/bpf/core-builtins.cc
> +++ b/gcc/config/bpf/core-builtins.cc
> @@ -45,7 +45,7 @@ along with GCC; see the file COPYING3.  If not see
>  
>  #include "ctfc.h"
>  #include "btf.h"
> -#include "coreout.h"
> +#include "btfext-out.h"
>  #include "core-builtins.h"
>  
>  /* BPF CO-RE builtins definition.
> diff --git a/gcc/config/bpf/core-builtins.h b/gcc/config/bpf/core-builtins.h
> index c54f6ddac812..e56b55b94e0c 100644
> --- a/gcc/config/bpf/core-builtins.h
> +++ b/gcc/config/bpf/core-builtins.h
> @@ -1,7 +1,7 @@
>  #ifndef BPF_CORE_BUILTINS_H
>  #define BPF_CORE_BUILTINS_H
>  
> -#include "coreout.h"
> +#include "btfext-out.h"
>  
>  enum bpf_builtins
>  {
> diff --git a/gcc/config/bpf/t-bpf b/gcc/config/bpf/t-bpf
> index 18f1fa67794d..dc50332350c4 100644
> --- a/gcc/config/bpf/t-bpf
> +++ b/gcc/config/bpf/t-bpf
> @@ -1,7 +1,7 @@
>  
> -TM_H += $(srcdir)/config/bpf/coreout.h $(srcdir)/config/bpf/core-builtins.h
> +TM_H += $(srcdir)/config/bpf/btfext-out.h 
> $(srcdir)/config/bpf/core-builtins.h
>  
> -coreout.o: $(srcdir)/config/bpf/coreout.cc
> +btfext-out.o: $(srcdir)/config/bpf/btfext-out.cc
>   $(COMPILE) $<
>   $(POSTCOMPILE)
>  


Re: [PATCH v2 3/5] bpf: Always emit .BTF.ext section if generating BTF

2024-02-27 Thread David Faust



On 2/27/24 11:04, Cupertino Miranda wrote:
> BPF applications, when generating BTF information should always create a
> .BTF.ext section.
> Current implementation was only creating it when -mco-re option was used.
> This patch makes .BTF.ext always be generated for BPF target objects.
> The patch also adds conditions around btf_finalize function call
> such that BTF deallocation happens later for BPF target.
> For BPF, btf_finalize is only called after .BTF.ext is generated.

Thank you, this version makes it much more clear what the patch does.

> 
> gcc/ChangeLog:
> 
>   * config/bpf/bpf.cc (bpf_option_override): Make .BTF.ext
>   enabled by default for BPF.
>   (bpf_file_end): Call BTF deallocation.
>   * dwarf2ctf.cc (ctf_debug_finalize): Conditionally execute BTF
>   deallocation.

You are missing ChangeLog entries for bpf_asm_init_sections and
ctf_debug_finish.

The script contrib/gcc-changelog/git_check_commit.py may help
to catch those.

The code changes LGTM, so OK with the ChangeLog fixed.
Thanks.

> ---
>  gcc/config/bpf/bpf.cc | 20 +---
>  gcc/dwarf2ctf.cc  | 12 ++--
>  2 files changed, 15 insertions(+), 17 deletions(-)
> 
> diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc
> index d6ca47eeecbe..4318b26b9cda 100644
> --- a/gcc/config/bpf/bpf.cc
> +++ b/gcc/config/bpf/bpf.cc
> @@ -195,10 +195,8 @@ bpf_option_override (void)
>if (TARGET_BPF_CORE && !btf_debuginfo_p ())
>  error ("BPF CO-RE requires BTF debugging information, use %<-gbtf%>");
>  
> -  /* To support the portability needs of BPF CO-RE approach, BTF debug
> - information includes the BPF CO-RE relocations.  */
> -  if (TARGET_BPF_CORE)
> -write_symbols |= BTF_WITH_CORE_DEBUG;
> +  /* BPF applications always generate .BTF.ext.  */
> +  write_symbols |= BTF_WITH_CORE_DEBUG;
>  
>/* Unlike much of the other BTF debug information, the information 
> necessary
>   for CO-RE relocations is added to the CTF container by the BPF backend.
> @@ -218,10 +216,7 @@ bpf_option_override (void)
>/* -gbtf implies -mcore when using the BPF backend, unless -mno-co-re
>   is specified.  */
>if (btf_debuginfo_p () && !(target_flags_explicit & MASK_BPF_CORE))
> -{
> -  target_flags |= MASK_BPF_CORE;
> -  write_symbols |= BTF_WITH_CORE_DEBUG;
> -}
> +target_flags |= MASK_BPF_CORE;
>  
>/* Determine available features from ISA setting (-mcpu=).  */
>if (bpf_has_jmpext == -1)
> @@ -267,7 +262,7 @@ bpf_option_override (void)
>  static void
>  bpf_asm_init_sections (void)
>  {
> -  if (TARGET_BPF_CORE)
> +  if (btf_debuginfo_p () && btf_with_core_debuginfo_p ())
>  btf_ext_init ();
>  }
>  
> @@ -279,8 +274,11 @@ bpf_asm_init_sections (void)
>  static void
>  bpf_file_end (void)
>  {
> -  if (TARGET_BPF_CORE)
> -btf_ext_output ();
> +  if (btf_debuginfo_p () && btf_with_core_debuginfo_p ())
> +{
> +  btf_ext_output ();
> +  btf_finalize ();
> +}
>  }
>  
>  #undef TARGET_ASM_FILE_END
> diff --git a/gcc/dwarf2ctf.cc b/gcc/dwarf2ctf.cc
> index 93e5619933fa..dca86edfffa9 100644
> --- a/gcc/dwarf2ctf.cc
> +++ b/gcc/dwarf2ctf.cc
> @@ -944,7 +944,10 @@ ctf_debug_finalize (const char *filename, bool btf)
>if (btf)
>  {
>btf_output (filename);
> -  btf_finalize ();
> +  /* btf_finalize when compiling BPF applications gets deallocated by the
> +  BPF target in bpf_file_end.  */
> +  if (btf_debuginfo_p () && !btf_with_core_debuginfo_p ())
> + btf_finalize ();
>  }
>  
>else
> @@ -1027,11 +1030,8 @@ ctf_debug_finish (const char * filename)
>/* Emit BTF debug info here when CO-RE relocations need to be generated.
>   BTF with CO-RE relocations needs to be generated when CO-RE is in effect
>   for the BPF target.  */
> -  if (btf_with_core_debuginfo_p ())
> -{
> -  gcc_assert (btf_debuginfo_p ());
> -  ctf_debug_finalize (filename, btf_debuginfo_p ());
> -}
> +  if (btf_debuginfo_p () && btf_with_core_debuginfo_p ())
> +ctf_debug_finalize (filename, btf_debuginfo_p ());
>  }
>  
>  #include "gt-dwarf2ctf.h"


Re: [PATCH v2 2/5] btf: added KIND_FUNC traversal function.

2024-02-27 Thread David Faust
Hi Cupertino,

Similar to patch 1, please use present tense to match the style of
existing commits, in commit message and in ChangeLog.

On 2/27/24 11:04, Cupertino Miranda wrote:
> Added a traversal function to traverse all BTF_KIND_FUNC nodes with a
> callback function. Used for .BTF.ext section content creation.

Added -> Add

> 
> gcc/ChangeLog:
> 
>   * btfout.cc (output_btf_func_types): Use FOR_EACH_VEC_ELT.
>   (traverse_btf_func_types): Defined function.
>   * ctfc.h (funcs_traverse_callback): Typedef for function
>   prototype.
>   (traverse_btf_func_types): Added prototype.

Mix of present and past tenses here, please stick to the present:
Defined -> Define
Added -> Add

The code changes LGTM, so OK with those nits fixed.
Thanks.

> ---
>  gcc/btfout.cc | 22 --
>  gcc/ctfc.h|  3 +++
>  2 files changed, 23 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/btfout.cc b/gcc/btfout.cc
> index 7e114e224449..7aabd99f3e7c 100644
> --- a/gcc/btfout.cc
> +++ b/gcc/btfout.cc
> @@ -1276,8 +1276,10 @@ output_btf_types (ctf_container_ref ctfc)
>  static void
>  output_btf_func_types (ctf_container_ref ctfc)
>  {
> -  for (size_t i = 0; i < vec_safe_length (funcs); i++)
> -btf_asm_func_type (ctfc, (*funcs)[i], i);
> +  ctf_dtdef_ref ref;
> +  unsigned i;
> +  FOR_EACH_VEC_ELT (*funcs, i, ref)
> +btf_asm_func_type (ctfc, ref, i);
>  }
>  
>  /* Output all BTF_KIND_DATASEC records.  */
> @@ -1452,4 +1454,20 @@ btf_finalize (void)
>tu_ctfc = NULL;
>  }
>  
> +/* Traversal function for all BTF_KIND_FUNC type records.  */
> +
> +bool
> +traverse_btf_func_types (funcs_traverse_callback callback, void *data)
> +{
> +  ctf_dtdef_ref ref;
> +  unsigned i;
> +  FOR_EACH_VEC_ELT (*funcs, i, ref)
> +{
> +  bool stop = callback (ref, data);
> +  if (stop == true)
> + return true;
> +}
> +  return false;
> +}
> +
>  #include "gt-btfout.h"
> diff --git a/gcc/ctfc.h b/gcc/ctfc.h
> index 7aac57edac55..fa188bf2f5a4 100644
> --- a/gcc/ctfc.h
> +++ b/gcc/ctfc.h
> @@ -441,6 +441,9 @@ extern int ctf_add_variable (ctf_container_ref, const 
> char *, ctf_id_t,
>  extern ctf_id_t ctf_lookup_tree_type (ctf_container_ref, const tree);
>  extern ctf_id_t get_btf_id (ctf_id_t);
>  
> +typedef bool (*funcs_traverse_callback) (ctf_dtdef_ref, void *);
> +bool traverse_btf_func_types (funcs_traverse_callback, void *);
> +
>  /* CTF section does not emit location information; at this time, location
> information is needed for BTF CO-RE use-cases.  */
>  


[PATCH v1 10/13] Rename "x86 Windows Options" to "Cygwin and MinGW Options"

2024-02-27 Thread Evgeny Karpov
@xref{Cygwin and MinGW Options} re-direct from "x86 Windows Options" will be 
added in v2. Thanks!

Regards,
Evgeny


-Original Message-
Thursday, February 22, 2024 2:32 PM 
Richard Earnshaw (lists) wrote:

For this change you might want to put some form of re-direct in the manual 
under the old name so that anybody used to looking for the old entry will know 
where things have been moved to.  Something like

x86 Windows Options
  See xref(Cygwin and MinGW Options).

R.


Re: [PATCH v2 1/5] btf: fixed type id in BTF_KIND_FUNC struct data.

2024-02-27 Thread David Faust
Hi Cupertino,

Just some nits below. Apologies for incoming pedantry.

On 2/27/24 11:04, Cupertino Miranda wrote:
> This patch correct the aditition of +1 on the type id, which originally
> was done in the wrong location and leaded to func_sts->dtd_type for
> BTF_KIND_FUNCS struct data to contain the type id of the previous entry.

Multiple typos here:
  correct -> corrects
  aditition -> addition
  ...leaded to.. -> ..led to..
  func_sts -> func_dtd
  BTF_KIND_FUNCS -> BTF_KIND_FUNC

> 
> gcc/ChangeLog:
> 
>   * btfout.cc (btf_collect_datasec): Corrected BTF type id.

Please use present tense in the ChangeLog entries, to match GNU style
guidelines and existing entries,
i.e. "Correct..." instead of "Corrected..."

The same goes for the commit header, please use present tense to match
the style of existing commits,
i.e. "btf: fix type id..." instead of "fixed".

The patch itself LGTM, so OK with above changes.
Thanks!

> ---
>  gcc/btfout.cc | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/gcc/btfout.cc b/gcc/btfout.cc
> index dcf751f8fe0d..7e114e224449 100644
> --- a/gcc/btfout.cc
> +++ b/gcc/btfout.cc
> @@ -457,7 +457,8 @@ btf_collect_datasec (ctf_container_ref ctfc)
>func_dtd->dtd_data.ctti_type = dtd->dtd_type;
>func_dtd->linkage = dtd->linkage;
>func_dtd->dtd_name = dtd->dtd_name;
> -  func_dtd->dtd_type = num_types_added + num_types_created;
> +  /* +1 for the sentinel type not in the types map.  */
> +  func_dtd->dtd_type = num_types_added + num_types_created + 1;
>  
>/* Only the BTF_KIND_FUNC type actually references the name. The
>BTF_KIND_FUNC_PROTO is always anonymous.  */
> @@ -480,8 +481,7 @@ btf_collect_datasec (ctf_container_ref ctfc)
>  
> struct btf_var_secinfo info;
>  
> -   /* +1 for the sentinel type not in the types map.  */
> -   info.type = func_dtd->dtd_type + 1;
> +   info.type = func_dtd->dtd_type;
>  
> /* Both zero at compile time.  */
> info.size = 0;


[wwwdocs] gcc-14/changes.html + projects/gomp/: OpenMP + OpenACC update

2024-02-27 Thread Tobias Burnus

Minor update for older and more recent changes.

Comments?

Tobias
gcc-14/changes.html + projects/gomp/: OpenMP + OpenACC update

Update OpenMP for two meanwhile implemented features (lvalue-expr in map,
indirect now also in Fortran).
Update OpenACC for one new feature (Fortran interface to existing
C/C++ routines).

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 85ccc54d..1c2059b6 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -79,7 +79,8 @@ a work-in-progress.
 
 OpenMP 5.0: The allocate directive is now
   supported for stack variables in C and Fortran, including the OpenMP 5.1
-  align modifier. For Fortran, OpenMP allocators can now be
+  align modifier. In C and C++, the map clause now
+  accepts lvalue expressions. For Fortran, OpenMP allocators can now be
   used for allocatables and pointers using the allocate
   directive and its OpenMP 5.2 replacement, the allocators
   directive; files using this allocator and all files that might directly
@@ -91,8 +92,8 @@ a work-in-progress.
 
   OpenMP 5.1: Support was added for collapsing imperfectly nested loops and
   using present as map-type modifier and in
-  defaultmap. The indirect clause is now supported
-  for C and C++.  The performance of copying strided data from or to nvptx
+  defaultmap. The indirect clause is now
+  supported. The performance of copying strided data from or to nvptx
   and AMD GPU devices using the OpenMP 5.1 routine
   omp_target_memcpy_rect has been improved.
 
@@ -117,6 +118,14 @@ a work-in-progress.
 OpenACC 2.7: The self clause was added to be used on
   compute constructs and the default clause for data
   constructs.
+OpenACC 3.2: The following API routines are now available in
+  Fortran using the openacc module or the
+  openacc_lib.h header file: acc_malloc,
+  acc_free, acc_hostptr,
+  acc_deviceptr, acc_memcpy_to_device,
+  acc_memcpy_to_device_async,
+  acc_memcpy_from_device and
+  acc_memcpy_from_device_async.
   
   
   For offload-device code generated via OpenMP and OpenACC, the math
diff --git a/htdocs/projects/gomp/index.html b/htdocs/projects/gomp/index.html
index bf20bb88..8fdfb95a 100644
--- a/htdocs/projects/gomp/index.html
+++ b/htdocs/projects/gomp/index.html
@@ -489,7 +489,7 @@ than listed, depending on resolved corner cases and optimizations.
   
   
 C/C++'s lvalue expressions in to, from and map clauses
-No
+GCC14
 
   
   
@@ -714,8 +714,8 @@ than listed, depending on resolved corner cases and optimizations.
   
   
 Indirect calls to the device version of a procedure or function in target regions
-GCC14
-Only C and C++
+GCC14
+
   
   
 interop directive
@@ -756,8 +756,8 @@ than listed, depending on resolved corner cases and optimizations.
   
   
 indirect clause in declare target
-GCC14
-Only C and C++
+GCC14
+
   
   
 device_type(nohost)/device_type(host) for variables


[PATCH v2 5/5] bpf: renamed coreout.* files to btfext-out.*.

2024-02-27 Thread Cupertino Miranda
gcc/ChangeLog:

* config.gcc (target_gtfiles): Changes coreout to btfext-out.
(extra_objs): Changes coreout to btfext-out.
* config/bpf/coreout.cc: Renamed to btfext-out.cc.
* config/bpf/btfext-out.cc: Added.
* config/bpf/coreout.h: Renamed to btfext-out.h.
* config/bpf/btfext-out.h: Added.
* config/bpf/core-builtins.cc: Changes include.
* config/bpf/core-builtins.h: Changes include.
* config/bpf/t-bpf: Renamed file.
---
 gcc/config.gcc   | 4 ++--
 gcc/config/bpf/{coreout.cc => btfext-out.cc} | 4 ++--
 gcc/config/bpf/{coreout.h => btfext-out.h}   | 2 +-
 gcc/config/bpf/core-builtins.cc  | 2 +-
 gcc/config/bpf/core-builtins.h   | 2 +-
 gcc/config/bpf/t-bpf | 4 ++--
 6 files changed, 9 insertions(+), 9 deletions(-)
 rename gcc/config/bpf/{coreout.cc => btfext-out.cc} (99%)
 rename gcc/config/bpf/{coreout.h => btfext-out.h} (98%)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index a0f9c6723083..1ca033d75b66 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -1653,8 +1653,8 @@ bpf-*-*)
 tmake_file="${tmake_file} bpf/t-bpf"
 use_collect2=no
 use_gcc_stdint=provide
-extra_objs="coreout.o core-builtins.o"
-target_gtfiles="$target_gtfiles \$(srcdir)/config/bpf/coreout.cc 
\$(srcdir)/config/bpf/core-builtins.cc"
+extra_objs="btfext-out.o core-builtins.o"
+target_gtfiles="$target_gtfiles \$(srcdir)/config/bpf/btfext-out.cc 
\$(srcdir)/config/bpf/core-builtins.cc"
 ;;
 cris-*-elf | cris-*-none)
tm_file="elfos.h newlib-stdint.h ${tm_file}"
diff --git a/gcc/config/bpf/coreout.cc b/gcc/config/bpf/btfext-out.cc
similarity index 99%
rename from gcc/config/bpf/coreout.cc
rename to gcc/config/bpf/btfext-out.cc
index 31b2abc3151b..4281cca83e13 100644
--- a/gcc/config/bpf/coreout.cc
+++ b/gcc/config/bpf/btfext-out.cc
@@ -33,7 +33,7 @@
 #include "tree-pretty-print.h"
 #include "cgraph.h"
 
-#include "coreout.h"
+#include "btfext-out.h"
 
 /* This file contains data structures and routines for construction and output
of BPF Compile Once - Run Everywhere (BPF CO-RE) information.
@@ -618,4 +618,4 @@ btf_ext_output (void)
   dw2_asm_output_data (4, 0, "Required padding by libbpf structs");
 }
 
-#include "gt-coreout.h"
+#include "gt-btfext-out.h"
diff --git a/gcc/config/bpf/coreout.h b/gcc/config/bpf/btfext-out.h
similarity index 98%
rename from gcc/config/bpf/coreout.h
rename to gcc/config/bpf/btfext-out.h
index 1c26b9274739..b36309475c97 100644
--- a/gcc/config/bpf/coreout.h
+++ b/gcc/config/bpf/btfext-out.h
@@ -1,4 +1,4 @@
-/* coreout.h - Declarations and definitions related to
+/* btfext-out.h - Declarations and definitions related to
BPF Compile Once - Run Everywhere (CO-RE) support.
Copyright (C) 2021-2024 Free Software Foundation, Inc.
 
diff --git a/gcc/config/bpf/core-builtins.cc b/gcc/config/bpf/core-builtins.cc
index aa75fd68cae6..8d8c54c1fb3d 100644
--- a/gcc/config/bpf/core-builtins.cc
+++ b/gcc/config/bpf/core-builtins.cc
@@ -45,7 +45,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #include "ctfc.h"
 #include "btf.h"
-#include "coreout.h"
+#include "btfext-out.h"
 #include "core-builtins.h"
 
 /* BPF CO-RE builtins definition.
diff --git a/gcc/config/bpf/core-builtins.h b/gcc/config/bpf/core-builtins.h
index c54f6ddac812..e56b55b94e0c 100644
--- a/gcc/config/bpf/core-builtins.h
+++ b/gcc/config/bpf/core-builtins.h
@@ -1,7 +1,7 @@
 #ifndef BPF_CORE_BUILTINS_H
 #define BPF_CORE_BUILTINS_H
 
-#include "coreout.h"
+#include "btfext-out.h"
 
 enum bpf_builtins
 {
diff --git a/gcc/config/bpf/t-bpf b/gcc/config/bpf/t-bpf
index 18f1fa67794d..dc50332350c4 100644
--- a/gcc/config/bpf/t-bpf
+++ b/gcc/config/bpf/t-bpf
@@ -1,7 +1,7 @@
 
-TM_H += $(srcdir)/config/bpf/coreout.h $(srcdir)/config/bpf/core-builtins.h
+TM_H += $(srcdir)/config/bpf/btfext-out.h $(srcdir)/config/bpf/core-builtins.h
 
-coreout.o: $(srcdir)/config/bpf/coreout.cc
+btfext-out.o: $(srcdir)/config/bpf/btfext-out.cc
$(COMPILE) $<
$(POSTCOMPILE)
 
-- 
2.39.2



[PATCH v2 4/5] bpf: implementation of func_info in .BTF.ext.

2024-02-27 Thread Cupertino Miranda
Kernel verifier complains in some particular cases for missing func_info
implementation in .BTF.ext. This patch implements it.

Strings are cached locally in coreout.cc to avoid adding duplicated
strings in the string list. This string deduplication should eventually
be moved to the CTFC functions such that this happens widely.

With this implementation, the CO-RE relocations information was also
simplified and integrated with the FuncInfo structures.

gcc/ChangeLog:

PR target/113453
* config/bpf/bpf.cc (bpf_function_prologue): Defined target
hook.
* config/bpf/coreout.cc (btf_ext_info_section)
(btf_ext_info): Moved from coreout.h
(btf_ext_funcinfo, btf_ext_lineinfo): Added struct.
(bpf_core_reloc): Renamed to btf_ext_core_reloc.
(btf_ext): Added static variable.
(btfext_info_sec_find_or_add, SEARCH_NODE_AND_RETURN)
(bpf_create_or_find_funcinfo, bpt_create_core_reloc)
(btf_ext_add_string, btf_funcinfo_type_callback)
(btf_add_func_info_for, btf_validate_funcinfo)
(btf_ext_info_len, output_btfext_func_info): Added function.
(output_btfext_header, bpf_core_reloc_add)
(output_btfext_core_relocs, btf_ext_init, btf_ext_output):
Changed to support new structs.
* config/bpf/coreout.h (btf_ext_funcinfo, btf_ext_lineinfo):
Moved and changed in coreout.cc.
(btf_add_func_info_for, btf_ext_add_string): Added prototypes.

gcc/testsuite/ChangeLog:
PR target/113453
* gcc.target/bpf/btfext-funcinfo-nocore.c: Added.
* gcc.target/bpf/btfext-funcinfo.c: Added.
* gcc.target/bpf/core-attr-5.c: Fixed regexp.
* gcc.target/bpf/core-attr-6.c: Fixed regexp.
* gcc.target/bpf/core-builtin-fieldinfo-offset-1.c: Fixed regexp.
* gcc.target/bpf/core-section-1.c: Fixed regexp.
---
 gcc/config/bpf/bpf.cc |  12 +
 gcc/config/bpf/coreout.cc | 518 +-
 gcc/config/bpf/coreout.h  |  20 +-
 .../gcc.target/bpf/btfext-funcinfo-nocore.c   |  42 ++
 .../gcc.target/bpf/btfext-funcinfo.c  |  46 ++
 gcc/testsuite/gcc.target/bpf/core-attr-5.c|   9 +-
 gcc/testsuite/gcc.target/bpf/core-attr-6.c|   6 +-
 .../bpf/core-builtin-fieldinfo-offset-1.c |  13 +-
 gcc/testsuite/gcc.target/bpf/core-section-1.c |   2 +-
 9 files changed, 506 insertions(+), 162 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/bpf/btfext-funcinfo-nocore.c
 create mode 100644 gcc/testsuite/gcc.target/bpf/btfext-funcinfo.c

diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc
index 4318b26b9cda..ea47e3a8dbfb 100644
--- a/gcc/config/bpf/bpf.cc
+++ b/gcc/config/bpf/bpf.cc
@@ -385,6 +385,18 @@ bpf_compute_frame_layout (void)
 #undef TARGET_COMPUTE_FRAME_LAYOUT
 #define TARGET_COMPUTE_FRAME_LAYOUT bpf_compute_frame_layout
 
+/* Defined to initialize data for func_info region in .BTF.ext section.  */
+
+static void
+bpf_function_prologue (FILE *f ATTRIBUTE_UNUSED)
+{
+  if (btf_debuginfo_p ())
+btf_add_func_info_for (cfun->decl, current_function_func_begin_label);
+}
+
+#undef TARGET_ASM_FUNCTION_PROLOGUE
+#define TARGET_ASM_FUNCTION_PROLOGUE bpf_function_prologue
+
 /* Expand to the instructions in a function prologue.  This function
is called when expanding the 'prologue' pattern in bpf.md.  */
 
diff --git a/gcc/config/bpf/coreout.cc b/gcc/config/bpf/coreout.cc
index 2f06ec2a0f29..31b2abc3151b 100644
--- a/gcc/config/bpf/coreout.cc
+++ b/gcc/config/bpf/coreout.cc
@@ -31,6 +31,7 @@
 #include "btf.h"
 #include "rtl.h"
 #include "tree-pretty-print.h"
+#include "cgraph.h"
 
 #include "coreout.h"
 
@@ -95,64 +96,193 @@
result, a single .BTF.ext section can contain CO-RE relocations for multiple
programs in distinct sections.  */
 
-/* Internal representation of a BPF CO-RE relocation record.  */
+/* BTF.ext debug info section.  */
+static GTY (()) section * btf_ext_info_section;
+
+#ifndef BTF_EXT_INFO_SECTION_NAME
+#define BTF_EXT_INFO_SECTION_NAME ".BTF.ext"
+#endif
+#define BTF_EXT_INFO_SECTION_FLAGS (SECTION_DEBUG)
+
+#ifndef BTF_EXT_INFO_SECTION_LABEL
+#define BTF_EXT_INFO_SECTION_LABEL "Lbtfext"
+#endif
+
+#define MAX_BTF_EXT_LABEL_BYTES 40
+static char btf_ext_info_section_label[MAX_BTF_EXT_LABEL_BYTES];
+
+/* A funcinfo record, in the .BTF.ext funcinfo section.  */
+struct GTY ((chain_next ("%h.next"))) btf_ext_funcinfo
+{
+  uint32_t type; /* Type ID of a BTF_KIND_FUNC type.  */
+  const char *fnname;
+  const char *label;
+
+  struct btf_ext_funcinfo *next; /* Linked list to collect func_info elems.  */
+};
+
+/* A lineinfo record, in the .BTF.ext lineinfo section.  */
+struct GTY ((chain_next ("%h.next"))) btf_ext_lineinfo
+{
+  uint32_t insn_off;  /* Offset of the instruction.  */
+  uint32_t file_name_off; /* Offset of file name in BTF string table.  */
+  uint32_t line_off;  /* Offset of source line in BTF string table.  */
+  uint32_t 

[PATCH v2 3/5] bpf: Always emit .BTF.ext section if generating BTF

2024-02-27 Thread Cupertino Miranda
BPF applications, when generating BTF information should always create a
.BTF.ext section.
Current implementation was only creating it when -mco-re option was used.
This patch makes .BTF.ext always be generated for BPF target objects.
The patch also adds conditions around btf_finalize function call
such that BTF deallocation happens later for BPF target.
For BPF, btf_finalize is only called after .BTF.ext is generated.

gcc/ChangeLog:

* config/bpf/bpf.cc (bpf_option_override): Make .BTF.ext
enabled by default for BPF.
(bpf_file_end): Call BTF deallocation.
* dwarf2ctf.cc (ctf_debug_finalize): Conditionally execute BTF
deallocation.
---
 gcc/config/bpf/bpf.cc | 20 +---
 gcc/dwarf2ctf.cc  | 12 ++--
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/gcc/config/bpf/bpf.cc b/gcc/config/bpf/bpf.cc
index d6ca47eeecbe..4318b26b9cda 100644
--- a/gcc/config/bpf/bpf.cc
+++ b/gcc/config/bpf/bpf.cc
@@ -195,10 +195,8 @@ bpf_option_override (void)
   if (TARGET_BPF_CORE && !btf_debuginfo_p ())
 error ("BPF CO-RE requires BTF debugging information, use %<-gbtf%>");
 
-  /* To support the portability needs of BPF CO-RE approach, BTF debug
- information includes the BPF CO-RE relocations.  */
-  if (TARGET_BPF_CORE)
-write_symbols |= BTF_WITH_CORE_DEBUG;
+  /* BPF applications always generate .BTF.ext.  */
+  write_symbols |= BTF_WITH_CORE_DEBUG;
 
   /* Unlike much of the other BTF debug information, the information necessary
  for CO-RE relocations is added to the CTF container by the BPF backend.
@@ -218,10 +216,7 @@ bpf_option_override (void)
   /* -gbtf implies -mcore when using the BPF backend, unless -mno-co-re
  is specified.  */
   if (btf_debuginfo_p () && !(target_flags_explicit & MASK_BPF_CORE))
-{
-  target_flags |= MASK_BPF_CORE;
-  write_symbols |= BTF_WITH_CORE_DEBUG;
-}
+target_flags |= MASK_BPF_CORE;
 
   /* Determine available features from ISA setting (-mcpu=).  */
   if (bpf_has_jmpext == -1)
@@ -267,7 +262,7 @@ bpf_option_override (void)
 static void
 bpf_asm_init_sections (void)
 {
-  if (TARGET_BPF_CORE)
+  if (btf_debuginfo_p () && btf_with_core_debuginfo_p ())
 btf_ext_init ();
 }
 
@@ -279,8 +274,11 @@ bpf_asm_init_sections (void)
 static void
 bpf_file_end (void)
 {
-  if (TARGET_BPF_CORE)
-btf_ext_output ();
+  if (btf_debuginfo_p () && btf_with_core_debuginfo_p ())
+{
+  btf_ext_output ();
+  btf_finalize ();
+}
 }
 
 #undef TARGET_ASM_FILE_END
diff --git a/gcc/dwarf2ctf.cc b/gcc/dwarf2ctf.cc
index 93e5619933fa..dca86edfffa9 100644
--- a/gcc/dwarf2ctf.cc
+++ b/gcc/dwarf2ctf.cc
@@ -944,7 +944,10 @@ ctf_debug_finalize (const char *filename, bool btf)
   if (btf)
 {
   btf_output (filename);
-  btf_finalize ();
+  /* btf_finalize when compiling BPF applications gets deallocated by the
+BPF target in bpf_file_end.  */
+  if (btf_debuginfo_p () && !btf_with_core_debuginfo_p ())
+   btf_finalize ();
 }
 
   else
@@ -1027,11 +1030,8 @@ ctf_debug_finish (const char * filename)
   /* Emit BTF debug info here when CO-RE relocations need to be generated.
  BTF with CO-RE relocations needs to be generated when CO-RE is in effect
  for the BPF target.  */
-  if (btf_with_core_debuginfo_p ())
-{
-  gcc_assert (btf_debuginfo_p ());
-  ctf_debug_finalize (filename, btf_debuginfo_p ());
-}
+  if (btf_debuginfo_p () && btf_with_core_debuginfo_p ())
+ctf_debug_finalize (filename, btf_debuginfo_p ());
 }
 
 #include "gt-dwarf2ctf.h"
-- 
2.39.2



[PATCH v2 2/5] btf: added KIND_FUNC traversal function.

2024-02-27 Thread Cupertino Miranda
Added a traversal function to traverse all BTF_KIND_FUNC nodes with a
callback function. Used for .BTF.ext section content creation.

gcc/ChangeLog:

* btfout.cc (output_btf_func_types): Use FOR_EACH_VEC_ELT.
(traverse_btf_func_types): Defined function.
* ctfc.h (funcs_traverse_callback): Typedef for function
prototype.
(traverse_btf_func_types): Added prototype.
---
 gcc/btfout.cc | 22 --
 gcc/ctfc.h|  3 +++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/gcc/btfout.cc b/gcc/btfout.cc
index 7e114e224449..7aabd99f3e7c 100644
--- a/gcc/btfout.cc
+++ b/gcc/btfout.cc
@@ -1276,8 +1276,10 @@ output_btf_types (ctf_container_ref ctfc)
 static void
 output_btf_func_types (ctf_container_ref ctfc)
 {
-  for (size_t i = 0; i < vec_safe_length (funcs); i++)
-btf_asm_func_type (ctfc, (*funcs)[i], i);
+  ctf_dtdef_ref ref;
+  unsigned i;
+  FOR_EACH_VEC_ELT (*funcs, i, ref)
+btf_asm_func_type (ctfc, ref, i);
 }
 
 /* Output all BTF_KIND_DATASEC records.  */
@@ -1452,4 +1454,20 @@ btf_finalize (void)
   tu_ctfc = NULL;
 }
 
+/* Traversal function for all BTF_KIND_FUNC type records.  */
+
+bool
+traverse_btf_func_types (funcs_traverse_callback callback, void *data)
+{
+  ctf_dtdef_ref ref;
+  unsigned i;
+  FOR_EACH_VEC_ELT (*funcs, i, ref)
+{
+  bool stop = callback (ref, data);
+  if (stop == true)
+   return true;
+}
+  return false;
+}
+
 #include "gt-btfout.h"
diff --git a/gcc/ctfc.h b/gcc/ctfc.h
index 7aac57edac55..fa188bf2f5a4 100644
--- a/gcc/ctfc.h
+++ b/gcc/ctfc.h
@@ -441,6 +441,9 @@ extern int ctf_add_variable (ctf_container_ref, const char 
*, ctf_id_t,
 extern ctf_id_t ctf_lookup_tree_type (ctf_container_ref, const tree);
 extern ctf_id_t get_btf_id (ctf_id_t);
 
+typedef bool (*funcs_traverse_callback) (ctf_dtdef_ref, void *);
+bool traverse_btf_func_types (funcs_traverse_callback, void *);
+
 /* CTF section does not emit location information; at this time, location
information is needed for BTF CO-RE use-cases.  */
 
-- 
2.39.2



bpf: PR target/113453 func_info .BTF.ext implementation

2024-02-27 Thread Cupertino Miranda
Hi everyone,

Just an updated version of the patches based on recent reviews from
David Faust.
Thanks for the feedback.

Regards,
Cupertino




[PATCH v2 1/5] btf: fixed type id in BTF_KIND_FUNC struct data.

2024-02-27 Thread Cupertino Miranda
This patch correct the aditition of +1 on the type id, which originally
was done in the wrong location and leaded to func_sts->dtd_type for
BTF_KIND_FUNCS struct data to contain the type id of the previous entry.

gcc/ChangeLog:

* btfout.cc (btf_collect_datasec): Corrected BTF type id.
---
 gcc/btfout.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/btfout.cc b/gcc/btfout.cc
index dcf751f8fe0d..7e114e224449 100644
--- a/gcc/btfout.cc
+++ b/gcc/btfout.cc
@@ -457,7 +457,8 @@ btf_collect_datasec (ctf_container_ref ctfc)
   func_dtd->dtd_data.ctti_type = dtd->dtd_type;
   func_dtd->linkage = dtd->linkage;
   func_dtd->dtd_name = dtd->dtd_name;
-  func_dtd->dtd_type = num_types_added + num_types_created;
+  /* +1 for the sentinel type not in the types map.  */
+  func_dtd->dtd_type = num_types_added + num_types_created + 1;
 
   /* Only the BTF_KIND_FUNC type actually references the name. The
 BTF_KIND_FUNC_PROTO is always anonymous.  */
@@ -480,8 +481,7 @@ btf_collect_datasec (ctf_container_ref ctfc)
 
  struct btf_var_secinfo info;
 
- /* +1 for the sentinel type not in the types map.  */
- info.type = func_dtd->dtd_type + 1;
+ info.type = func_dtd->dtd_type;
 
  /* Both zero at compile time.  */
  info.size = 0;
-- 
2.39.2



Re: [PATCH] c++/modules: local class merging [PR99426]

2024-02-27 Thread Patrick Palka
On Mon, 26 Feb 2024, Patrick Palka wrote:

> Bootstrapped and regtested on x86_64-pc-linux-gnu, does this approach
> look reasonable?
> 
> -- >8 --
> 
> One known missing piece in the modules implementation is merging of a
> streamed-in local class with the corresponding in-TU version of the
> local class.  This missing piece turns out to cause a hard-to-reduce
> use-after-free GC issue due to the entity_ary not being marked as a GC
> root (deliberately), and manifests as a serialization error on stream-in
> as in PR99426 (see comment #6 for a reduction).  It's also reproducible
> on trunk when running the xtreme-header tests without -fno-module-lazy.
> 
> This patch makes us merge such local classes according to their position
> within the containing function's definition, similar to how we merge
> FIELD_DECLs of a class according to their index in the TYPE_FIELDS
> list.
> 
>   PR c++/99426
> 
> gcc/cp/ChangeLog:
> 
>   * module.cc (merge_kind::MK_local_class): New enumerator.
>   (merge_kind_name): Update.
>   (trees_out::chained_decls): Move BLOCK-specific handling
>   of DECL_LOCAL_DECL_P decls to ...
>   (trees_out::core_vals) <case BLOCK>: ... here.  Stream
>   BLOCK_VARS manually.
>   (trees_in::core_vals) <case BLOCK>: Stream BLOCK_VARS
>   manually.  Handle deduplicated local classes.
>   (trees_out::key_local_class): Define.
>   (trees_in::key_local_class): Define.
>   (trees_out::get_merge_kind) : Return
>   MK_local_class for a local class.
>   (trees_out::key_mergeable) : Use
>   key_local_class.
>   (trees_in::key_mergeable) : Likewise.
>   (trees_in::is_matching_decl): Be flexible with type mismatches
>   for local entities.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.dg/modules/xtreme-header-7_a.H: New test.
>   * g++.dg/modules/xtreme-header-7_b.C: New test.

> ---
>  gcc/cp/module.cc  | 167 +++---
>  .../g++.dg/modules/xtreme-header-7_a.H|   4 +
>  .../g++.dg/modules/xtreme-header-7_b.C|   6 +
>  3 files changed, 149 insertions(+), 28 deletions(-)
>  create mode 100644 gcc/testsuite/g++.dg/modules/xtreme-header-7_a.H
>  create mode 100644 gcc/testsuite/g++.dg/modules/xtreme-header-7_b.C
> 
> diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
> index fa91c6ff9cb..f77f73a59ed 100644
> --- a/gcc/cp/module.cc
> +++ b/gcc/cp/module.cc
> @@ -2771,6 +2771,7 @@ enum merge_kind
>  
>MK_enum,   /* Found by CTX, & 1stMemberNAME.  */
>MK_keyed, /* Found by key & index.  */
> +  MK_local_class, /* Found by CTX, index.  */
>  
>MK_friend_spec,  /* Like named, but has a tmpl & args too.  */
>MK_local_friend, /* Found by CTX, index.  */
> @@ -2799,7 +2800,7 @@ static char const *const merge_kind_name[MK_hwm] =
>  "unique", "named", "field", "vtable",/* 0...3  */
>  "asbase", "partial", "enum", "attached", /* 4...7  */
>  
> -"friend spec", "local friend", NULL, NULL,  /* 8...11 */
> +"local class", "friend spec", "local friend", NULL,  /* 8...11 */
>  NULL, NULL, NULL, NULL,
>  
>  "type spec", "type tmpl spec",   /* 16,17 type (template).  */
> @@ -2928,6 +2929,7 @@ public:
>unsigned binfo_mergeable (tree *);
>  
>  private:
> +  tree key_local_class (const merge_key&, tree);
>uintptr_t *find_duplicate (tree existing);
>void register_duplicate (tree decl, tree existing);
>/* Mark as an already diagnosed bad duplicate.  */
> @@ -3086,6 +3088,7 @@ public:
>void binfo_mergeable (tree binfo);
>  
>  private:
> +  void key_local_class (merge_key&, tree, tree);
>bool decl_node (tree, walk_kind ref);
>void type_node (tree);
>void tree_value (tree);
> @@ -4952,18 +4955,7 @@ void
>  trees_out::chained_decls (tree decls)
>  {
>for (; decls; decls = DECL_CHAIN (decls))
> -{
> -  if (VAR_OR_FUNCTION_DECL_P (decls)
> -   && DECL_LOCAL_DECL_P (decls))
> - {
> -   /* Make sure this is the first encounter, and mark for
> -  walk-by-value.  */
> -   gcc_checking_assert (!TREE_VISITED (decls)
> -&& !DECL_TEMPLATE_INFO (decls));
> -   mark_by_value (decls);
> - }
> -  tree_node (decls);
> -}
> +tree_node (decls);
>tree_node (NULL_TREE);
>  }
>  
> @@ -6204,7 +6196,21 @@ trees_out::core_vals (tree t)
>  
>/* DECL_LOCAL_DECL_P decls are first encountered here and
>   streamed by value.  */
> -  chained_decls (t->block.vars);
> +  for (tree decls = t->block.vars; decls; decls = DECL_CHAIN (decls))
> + {
> +   if (VAR_OR_FUNCTION_DECL_P (decls)
> +   && DECL_LOCAL_DECL_P (decls))
> + {
> +   /* Make sure this is the first encounter, and mark for
> +  walk-by-value.  */
> +   gcc_checking_assert (!TREE_VISITED (decls)
> +&& !DECL_TEMPLATE_INFO (decls));
> +   mark_by_value (decls);
> + }
> +   tree_node (decls);
> + }
> +   

[PATCH v2] libstdc++: Fix error handling in std::print

2024-02-27 Thread Jonathan Wakely
On Tue, 27 Feb 2024 at 15:21, Tim Song wrote:
>
> [print.fun] requires a system_error, but I don't think 
> [ostream.formatted.print] does?

Yeah it looks like I got confused (again) jumping back and forth
between [print.fun] and [ostream.formatted.print]. So we're doing the
right thing, and should just add tests to verify that.
commit e1c689dbeb5b6364eb2a2f0af20ced07b8096b82
Author: Jonathan Wakely 
Date:   Thu Feb 22 13:06:59 2024

libstdc++: Test error handling in std::print

The standard requires an exception if std::print fails to write to a
FILE*. When writing to a std::ostream, failure to format the arguments
    doesn't affect the stream state, but failure to write to the stream
sets badbit.

libstdc++-v3/ChangeLog:

* testsuite/27_io/basic_ostream/print/1.cc: Check error
handling.
* testsuite/27_io/print/1.cc: Likewise.

diff --git a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc 
b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
index 71a4daa04c9..cd4b116ac1c 100644
--- a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
+++ b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
@@ -103,6 +103,40 @@ test_locale()
   }
 }
 
+void
+test_errors()
+{
+  // Failure to generate output is reported by setting badbit.
+  std::stringstream in(std::ios::in);
+  std::print(in, "{}", "nope"); // No exception here.
+  VERIFY(in.bad());
+#ifdef __cpp_exceptions
+  in.clear();
+  in.exceptions(std::ios::badbit);
+  try
+  {
+std::print(in, "{}", "nope"); // Should throw now.
+VERIFY(false);
+  }
+  catch (const std::ios::failure&)
+  {
+  }
+
+  // An exception thrown when formatting the string is propagated
+  // without setting badbit.
+  std::ostringstream out;
+  try
+  {
+std::vprint_nonunicode(out, "{}", std::make_format_args());
+VERIFY(false);
+  }
+  catch (const std::format_error&)
+  {
+  }
+  VERIFY(out.good());
+#endif
+}
+
 int main()
 {
   test_print_ostream();
@@ -111,4 +145,5 @@ int main()
   test_print_no_padding();
   test_vprint_nonunicode();
   test_locale();
+  test_errors();
 }
diff --git a/libstdc++-v3/testsuite/27_io/print/1.cc 
b/libstdc++-v3/testsuite/27_io/print/1.cc
index 6a294e0454b..d570f7938be 100644
--- a/libstdc++-v3/testsuite/27_io/print/1.cc
+++ b/libstdc++-v3/testsuite/27_io/print/1.cc
@@ -74,6 +74,21 @@ test_vprint_nonunicode()
   // { dg-output "garbage in . garbage out" }
 }
 
+void
+test_errors()
+{
+#ifdef __cpp_exceptions
+  try
+  {
+std::print(stdin, "{}", "nope");
+VERIFY(false);
+  }
+  catch (const std::system_error&)
+  {
+  }
+#endif
+}
+
 int main()
 {
   test_print_default();
@@ -82,4 +97,5 @@ int main()
   test_println_file();
   test_print_raw();
   test_vprint_nonunicode();
+  test_errors();
 }


Re: [PATCH] c++/modules: optimize tree flag streaming

2024-02-27 Thread Patrick Palka
On Fri, 16 Feb 2024, Patrick Palka wrote:

> On Thu, 15 Feb 2024, Patrick Palka wrote:
> 
> > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look
> > OK for trunk?
> > 
> > -- >8 --
> > 
> > One would expect consecutive calls to bytes_in/out::b for streaming
> > adjacent bits, as we do for tree flag streaming, to at least be
> > optimized by the compiler into individual bit operations using
> > statically known bit positions (and ideally merged into larger sized
> > reads/writes).
> > 
> > Unfortunately this doesn't happen because the compiler has trouble
> > tracking the values of this->bit_pos and this->bit_val across such
> > calls, likely because the compiler doesn't know 'this' and so it's
> > treated as global memory.  This means for each consecutive bit stream
> > operation, bit_pos and bit_val are loaded from memory, checked if
> > buffering is needed, and finally the bit is extracted from bit_val
> > according to the (unknown) bit_pos, even though relative to the previous
> > operation (if we didn't need to buffer) bit_val is unchanged and bit_pos
> > is just 1 larger.  This ends up being quite slow, with tree_node_bools
> > taking 10% of time when streaming in parts of the std module.
> > 
> > This patch optimizes this by making tracking of bit_pos and bit_val
> > easier for the compiler.  Rather than bit_pos and bit_val being members
> > of the (effectively global) bytes_in/out objects, this patch factors out
> > the bit streaming code/state into separate classes bits_in/out that get
> > constructed locally as needed for bit streaming.  Since these objects
> > are now clearly local, the compiler can more easily track their values.
> > 
> > And since bit streaming is intended to be batched it's natural for these
> > new classes to be RAII-enabled such that the bit stream is flushed upon
> > destruction.
> > 
> > In order to make the most of this improved tracking of bit position,
> > this patch changes parts where we conditionally stream a tree flag
> > to unconditionally stream (the flag or a dummy value).  That way
> > the number of bits streamed and the respective bit positions are as
> > statically known as reasonably possible.  In lang_decl_bools and
> > lang_type_bools we flush the current bit buffer at the start so that
> > subsequent bit positions are statically known.  And in core bools, we
> > can add explicit early exits utilizing invariants that the compiler
> > can't figure out itself (e.g. a tree code can't have both TS_TYPE_COMMON
> > and TS_DECL_COMMON, and if a tree code doesn't have TS_DECL_COMMON then
> > it doesn't have TS_DECL_WITH_VIS).  Finally if we're streaming fewer
> > than 4 bits, it's more space efficient to stream them as individual
> > bytes rather than as packed bits (due to the 32-bit buffer).
> 
> Oops, this last sentence is wrong.  Although the size of the bit buffer
> is 32 bits, upon flushing we rewind unused bytes within the buffer,
> which means streaming 2-8 bits ends up using only one byte not all four.
> So v2 below undoes this pessimization.
> 
> > This patch also moves the definitions of the relevant streaming classes
> > into anonymous namespaces so that the compiler can make more informed
> > decisions about inlining their member functions.
> > 
> > After this patch, compile time for a simple Hello World using the std
> > module is reduced by 7% with a release compiler.  The on-disk size of
> > the std module increases by 0.7% (presumably due to the extra flushing
> > done in lang_decl_bools and lang_type_bools).
> 
> The on-disk std module now only grows 0.4% instead of 0.7%.
> 
> > 
> > The bit stream out performance isn't improved as much as the stream in
> > due to the spans/lengths instrumentation performed on stream out (which
> > probably should be e.g. removed for release builds?)
> 
> -- >8 --
> 
> gcc/cp/ChangeLog:
> 
>   * module.cc: Update comment about classes defined.
>   (class data): Enclose in an anonymous namespace.
>   (data::calc_crc): Moved from bytes::calc_crc.
>   (class bytes): Remove.  Move bit_flush to namespace scope.
>   (class bytes_in): Enclose in an anonymous namespace.  Inherit
>   directly from data and adjust accordingly.  Move b and bflush
>   members to bits_in.
>   (class bytes_out): As above.  Remove is_set static data member.
>   (bit_flush): Moved from class bytes.
>   (struct bits_in): Define.
>   (struct bits_out): Define.
>   (bytes_out::bflush): Moved to bits_out/in.
>   (bytes_in::bflush): Likewise
>   (bytes_in::bfill): Removed.
>   (bytes_out::b): Moved to bits_out/in.
>   (bytes_in::b): Likewise.
>   (class trees_in): Enclose in an anonymous namespace.
>   (class trees_out): Enclose in an anonymous namespace.
>   (trees_out::core_bools): Add bits_out/in parameter and use it.
>   Unconditionally stream a bit for public_flag.  Add early exits
>   as appropriate.
>   (trees_in::core_bools): Likewise.
>

Re: [PATCH] c++/modules: relax diagnostic about GMF contents

2024-02-27 Thread Patrick Palka
On Thu, 15 Feb 2024, Patrick Palka wrote:

> On Thu, 15 Feb 2024, Jason Merrill wrote:
> 
> > On 2/15/24 16:10, Patrick Palka wrote:
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look
> > > OK for trunk?
> > > 
> > > -- >8 --
> > > 
> > > Issuing a hard error when the GMF doesn't contain preprocessing
> > > directives is inconvenient for automated testcase reduction via cvise.
> > > This patch relaxes this diagnostic into a pedwarn.
> > > 
> > > gcc/cp/ChangeLog:
> > > 
> > >   * parser.cc (cp_parser_translation_unit): Relax GMF contents
> > >   error into a pedwarn.
> > > 
> > > gcc/testsuite/ChangeLog:
> > > 
> > >   * g++.dg/modules/friend-6_a.C: Remove now unnecessary
> > >   preprocessing directives from GMF.
> > > ---
> > >   gcc/cp/parser.cc  | 6 +++---
> > >   gcc/testsuite/g++.dg/modules/friend-6_a.C | 2 --
> > >   2 files changed, 3 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
> > > index 9d0914435fb..e60f0425035 100644
> > > --- a/gcc/cp/parser.cc
> > > +++ b/gcc/cp/parser.cc
> > > @@ -5253,9 +5253,9 @@ cp_parser_translation_unit (cp_parser* parser)
> > > if (!warned)
> > >   {
> > > warned = true;
> > > -   error_at (token->location,
> > > - "global module fragment contents must be"
> > > - " from preprocessor inclusion");
> > > +   pedwarn (token->location, OPT_Wpedantic,
> > > +"global module fragment contents must be"
> > > +" from preprocessor inclusion");
> > 
> > Relaxing to pedwarn is fine, but I think it should be on by default, not 
> > just
> > with -pedantic.  So it should get a new option.
> 
> Ah, like so?  I'm not sure about naming the option Wmodules-gmf-contents
> vs just Wgmf-contents, or something else...

Ping.

> 
> -- >8 --
> 
> Subject: [PATCH] c++/modules: relax diagnostic about GMF contents
> 
> Issuing a hard error when the GMF doesn't contain preprocessing
> directives is inconvenient for automated testcase reduction via cvise.
> This patch relaxes this diagnostic into a pedwarn that can be disabled
> with -Wno-modules-gmf-contents.
> 
> gcc/c-family/ChangeLog:
> 
>   * c.opt (Wmodules-gmf-contents): New warning.
> 
> gcc/cp/ChangeLog:
> 
>   * parser.cc (cp_parser_translation_unit): Relax GMF contents
>   error into a pedwarn.
> 
> gcc/ChangeLog:
> 
>   * doc/invoke.texi (-Wno-modules-gmf-contents): Document.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.dg/modules/friend-6_a.C: Pass -Wno-modules-gmf-contents
>   instead of -Wno-pedantic.  Remove now unnecessary preprocessing
>   directives from GMF.
> ---
>  gcc/c-family/c.opt| 4 
>  gcc/cp/parser.cc  | 6 +++---
>  gcc/doc/invoke.texi   | 7 +++
>  gcc/testsuite/g++.dg/modules/friend-6_a.C | 4 +---
>  4 files changed, 15 insertions(+), 6 deletions(-)
> 
> diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
> index b7a4a1a68e3..6d5f3199b33 100644
> --- a/gcc/c-family/c.opt
> +++ b/gcc/c-family/c.opt
> @@ -993,6 +993,10 @@ Wmissing-variable-declarations
>  C ObjC Var(warn_missing_variable_declarations) Warning
>  Warn about global variables without previous declarations.
>  
> +Wmodules-gmf-contents
> +C++ ObjC++ Var(warn_modules_gmf_contents) Warning Init(1)
> +Warn about the global module fragment not containing only preprocessing 
> directives.
> +
>  Wmudflap
>  C ObjC C++ ObjC++ WarnRemoved
>  
> diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
> index 9d0914435fb..eb709fc9e38 100644
> --- a/gcc/cp/parser.cc
> +++ b/gcc/cp/parser.cc
> @@ -5253,9 +5253,9 @@ cp_parser_translation_unit (cp_parser* parser)
> if (!warned)
>   {
> warned = true;
> -   error_at (token->location,
> - "global module fragment contents must be"
> - " from preprocessor inclusion");
> +   pedwarn (token->location, OPT_Wmodules_gmf_contents,
> +"global module fragment contents must be"
> +" from preprocessor inclusion");
>   }
>   }
>   }
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 54cde59..e27fda88bc4 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -261,6 +261,7 @@ in the following sections.
>  -Winvalid-constexpr -Winvalid-imported-macros
>  -Wno-invalid-offsetof  -Wno-literal-suffix
>  -Wmismatched-new-delete -Wmismatched-tags
> +-Wno-modules-gmf-contents
>  -Wmultiple-inheritance  -Wnamespaces  -Wnarrowing
>  -Wnoexcept  -Wnoexcept-type  -Wnon-virtual-dtor
>  -Wpessimizing-move  -Wno-placement-new  -Wplacement-new=@var{n}
> @@ -4568,6 +4569,12 @@ unresolved references due to the difference in the 
> mangling of symbols
>  declared with different class-keys.  The option can be used 

Re: [PATCH] calls: Fix up TYPE_NO_NAMED_ARGS_STDARG_P handling [PR107453]

2024-02-27 Thread Jakub Jelinek
On Tue, Feb 27, 2024 at 06:25:21PM +0100, Jakub Jelinek wrote:
> I guess we need some testsuite coverage for caller/callee ABI match of
> struct S { char p[64]; };
> struct S foo (...);

Maybe the test below?  Passes on x86_64 -m32/-m64, but I guess that doesn't
care at all about the named vs. not named distinction.
The test is a copy of c23-stdarg-4.c, just with all the functions returning
a large struct.

2024-02-27  Jakub Jelinek  

* gcc.dg/c23-stdarg-6.c: New test.

--- gcc/testsuite/gcc.dg/c23-stdarg-6.c.jj  2024-02-27 18:39:04.807821107 
+0100
+++ gcc/testsuite/gcc.dg/c23-stdarg-6.c 2024-02-27 18:51:44.706308490 +0100
@@ -0,0 +1,217 @@
+/* Test C23 variadic functions with no named parameters, or last named
+   parameter with a declaration not allowed in C17.  Execution tests.  */
+/* { dg-do run } */
+/* { dg-options "-std=c23 -pedantic-errors" } */
+
+#include <stdarg.h>
+
+extern void abort (void);
+extern void exit (int);
+struct s { char c[1000]; };
+
+struct s
+f (...)
+{
+  va_list ap;
+  va_start (ap);
+  double r = va_arg (ap, int);
+  r += va_arg (ap, double);
+  r += va_arg (ap, int);
+  r += va_arg (ap, double);
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = r;
+  ret.c[999] = 42;
+  return ret;
+}
+
+struct s
+g (...)
+{
+  va_list ap;
+  va_start (ap, random ! ignored, ignored ** text);
+  for (int i = 0; i < 10; i++)
+if (va_arg (ap, double) != i)
+  abort ();
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 17;
+  ret.c[999] = 58;
+  return ret;
+}
+
+struct s
+h1 (register int x, ...)
+{
+  va_list ap;
+  va_start (ap);
+  for (int i = 0; i < 10; i++)
+{
+  if (va_arg (ap, double) != i)
+   abort ();
+  i++;
+  if (va_arg (ap, int) != i)
+   abort ();
+}
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 32;
+  ret.c[999] = 95;
+  return ret;
+}
+
+struct s
+h2 (int x(), ...)
+{
+  va_list ap;
+  va_start (ap);
+  for (int i = 0; i < 10; i++)
+{
+  if (va_arg (ap, double) != i)
+   abort ();
+  i++;
+  if (va_arg (ap, int) != i)
+   abort ();
+}
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 5;
+  ret.c[999] = 125;
+  return ret;
+}
+
+struct s
+h3 (int x[10], ...)
+{
+  va_list ap;
+  va_start (ap);
+  for (int i = 0; i < 10; i++)
+{
+  if (va_arg (ap, double) != i)
+   abort ();
+  i++;
+  if (va_arg (ap, int) != i)
+   abort ();
+}
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 8;
+  ret.c[999] = 12;
+  return ret;
+}
+
+struct s
+h4 (char x, ...)
+{
+  va_list ap;
+  va_start (ap);
+  for (int i = 0; i < 10; i++)
+{
+  if (va_arg (ap, double) != i)
+   abort ();
+  i++;
+  if (va_arg (ap, int) != i)
+   abort ();
+}
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 18;
+  ret.c[999] = 28;
+  return ret;
+}
+
+struct s
+h5 (float x, ...)
+{
+  va_list ap;
+  va_start (ap);
+  for (int i = 0; i < 10; i++)
+{
+  if (va_arg (ap, double) != i)
+   abort ();
+  i++;
+  if (va_arg (ap, int) != i)
+   abort ();
+}
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 38;
+  ret.c[999] = 48;
+  return ret;
+}
+
+struct s
+h6 (volatile long x, ...)
+{
+  va_list ap;
+  va_start (ap);
+  for (int i = 0; i < 10; i++)
+{
+  if (va_arg (ap, double) != i)
+   abort ();
+  i++;
+  if (va_arg (ap, int) != i)
+   abort ();
+}
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 58;
+  ret.c[999] = 68;
+  return ret;
+}
+
+struct s
+h7 (volatile struct s x, ...)
+{
+  va_list ap;
+  va_start (ap);
+  for (int i = 0; i < 10; i++)
+{
+  if (va_arg (ap, double) != i)
+   abort ();
+  i++;
+  if (va_arg (ap, int) != i)
+   abort ();
+}
+  va_end (ap);
+  struct s ret = {};
+  ret.c[0] = 78;
+  ret.c[999] = 88;
+  return ret;
+}
+
+int
+main ()
+{
+  struct s x = f (1, 2.0, 3, 4.0);
+  if (x.c[0] != 10 || x.c[999] != 42)
+abort ();
+  x = g (0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
+  if (x.c[0] != 17 || x.c[999] != 58)
+abort ();
+  x = g (0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f);
+  if (x.c[0] != 17 || x.c[999] != 58)
+abort ();
+  x = h1 (0, 0.0, 1, 2.0, 3, 4.0, 5, 6.0, 7, 8.0, 9);
+  if (x.c[0] != 32 || x.c[999] != 95)
+abort ();
+  x = h2 (0, 0.0, 1, 2.0, 3, 4.0, 5, 6.0, 7, 8.0, 9);
+  if (x.c[0] != 5 || x.c[999] != 125)
+abort ();
+  x = h3 (0, 0.0, 1, 2.0, 3, 4.0, 5, 6.0, 7, 8.0, 9);
+  if (x.c[0] != 8 || x.c[999] != 12)
+abort ();
+  x = h4 (0, 0.0, 1, 2.0, 3, 4.0, 5, 6.0, 7, 8.0, 9);
+  if (x.c[0] != 18 || x.c[999] != 28)
+abort ();
+  x = h5 (0, 0.0, 1, 2.0, 3, 4.0, 5, 6.0, 7, 8.0, 9);
+  if (x.c[0] != 38 || x.c[999] != 48)
+abort ();
+  x = h6 (0, 0.0, 1, 2.0, 3, 4.0, 5, 6.0, 7, 8.0, 9);
+  if (x.c[0] != 58 || x.c[999] != 68)
+abort ();
+  x = h7 ((struct s) {}, 0.0, 1, 2.0, 3, 4.0, 5, 6.0, 7, 8.0, 9);
+  if (x.c[0] != 78 || x.c[999] != 88)
+abort ();
+  exit (0);
+}


Jakub



[committed] i386: psrlq is not used for PERM [PR113871]

2024-02-27 Thread Uros Bizjak
Also handle V2BF mode.

PR target/113871

gcc/ChangeLog:

* config/i386/mmx.md (V248FI): Add V4BF mode.
(V24FI_32): Add V2BF mode.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr113871-5a.c: New test.
* gcc.target/i386/pr113871-5b.c: New test.

Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Uros.
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 075309cca9f..2856ae6ffef 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -85,9 +85,9 @@ (define_mode_iterator V2FI [V2SF V2SI])
 
 (define_mode_iterator V24FI [V2SF V2SI V4HF V4HI])
 
-(define_mode_iterator V248FI [V2SF V2SI V4HF V4HI V8QI])
+(define_mode_iterator V248FI [V2SF V2SI V4HF V4BF V4HI V8QI])
 
-(define_mode_iterator V24FI_32 [V2HF V2HI V4QI])
+(define_mode_iterator V24FI_32 [V2HF V2BF V2HI V4QI])
 
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
diff --git a/gcc/testsuite/gcc.target/i386/pr113871-5a.c 
b/gcc/testsuite/gcc.target/i386/pr113871-5a.c
new file mode 100644
index 00000000000..25ab82a6eab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113871-5a.c
@@ -0,0 +1,19 @@
+/* PR target/113871 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+typedef __bf16 vect64 __attribute__((vector_size(8)));
+
+void f (vect64 *a)
+{
+  *a = __builtin_shufflevector(*a, (vect64){0}, 1, 2, 3, 4);
+}
+
+/* { dg-final { scan-assembler "psrlq" } } */
+
+void g(vect64 *a)
+{
+  *a = __builtin_shufflevector((vect64){0}, *a, 3, 4, 5, 6);
+}
+
+/* { dg-final { scan-assembler "psllq" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr113871-5b.c 
b/gcc/testsuite/gcc.target/i386/pr113871-5b.c
new file mode 100644
index 00000000000..363a0f516cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113871-5b.c
@@ -0,0 +1,19 @@
+/* PR target/113871 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef __bf16 vect32 __attribute__((vector_size(4)));
+
+void f (vect32 *a)
+{
+  *a = __builtin_shufflevector(*a, (vect32){0}, 1, 2);
+}
+
+/* { dg-final { scan-assembler "psrld" } } */
+
+void g(vect32 *a)
+{
+  *a = __builtin_shufflevector((vect32){0}, *a, 1, 2);
+}
+
+/* { dg-final { scan-assembler "pslld" } } */


Re: [PATCH] calls: Fix up TYPE_NO_NAMED_ARGS_STDARG_P handling [PR107453]

2024-02-27 Thread Jakub Jelinek
On Tue, Feb 27, 2024 at 04:41:32PM +0000, Richard Earnshaw wrote:
> > 2023-01-09  Jakub Jelinek  
> > 
> > PR target/107453
> > * calls.cc (expand_call): For calls with
> > TYPE_NO_NAMED_ARGS_STDARG_P (funtype) use zero for n_named_args.
> > Formatting fix.
> 
> This one has been festering for a while; both Alexandre and Torbjorn have 
> attempted to fix it recently, but I'm not sure either is really right...
> 
> On Arm this is causing all anonymous arguments to be passed on the stack,
> which is incorrect per the ABI.  On a target that uses
> 'pretend_outgoing_varargs_named', why is it correct to set n_named_args to
> zero?  Is it enough to guard both the statements you've added with
> !targetm.calls.pretend_outgoing_varargs_named?

I'm afraid I haven't heard of that target hook before.
All I was doing with that change was fixing a regression reported in the PR
for ppc64le/sparc/nvptx/loongarch at least.

The TYPE_NO_NAMED_ARGS_STDARG_P functions (C23 fns like void foo (...) {})
have NULL type_arg_types, so the list_length (type_arg_types) isn't done for
it, but it should be handled as if it was non-NULL but list length was 0.

So, for the
  if (type_arg_types != 0)
n_named_args
  = (list_length (type_arg_types)
 /* Count the struct value address, if it is passed as a parm.  */
 + structure_value_addr_parm);
  else if (TYPE_NO_NAMED_ARGS_STDARG_P (funtype))
n_named_args = 0;
  else
/* If we know nothing, treat all args as named.  */
n_named_args = num_actuals;
case, I think guarding it by any target hooks is wrong, although
I guess it should have been
n_named_args = structure_value_addr_parm;
instead of
n_named_args = 0;

For the second
  if (type_arg_types != 0
  && targetm.calls.strict_argument_naming (args_so_far))
;
  else if (type_arg_types != 0
   && ! targetm.calls.pretend_outgoing_varargs_named (args_so_far))
/* Don't include the last named arg.  */
--n_named_args;
  else if (TYPE_NO_NAMED_ARGS_STDARG_P (funtype))
n_named_args = 0;
  else
/* Treat all args as named.  */
n_named_args = num_actuals;
bet (but no testing done, don't even know which targets return what for
those hooks) we should treat those as if type_arg_types was non-NULL
with 0 elements in the list, except the --n_named_args doesn't make sense
because that would decrease it to -1.
So perhaps
  if ((type_arg_types != 0 || TYPE_NO_NAMED_ARGS_STDARG_P (funtype))
  && targetm.calls.strict_argument_naming (args_so_far))
;
  else if (type_arg_types != 0
   && ! targetm.calls.pretend_outgoing_varargs_named (args_so_far))
/* Don't include the last named arg.  */
--n_named_args;
  else if (TYPE_NO_NAMED_ARGS_STDARG_P (funtype)
   && ! targetm.calls.pretend_outgoing_varargs_named (args_so_far))
;
  else
/* Treat all args as named.  */
n_named_args = num_actuals;

(or n_named_args = 0; instead of ; before the final else?  Dunno).
I guess we need some testsuite coverage for caller/callee ABI match of
struct S { char p[64]; };
struct S foo (...);

Jakub



Re: [PATCH] calls: Fix up TYPE_NO_NAMED_ARGS_STDARG_P handling [PR107453]

2024-02-27 Thread Richard Earnshaw
[resending, apologies, I accidentally CC'd the wrong person last time]

On 27/02/2024 16:41, Richard Earnshaw wrote:
> 
> 
> On 09/01/2023 10:32, Jakub Jelinek via Gcc-patches wrote:
>> Hi!
>>
>> On powerpc64le-linux, the following patch fixes
>> -FAIL: gcc.dg/c2x-stdarg-4.c execution test
>> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O0  execution test
>> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O1  execution test
>> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O2  execution test
>> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O2 -flto 
>> -fno-use-linker-plugin -flto-partition=none  execution test
>> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O2 -flto -fuse-linker-plugin 
>> -fno-fat-lto-objects  execution test
>> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O3 -g  execution test
>> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -Os  execution test
>> The problem is mismatch between the caller and callee side.
>> On the callee side, we do:
>>   /* NAMED_ARG is a misnomer.  We really mean 'non-variadic'. */
>>   if (!cfun->stdarg)
>> data->arg.named = 1;  /* No variadic parms.  */
>>   else if (DECL_CHAIN (parm))
>> data->arg.named = 1;  /* Not the last non-variadic parm. */
>>   else if (targetm.calls.strict_argument_naming (all->args_so_far))
>> data->arg.named = 1;  /* Only variadic ones are unnamed.  */
>>   else
>> data->arg.named = 0;  /* Treat as variadic.  */
>> which is later passed to the target hooks to determine if a particular
>> argument is named or not.  Now, cfun->stdarg is determined from the stdarg_p
>> call, which for the new C2X TYPE_NO_NAMED_ARGS_STDARG_P function types
>> (rettype fn (...)) returns true.  Such functions have no named arguments,
>> so data->arg.named will be 0 in function.cc.  But on the caller side,
>> as TYPE_NO_NAMED_ARGS_STDARG_P function types have TYPE_ARG_TYPES NULL,
>> we instead treat those calls as unprototyped even when they are prototyped
>> - /* If we know nothing, treat all args as named.  */ n_named_args = 
>> num_actuals;
>> in 2 spots.  We need to treat the TYPE_NO_NAMED_ARGS_STDARG_P cases as
>> prototyped with no named arguments.
>>
>> Bootstrapped/regtested on x86_64-linux, i686-linux, powerpc64le-linux (where
>> it fixes the above failures), aarch64-linux and s390x-linux, ok for trunk?
>>
>> 2023-01-09  Jakub Jelinek  
>>
>>  PR target/107453
>>  * calls.cc (expand_call): For calls with
>>  TYPE_NO_NAMED_ARGS_STDARG_P (funtype) use zero for n_named_args.
>>  Formatting fix.
> 
> This one has been festering for a while; both Alexandre and Torbjorn have 
> attempted to fix it recently, but I'm not sure either is really right...
> 
> On Arm this is causing all anonymous arguments to be passed on the stack, 
> which is incorrect per the ABI.  On a target that uses 
> 'pretend_outgoing_varargs_named', why is it correct to set n_named_args to
> zero?  Is it enough to guard both the statements you've added with
> !targetm.calls.pretend_outgoing_varargs_named?
> 
> R.
> 
>>
>> --- gcc/calls.cc.jj  2023-01-02 09:32:28.834192105 +0100
>> +++ gcc/calls.cc 2023-01-06 14:52:14.740594896 +0100
>> @@ -2908,8 +2908,8 @@ expand_call (tree exp, rtx target, int i
>>  }
>>  
>>/* Count the arguments and set NUM_ACTUALS.  */
>> -  num_actuals =
>> -call_expr_nargs (exp) + num_complex_actuals + structure_value_addr_parm;
>> +  num_actuals
>> += call_expr_nargs (exp) + num_complex_actuals + 
>> structure_value_addr_parm;
>>  
>>/* Compute number of named args.
>>   First, do a raw count of the args for INIT_CUMULATIVE_ARGS.  */
>> @@ -2919,6 +2919,8 @@ expand_call (tree exp, rtx target, int i
>>= (list_length (type_arg_types)
>>   /* Count the struct value address, if it is passed as a parm.  */
>>   + structure_value_addr_parm);
>> +  else if (TYPE_NO_NAMED_ARGS_STDARG_P (funtype))
>> +n_named_args = 0;
>>else
>>  /* If we know nothing, treat all args as named.  */
>>  n_named_args = num_actuals;
>> @@ -2957,6 +2959,8 @@ expand_call (tree exp, rtx target, int i
>> && ! targetm.calls.pretend_outgoing_varargs_named (args_so_far))
>>  /* Don't include the last named arg.  */
>>  --n_named_args;
>> +  else if (TYPE_NO_NAMED_ARGS_STDARG_P (funtype))
>> +n_named_args = 0;
>>else
>>  /* Treat all args as named.  */
>>  n_named_args = num_actuals;
>>
>>  Jakub
>>


Re: [PATCH v2] c++/modules: Support lambdas attached to more places in modules [PR111710]

2024-02-27 Thread Patrick Palka
On Fri, 16 Feb 2024, Nathaniel Shead wrote:

> On Tue, Feb 13, 2024 at 07:52:01PM -0500, Jason Merrill wrote:
> > On 2/10/24 17:57, Nathaniel Shead wrote:
> > > The fix for PR107398 weakened the restrictions that lambdas must belong
> > > to namespace scope. However this was not sufficient: we also need to
> > > allow lambdas keyed to FIELD_DECLs or PARM_DECLs.
> > 
> > I wonder about keying such lambdas to the class and function, respectively,
> > rather than specifically to the field or parameter, but I suppose it doesn't
> > matter.
> 
> I did some more testing and realised my testcase didn't properly
> exercise whether I'd properly deduplicated or not, and an improved
> testcase proved that actually keying to the field rather than the class
> did cause issues. (Parameter vs. function doesn't seem to have mattered
> however.)
> 
> Here's an updated patch that fixes this, and includes the changes for
> lambdas in base classes that I'd had as a separate patch earlier. I've
> also added some concepts testcases just in case.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?
> 
> -- >8 --
> 
> The fix for PR107398 weakened the restrictions that lambdas must belong
> to namespace scope. However this was not sufficient: we also need to
> allow lambdas attached to FIELD_DECLs, PARM_DECLs, and TYPE_DECLs.
> 
> For field decls we key the lambda to its class rather than the field
> itself. This avoids some errors with deduplicating fields.
> 
> Additionally, by [basic.link] p15.2 a lambda defined anywhere in a
> class-specifier should not be TU-local, which includes base-class
> declarations, so ensure that lambdas declared there are keyed
> appropriately as well.
> 
> Because this now requires 'DECL_MODULE_KEYED_DECLS_P' to be checked on a
> fairly large number of different kinds of DECLs, and that in general
> it's safe to just get 'false' as a result of a check on an unexpected
> DECL type, this patch also removes the tree checking from the accessor.
> 
> Finally, to handle deduplicating templated lambda fields, we need to
> ensure that we can determine that two lambdas from different field decls
> match. The modules code does not attempt to deduplicate expression
> nodes, which causes issues as the LAMBDA_EXPRs are then considered to be
> different. However, rather than checking the LAMBDA_EXPR directly we can
> instead check its type: the generated RECORD_TYPE for a LAMBDA_EXPR must
> also be unique, and /is/ deduplicated on import, so we can just check
> for that instead.

We probably should be deduplicating LAMBDA_EXPR on stream-in, perhaps
something like

diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
index e8eabb1f6f9..1b2ba2e0fa8 100644
--- a/gcc/cp/module.cc
+++ b/gcc/cp/module.cc
@@ -9183,6 +9183,13 @@ trees_in::tree_value ()
   return NULL_TREE;
 }
 
+  if (TREE_CODE (t) == LAMBDA_EXPR
+  && CLASSTYPE_LAMBDA_EXPR (TREE_TYPE (t)))
+{
+  existing = CLASSTYPE_LAMBDA_EXPR (TREE_TYPE (t));
+  back_refs[~tag] = existing;
+}
+
   dump (dumper::TREE) && dump ("Read tree:%d %C:%N", tag, TREE_CODE (t), t);
 
   if (TREE_CODE (existing) == INTEGER_CST && !TREE_OVERFLOW (existing))

would suffice?  If not we probably need to take inspiration from the
TREE_BINFO streaming, and handle LAMBDA_EXPR similarly..

> 
>   PR c++/111710
> 
> gcc/cp/ChangeLog:
> 
>   * cp-tree.h (DECL_MODULE_KEYED_DECLS_P): Remove tree checking.
>   (struct lang_decl_base): Update comments and fix whitespace.
>   * module.cc (trees_out::lang_decl_bools): Always write
>   module_keyed_decls_p flag...
>   (trees_in::lang_decl_bools): ...and always read it.
>   (trees_out::decl_value): Handle all kinds of keyed decls.
>   (trees_in::decl_value): Likewise.
>   (maybe_key_decl): Also support lambdas attached to fields,
>   parameters, and types. Key lambdas attached to fields to their
>   class.
>   (trees_out::get_merge_kind): Likewise.
>   (trees_out::key_mergeable): Likewise.
>   (trees_in::key_mergeable): Support keyed decls in a TYPE_DECL
> container.
>   * parser.cc (cp_parser_class_head): Start a lambda scope when
>   parsing base classes.
>   * tree.cc (cp_tree_equal): Check equality of the types of
>   LAMBDA_EXPRs instead of the exprs themselves.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.dg/modules/lambda-7.h: New test.
>   * g++.dg/modules/lambda-7_a.H: New test.
>   * g++.dg/modules/lambda-7_b.C: New test.
>   * g++.dg/modules/lambda-7_c.C: New test.
> 
> Signed-off-by: Nathaniel Shead 
> ---
>  gcc/cp/cp-tree.h  | 26 +++
>  gcc/cp/module.cc  | 94 +--
>  gcc/cp/parser.cc  | 10 ++-
>  gcc/cp/tree.cc|  4 +-
>  gcc/testsuite/g++.dg/modules/lambda-7.h   | 42 ++
>  gcc/testsuite/g++.dg/modules/lambda-7_a.H |  4 +
>  gcc/testsuite/g++.dg/modules/lambda-7_b.C |  5 

Re: [PATCH] calls: Fix up TYPE_NO_NAMED_ARGS_STDARG_P handling [PR107453]

2024-02-27 Thread Richard Earnshaw



On 09/01/2023 10:32, Jakub Jelinek via Gcc-patches wrote:
> Hi!
> 
> On powerpc64le-linux, the following patch fixes
> -FAIL: gcc.dg/c2x-stdarg-4.c execution test
> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O0  execution test
> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O1  execution test
> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O2  execution test
> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O2 -flto 
> -fno-use-linker-plugin -flto-partition=none  execution test
> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O2 -flto -fuse-linker-plugin 
> -fno-fat-lto-objects  execution test
> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -O3 -g  execution test
> -FAIL: gcc.dg/torture/c2x-stdarg-split-1a.c   -Os  execution test
> The problem is mismatch between the caller and callee side.
> On the callee side, we do:
>   /* NAMED_ARG is a misnomer.  We really mean 'non-variadic'. */
>   if (!cfun->stdarg)
> data->arg.named = 1;  /* No variadic parms.  */
>   else if (DECL_CHAIN (parm))
> data->arg.named = 1;  /* Not the last non-variadic parm. */
>   else if (targetm.calls.strict_argument_naming (all->args_so_far))
> data->arg.named = 1;  /* Only variadic ones are unnamed.  */
>   else
> data->arg.named = 0;  /* Treat as variadic.  */
> which is later passed to the target hooks to determine if a particular
> argument is named or not.  Now, cfun->stdarg is determined from the stdarg_p
> call, which for the new C2X TYPE_NO_NAMED_ARGS_STDARG_P function types
> (rettype fn (...)) returns true.  Such functions have no named arguments,
> so data->arg.named will be 0 in function.cc.  But on the caller side,
> as TYPE_NO_NAMED_ARGS_STDARG_P function types have TYPE_ARG_TYPES NULL,
> we instead treat those calls as unprototyped even when they are prototyped
> - /* If we know nothing, treat all args as named.  */ n_named_args = 
> num_actuals;
> in 2 spots.  We need to treat the TYPE_NO_NAMED_ARGS_STDARG_P cases as
> prototyped with no named arguments.
> 
> Bootstrapped/regtested on x86_64-linux, i686-linux, powerpc64le-linux (where
> it fixes the above failures), aarch64-linux and s390x-linux, ok for trunk?
> 
> 2023-01-09  Jakub Jelinek  
> 
>   PR target/107453
>   * calls.cc (expand_call): For calls with
>   TYPE_NO_NAMED_ARGS_STDARG_P (funtype) use zero for n_named_args.
>   Formatting fix.

This one has been festering for a while; both Alexandre and Torbjorn have 
attempted to fix it recently, but I'm not sure either is really right...

On Arm this is causing all anonymous arguments to be passed on the stack, which 
is incorrect per the ABI.  On a target that uses 
'pretend_outgoing_varargs_named', why is it correct to set n_named_args to zero? 
 Is it enough to guard both the statements you've added with 
!targetm.calls.pretend_outgoing_varargs_named?

R.

> 
> --- gcc/calls.cc.jj   2023-01-02 09:32:28.834192105 +0100
> +++ gcc/calls.cc  2023-01-06 14:52:14.740594896 +0100
> @@ -2908,8 +2908,8 @@ expand_call (tree exp, rtx target, int i
>  }
>  
>/* Count the arguments and set NUM_ACTUALS.  */
> -  num_actuals =
> -call_expr_nargs (exp) + num_complex_actuals + structure_value_addr_parm;
> +  num_actuals
> += call_expr_nargs (exp) + num_complex_actuals + 
> structure_value_addr_parm;
>  
>/* Compute number of named args.
>   First, do a raw count of the args for INIT_CUMULATIVE_ARGS.  */
> @@ -2919,6 +2919,8 @@ expand_call (tree exp, rtx target, int i
>= (list_length (type_arg_types)
>/* Count the struct value address, if it is passed as a parm.  */
>+ structure_value_addr_parm);
> +  else if (TYPE_NO_NAMED_ARGS_STDARG_P (funtype))
> +n_named_args = 0;
>else
>  /* If we know nothing, treat all args as named.  */
>  n_named_args = num_actuals;
> @@ -2957,6 +2959,8 @@ expand_call (tree exp, rtx target, int i
>  && ! targetm.calls.pretend_outgoing_varargs_named (args_so_far))
>  /* Don't include the last named arg.  */
>  --n_named_args;
> +  else if (TYPE_NO_NAMED_ARGS_STDARG_P (funtype))
> +n_named_args = 0;
>else
>  /* Treat all args as named.  */
>  n_named_args = num_actuals;
> 
>   Jakub
> 


[PATCH] Output branches and calls in gcov function summaries

2024-02-27 Thread Jørgen Kvalsvik
The gcov function summaries only output the covered lines, not the
branches and calls. Since the function summaries are opt-in, it
probably makes sense to also include branch coverage. This patch simply adds
the branch and call information to the function-summaries output.

$ gcc --coverage hello.c -o hello
$ ./hello

Before:
$ gcov -f hello
Function 'main'
Lines executed:100.00% of 4

Function 'fn'
Lines executed:100.00% of 7

File 'hello.c'
Lines executed:100.00% of 11
Creating 'hello.c.gcov'

After:
$ gcov -f hello
Function 'main'
Lines executed:100.00% of 3
No branches
Calls executed:100.00% of 1

Function 'fn'
Lines executed:100.00% of 7
Branches executed:100.00% of 4
Taken at least once:50.00% of 4
No calls

File 'hello.c'
Lines executed:100.00% of 10
Creating 'hello.c.gcov'

Lines executed:100.00% of 10

gcc/ChangeLog:

* gcov.cc (generate_results): Count branches.
(function_summary): Output branch count.
---
 gcc/gcov.cc | 32 +---
 1 file changed, 29 insertions(+), 3 deletions(-)

---

I am unsure if it is intentional for the function-summaries to be line
only, but I did find it a bit odd, so I submit this patch for feedback.
If this is ok we could also look into refactoring the summaries
slightly, to bring together the printing function- and file summaries.

---

diff --git a/gcc/gcov.cc b/gcc/gcov.cc
index 2fad6aa7ede..4b9aafa543c 100644
--- a/gcc/gcov.cc
+++ b/gcc/gcov.cc
@@ -1530,11 +1530,18 @@ generate_results (const char *file_name)
   memset (, 0, sizeof (coverage));
   coverage.name = fn->get_name ();
   add_line_counts (flag_function_summary ?  : NULL, fn);
-  if (flag_function_summary)
+
+  if (!flag_function_summary)
+   continue;
+
+  for (const block_info& block : fn->blocks)
{
- function_summary ();
- fnotice (stdout, "\n");
+ for (arc_info *arc = block.succ; arc; arc = arc->succ_next)
+   add_branch_counts (, arc);
}
+
+  function_summary ();
+  fnotice (stdout, "\n");
 }
 
   name_map needle;
@@ -2528,6 +2535,25 @@ function_summary (const coverage_info *coverage)
 {
   fnotice (stdout, "%s '%s'\n", "Function", coverage->name);
   executed_summary (coverage->lines, coverage->lines_executed);
+
+  if (coverage->branches)
+{
+  fnotice (stdout, "Branches executed:%s of %d\n",
+  format_gcov (coverage->branches_executed, coverage->branches, 2),
+  coverage->branches);
+  fnotice (stdout, "Taken at least once:%s of %d\n",
+  format_gcov (coverage->branches_taken, coverage->branches, 2),
+   coverage->branches);
+}
+  else
+fnotice (stdout, "No branches\n");
+
+  if (coverage->calls)
+fnotice (stdout, "Calls executed:%s of %d\n",
+format_gcov (coverage->calls_executed, coverage->calls, 2),
+coverage->calls);
+  else
+fnotice (stdout, "No calls\n");
 }
 
 /* Output summary info for a file.  */
-- 
2.30.2



[PATCH 2/3, RFC] fsra: support ARG_PARTS

2024-02-27 Thread Jiufu Guo
This patch adds IFN_ARG_PARTS, and generates this IFN for parameter accesses
in the fsra pass.  The IFN is expanded according to the incoming registers
of the parameter.  "fsra" is tuned for accesses to parameters.

PR target/108073

gcc/ChangeLog:

* internal-fn.cc (query_position_in_parallel): New function.
(construct_reg_seq): New function.
(get_incoming_element): New function.
(reference_alias_ptr_type): Extern declare.
(expand_ARG_PARTS): New expand function.
* internal-fn.def (ARG_PARTS): New IFN.
* tree-sra.cc (scan_function): Update for fsra.
(analyze_access_subtree): Enable reading ARG analyze for fsra.
(generate_subtree_copies): Update to generate IFN_ARG_PARTS.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/pr102024.C: Update.
* gcc.target/powerpc/pr108073-1.c: New test.
* gcc.target/powerpc/pr108073.c: New test.

---
 gcc/internal-fn.cc| 164 ++
 gcc/internal-fn.def   |   3 +
 gcc/tree-sra.cc   |  43 -
 gcc/testsuite/g++.target/powerpc/pr102024.C   |   2 +-
 gcc/testsuite/gcc.target/powerpc/pr108073-1.c |  76 
 gcc/testsuite/gcc.target/powerpc/pr108073.c   |  74 
 6 files changed, 354 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index a07f25f3aee..ee19e155628 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3393,6 +3393,170 @@ expand_DEFERRED_INIT (internal_fn, gcall *stmt)
 }
 }
 
+/* In the parallel rtx register series REGS, compute the register position for
+   given {BITPOS, BITSIZE}.  The results are stored into START_INDEX, 
END_INDEX,
+   LEFT_BITS and RIGHT_BITS.  */
+
+void
+query_position_in_parallel (HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+   rtx regs, int _index, int _index,
+   HOST_WIDE_INT _bits, HOST_WIDE_INT _bits)
+{
+  int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1;
+  for (; cur_index < XVECLEN (regs, 0); cur_index++)
+{
+  rtx slot = XVECEXP (regs, 0, cur_index);
+  HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT;
+  machine_mode mode = GET_MODE (XEXP (slot, 0));
+  HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant ();
+  if (off <= bitpos && off + size > bitpos)
+   {
+ start_index = cur_index;
+ left_bits = bitpos - off;
+   }
+  if (off + size >= bitpos + bitsize)
+   {
+ end_index = cur_index;
+ right_bits = off + size - (bitpos + bitsize);
+ break;
+   }
+}
+}
+
+/* Create a serial registers which start at FIRST_REG,
+   and SIZE is the total size of those registers.  */
+static rtx
+construct_reg_seq (HOST_WIDE_INT size, rtx first_reg)
+{
+  int nregs = size / UNITS_PER_WORD + (((size % UNITS_PER_WORD) != 0) ? 1 : 0);
+  rtx *tmps = XALLOCAVEC (rtx, nregs);
+  int regno = REGNO (first_reg);
+  machine_mode mode = word_mode;
+  HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant ();
+  for (int i = 0; i < nregs; i++)
+{
+  rtx reg = gen_rtx_REG (mode, regno + i);
+  rtx off = GEN_INT (word_size * i);
+  tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off);
+}
+  return gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps));
+}
+
+static rtx
+get_incoming_element (tree arg, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+ bool reversep, tree expr)
+{
+  rtx regs = DECL_INCOMING_RTL (arg);
+  bool has_padding = false;
+  if (REG_P (regs) && GET_MODE (regs) == BLKmode)
+{
+  HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (arg));
+  has_padding = (size % UNITS_PER_WORD) != 0;
+  regs = construct_reg_seq (size, regs);
+}
+
+  if (GET_CODE (regs) != PARALLEL)
+return NULL_RTX;
+
+  int start_index = -1;
+  int end_index = -1;
+  HOST_WIDE_INT left_bits = 0;
+  HOST_WIDE_INT right_bits = 0;
+  query_position_in_parallel (bitpos, bitsize, regs, start_index, end_index,
+ left_bits, right_bits);
+
+  if (start_index < 0 || end_index < 0)
+return NULL_RTX;
+
+  machine_mode expr_mode = TYPE_MODE (TREE_TYPE (expr));
+  /* Just need one reg for the access.  */
+  if (end_index != start_index)
+return NULL_RTX;
+
+  rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0);
+  /* Just need one reg for the access.  */
+  if (left_bits == 0 && right_bits == 0)
+{
+  if (GET_MODE (reg) != expr_mode)
+   reg = gen_lowpart (expr_mode, reg);
+  return reg;
+}
+
+  /* Need to extract bitfield part reg for the access.
+ left_bits != 0 or right_bits != 0 */
+  if (has_padding && end_index == XVECLEN (regs, 0) - 1)
+return NULL_RTX;
+  scalar_int_mode imode;
+  if (!int_mode_for_mode (expr_mode).exists 

Re: [PATCH] combine: Don't simplify paradoxical SUBREG on WORD_REGISTER_OPERATIONS [PR113010]

2024-02-27 Thread Greg McGary



On 2/26/24 5:17 PM, Greg McGary wrote:

diff --git a/gcc/testsuite/gcc.c-torture/execute/pr113010.c 
b/gcc/testsuite/gcc.c-torture/execute/pr113010.c
new file mode 100644
index 000..a95c613c1df
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/execute/pr113010.c
@@ -0,0 +1,9 @@
+int minus_1 = -1;
+
+int
+main ()
+{
+  if ((0, 0xul) >= minus_1)
+__builtin_abort ();
+  return 0;
+}



Note that this is a stale version of the testcase. The constant needs to be
long long 0xull for the sake of 32-bit machines, such as ARM.

G



Re: [PATCH] RISC-V: add option -m(no-)autovec-segment

2024-02-27 Thread Jeff Law




On 2/25/24 21:53, Greg McGary wrote:

Add option -m(no-)autovec-segment to control whether the autovectorizer
emits vector segment load/store instructions. This is useful for
performance experiments.

gcc/ChangeLog:
* config/riscv/autovec.md (vec_mask_len_load_lanes, 
vec_mask_len_store_lanes):
  Predicate with TARGET_VECTOR_AUTOVEC_SEGMENT
* gcc/config/riscv/riscv-opts.h (TARGET_VECTOR_AUTOVEC_SEGMENT): New 
macro.
* gcc/config/riscv/riscv.opt (-m(no-)autovec-segment): New option.
* gcc/tree-vect-stmts.cc (gcc/tree-vect-stmts.cc): Prevent 
divide-by-zero.
* testsuite/gcc.target/riscv/rvv/autovec/struct/*_noseg*.c,
testsuite/gcc.target/riscv/rvv/autovec/no-segment.c: New tests.
I don't mind having options to do this kind of selection (we've done 
similar things internally for other RVV features).  But I don't think 
now is the time to be introducing this stuff.  We're in stage4 of the 
development cycle after all.


jeff



Re: [PATCH] RISC-V: Update test expectancies with recent scheduler change

2024-02-27 Thread Jeff Law




On 2/26/24 18:21, juzhe.zh...@rivai.ai wrote:
If the scheduling model increases the vsetvls, we shouldn't set it as 
default scheduling model
I don't think it's that simple.  On some uarchs vsetvls are nearly free 
while on others they can be fairly expensive.  It's not clear (to me) 
yet if one approach or the other is going to be the more common.


jeff



Re: [PATCH 4/8] libstdc++: Fix error handling in std::print

2024-02-27 Thread Tim Song
[print.fun] requires a system_error, but I don't think
[ostream.formatted.print] does?

On Tue, Feb 27, 2024 at 5:47 AM Jonathan Wakely  wrote:

> Tested x86_64-linux. Reviews invited.
>
> -- >8 --
>
> The standard requires an exception if std::print fails to write to a
> std::ostream.
>
> libstdc++-v3/ChangeLog:
>
> * include/std/ostream (vprint_nonunicode): Throw if stream state
> indicates writing failed.
> * testsuite/27_io/basic_ostream/print/1.cc: Check for exception.
> * testsuite/27_io/print/1.cc: Likewise.
> ---
>  libstdc++-v3/include/std/ostream|  5 +
>  .../testsuite/27_io/basic_ostream/print/1.cc| 17 +
>  libstdc++-v3/testsuite/27_io/print/1.cc | 16 
>  3 files changed, 38 insertions(+)
>
> diff --git a/libstdc++-v3/include/std/ostream
> b/libstdc++-v3/include/std/ostream
> index a136399ad0b..3740ad6edfa 100644
> --- a/libstdc++-v3/include/std/ostream
> +++ b/libstdc++-v3/include/std/ostream
> @@ -901,6 +901,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> __catch(...)
>   { __os._M_setstate(ios_base::badbit); }
>}
> +
> +if (!__os)
> +  __throw_system_error(EIO);
>}
>
>inline void
> @@ -974,6 +977,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> __catch(...)
>   { __os._M_setstate(ios_base::badbit); }
>}
> +if (!__os)
> +  __throw_system_error(EIO);
>  #endif // _WIN32
>}
>
> diff --git a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
> b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
> index 71a4daa04c9..14bfb14d556 100644
> --- a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
> +++ b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
> @@ -103,6 +103,22 @@ test_locale()
>}
>  }
>
> +void
> +test_errors()
> +{
> +#ifdef __cpp_exceptions
> +  std::stringstream in(std::ios::in);
> +  try
> +  {
> +std::print(in, "{}", "nope");
> +VERIFY(false);
> +  }
> +  catch (const std::system_error&)
> +  {
> +  }
> +#endif
> +}
> +
>  int main()
>  {
>test_print_ostream();
> @@ -111,4 +127,5 @@ int main()
>test_print_no_padding();
>test_vprint_nonunicode();
>test_locale();
> +  test_errors();
>  }
> diff --git a/libstdc++-v3/testsuite/27_io/print/1.cc
> b/libstdc++-v3/testsuite/27_io/print/1.cc
> index 6a294e0454b..d570f7938be 100644
> --- a/libstdc++-v3/testsuite/27_io/print/1.cc
> +++ b/libstdc++-v3/testsuite/27_io/print/1.cc
> @@ -74,6 +74,21 @@ test_vprint_nonunicode()
>// { dg-output "garbage in . garbage out" }
>  }
>
> +void
> +test_errors()
> +{
> +#ifdef __cpp_exceptions
> +  try
> +  {
> +std::print(stdin, "{}", "nope");
> +VERIFY(false);
> +  }
> +  catch (const std::system_error&)
> +  {
> +  }
> +#endif
> +}
> +
>  int main()
>  {
>test_print_default();
> @@ -82,4 +97,5 @@ int main()
>test_println_file();
>test_print_raw();
>test_vprint_nonunicode();
> +  test_errors();
>  }
> --
> 2.43.0
>
>


Re: [PATCH 1/3] Change 'v1' float and int code to fall back to v0

2024-02-27 Thread Jeff Law




On 2/26/24 20:12, Tom Tromey wrote:

While working on another patch, I discovered that the libcc1 plugin
code never did version negotiation correctly.  So, the patches to
introduce v1 never did anything -- the new code, as far as I know, has
never been run.

Making version negotiation work shows that the existing code causes
crashes.  For example, safe_lookup_builtin_type might return
error_mark_node in some cases, which the callers aren't prepared to
accept.

Looking into it some more, I couldn't find any justification for this
v1 code for the C compiler plugin.  Since it's not run at all, it's
also clear that removing it doesn't cause any regressions in gdb.

However, rather than remove it, this patch changes it to handle
ERROR_MARK better, and then to fall back to the v0 code if the new
code fails to find the type it's looking for.

2024-02-26  Tom Tromey  

* libcc1plugin.cc (safe_lookup_builtin_type): Handle ERROR_MARK.
(plugin_int_type): Fall back to plugin_int_type_v0.
(plugin_float_type): Fall back to plugin_float_type_v0.
Given this is all libcc1 related and thus primarily of interest to gdb, 
if you're happy with it, then it's OK for the trunk.


jeff



Re: [PATCH v2] DSE: Bugfix ICE after allow vector type in get_stored_val

2024-02-27 Thread Jeff Law




On 2/26/24 07:22, pan2...@intel.com wrote:

From: Pan Li 

We previously allowed vector types in get_stored_val when the read is
less than or equal to the store.  Unfortunately, we failed to adjust the
validate_subreg part accordingly.  When the vector type's size is
less than a vector register, it will be considered invalid by
validate_subreg.

Considering that validate_subreg is something of a can of worms and we are
in stage 4, we will fix the issue from the DSE side, and make sure
the subreg is valid for both the read_mode and store_mode before
performing the real gen_lowpart.

The below test are passed for this patch:

* The x86 bootstrap test.
* The x86 regression test.
* The riscv regression test.
* The aarch64 regression test.

gcc/ChangeLog:

* dse.cc (get_stored_val): Add validate_subreg check before
perform the gen_lowpart for rtl.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/ssa-fre-44.c: Add compile option to trigger
the ICE.
* gcc.target/riscv/rvv/base/bug-6.c: New test.

Signed-off-by: Pan Li 
---
  gcc/dse.cc|  4 +++-
  gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c|  2 +-
  .../gcc.target/riscv/rvv/base/bug-6.c | 22 +++
  3 files changed, 26 insertions(+), 2 deletions(-)
  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/bug-6.c

diff --git a/gcc/dse.cc b/gcc/dse.cc
index edc7a1dfecf..1596da91da0 100644
--- a/gcc/dse.cc
+++ b/gcc/dse.cc
@@ -1946,7 +1946,9 @@ get_stored_val (store_info *store_info, machine_mode 
read_mode,
 copy_rtx (store_info->const_rhs));
else if (VECTOR_MODE_P (read_mode) && VECTOR_MODE_P (store_mode)
  && known_le (GET_MODE_BITSIZE (read_mode), GET_MODE_BITSIZE (store_mode))
-&& targetm.modes_tieable_p (read_mode, store_mode))
+&& targetm.modes_tieable_p (read_mode, store_mode)
+&& validate_subreg (read_mode, store_mode, copy_rtx (store_info->rhs),
+   subreg_lowpart_offset (read_mode, store_mode)))
  read_reg = gen_lowpart (read_mode, copy_rtx (store_info->rhs));
else
  read_reg = extract_low_bits (read_mode, store_mode,


So we're just changing whether or not we call gen_lowpart directly or go 
through extract_low_bits, which may in turn generate subreg, call 
gen_lowpart itself and a few other things.


I'm guessing that extract_low_bits is going to return NULL in this case 
via this code (specifically the second test).



  if (!targetm.modes_tieable_p (src_int_mode, src_mode))
return NULL_RTX;
  if (!targetm.modes_tieable_p (int_mode, mode))
return NULL_RTX;



Pan, can you confirm what path we take through extract_low_bits?

One might argue that we should just call into extract_low_bits 
unconditionally since it'll ultimately call gen_lowpart when it safely 
can.  The downside is that's a bigger change than I'd like at this stage 
in our development cycle.


I wouldn't be surprised if other direct uses of gen_lowpart have similar 
problems.







diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
index f79b4c142ae..624a00a4f32 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-fre-44.c
@@ -1,5 +1,5 @@
  /* { dg-do compile } */
-/* { dg-options "-O -fdump-tree-fre1" } */
+/* { dg-options "-O -fdump-tree-fre1 -O3 -ftree-vectorize" } */
  
  struct A { float x, y; };

  struct B { struct A u; };
So this may compromise the original intent of this test.  What I would 
suggest instead is to create a new test with the dg-do & dg-options you 
want with a #include "ssa-fre-44.c".


So to move forward.  Let's confirm the path we take through 
extract_low_bits matches expectations and fixup the testsuite change.


Jeff


[PATCH] i386: Guard noreturn no-callee-saved-registers optimization with -mnoreturn-no-callee-saved-registers [PR38534]

2024-02-27 Thread Jakub Jelinek
On Tue, Feb 27, 2024 at 01:09:09PM +0100, Jakub Jelinek wrote:
> So, IMHO either revert the changes altogether, or guard on -mcmodel=kernel
> (but talk to kernel people on linux-toolchains if that is what they actually
> want).

Here is a patch which guards this by non-default option, so kernel and other
users can choose if they want this or not.  On top of the PR114116 patch.

Only lightly tested so far.

2024-02-27  Jakub Jelinek  

PR target/38534
* config/i386/i386.opt (mnoreturn-no-callee-saved-registers): New
option.
* config/i386/i386-options.cc (ix86_set_func_type): Don't use
TYPE_NO_CALLEE_SAVED_REGISTERS_EXCEPT_BP unless
ix86_noreturn_no_callee_saved_registers is enabled.
* doc/invoke.texi (-mnoreturn-no-callee-saved-registers): Document.

* gcc.target/i386/pr38534-1.c: Add -mnoreturn-no-callee-saved-registers
to dg-options.
* gcc.target/i386/pr38534-2.c: Likewise.
* gcc.target/i386/pr38534-3.c: Likewise.
* gcc.target/i386/pr38534-4.c: Likewise.
* gcc.target/i386/pr38534-5.c: Likewise.
* gcc.target/i386/pr38534-6.c: Likewise.
* gcc.target/i386/pr114097-1.c: Likewise.
* gcc.target/i386/stack-check-17.c: Likewise.

--- gcc/config/i386/i386.opt.jj 2024-01-10 12:19:07.694681189 +0100
+++ gcc/config/i386/i386.opt2024-02-27 14:18:34.439240869 +0100
@@ -659,6 +659,10 @@ mstore-max=
 Target RejectNegative Joined Var(ix86_store_max) Enum(prefer_vector_width) 
Init(PVW_NONE) Save
 Maximum number of bits that can be stored to memory efficiently.
 
+mnoreturn-no-callee-saved-registers
+Target Var(ix86_noreturn_no_callee_saved_registers)
+Optimize noreturn functions by not saving callee-saved registers used in the 
function.
+
 ;; ISA support
 
 m32
--- gcc/config/i386/i386-options.cc.jj  2024-02-27 14:20:59.972228314 +0100
+++ gcc/config/i386/i386-options.cc 2024-02-27 14:23:26.042208182 +0100
@@ -3384,7 +3384,8 @@ ix86_set_func_type (tree fndecl)
 {
   /* No need to save and restore callee-saved registers for a noreturn
  function with nothrow or compiled with -fno-exceptions unless when
- compiling with -O0 or -Og.  So that backtrace works for those at least
+ compiling with -O0 or -Og, except that it interferes with debugging
+ of callers.  So that backtrace works for those at least
  in most cases, save the bp register if it is used, because it often
  is used in callers to compute CFA.
 
@@ -3401,7 +3402,8 @@ ix86_set_func_type (tree fndecl)
   if (lookup_attribute ("no_callee_saved_registers",
TYPE_ATTRIBUTES (TREE_TYPE (fndecl
 no_callee_saved_registers = TYPE_NO_CALLEE_SAVED_REGISTERS;
-  else if (TREE_THIS_VOLATILE (fndecl)
+  else if (ix86_noreturn_no_callee_saved_registers
+  && TREE_THIS_VOLATILE (fndecl)
   && optimize
   && !optimize_debug
   && (TREE_NOTHROW (fndecl) || !flag_exceptions)
--- gcc/doc/invoke.texi.jj  2024-02-23 11:34:34.278287553 +0100
+++ gcc/doc/invoke.texi 2024-02-27 14:29:18.071339182 +0100
@@ -1450,6 +1450,7 @@ See RS/6000 and PowerPC Options.
 -mvzeroupper  -mprefer-avx128  -mprefer-vector-width=@var{opt}
 -mpartial-vector-fp-math
 -mmove-max=@var{bits} -mstore-max=@var{bits}
+-mnoreturn-no-callee-saved-registers
 -mmmx  -msse  -msse2  -msse3  -mssse3  -msse4.1  -msse4.2  -msse4  -mavx
 -mavx2  -mavx512f  -mavx512pf  -mavx512er  -mavx512cd  -mavx512vl
 -mavx512bw  -mavx512dq  -mavx512ifma  -mavx512vbmi  -msha  -maes
@@ -35376,6 +35377,15 @@ Prefer 256-bit vector width for instruct
 Prefer 512-bit vector width for instructions.
 @end table
 
+@opindex mnoreturn-no-callee-saved-registers
+@item -mnoreturn-no-callee-saved-registers
+This option optimizes functions with @code{noreturn} attribute or
+@code{_Noreturn} specifier by not saving in the function prologue callee-saved
+registers which are used in the function (except for the @code{BP}
+register).  This option can interfere with debugging of the caller of the
+@code{noreturn} function or any function further up in the call stack, so it
+is not enabled by default.
+
 @opindex mcx16
 @item -mcx16
 This option enables GCC to generate @code{CMPXCHG16B} instructions in 64-bit
--- gcc/testsuite/gcc.target/i386/pr38534-1.c.jj2024-02-27 
14:21:00.385222600 +0100
+++ gcc/testsuite/gcc.target/i386/pr38534-1.c   2024-02-27 15:39:44.687716915 
+0100
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mtune-ctrl=^prologue_using_move,^epilogue_using_move 
-fomit-frame-pointer" } */
+/* { dg-options "-O2 -mtune-ctrl=^prologue_using_move,^epilogue_using_move 
-fomit-frame-pointer -mnoreturn-no-callee-saved-registers" } */
 
 #define ARRAY_SIZE 256
 
--- gcc/testsuite/gcc.target/i386/pr38534-2.c.jj2024-02-27 
14:21:00.385222600 +0100
+++ gcc/testsuite/gcc.target/i386/pr38534-2.c   2024-02-27 15:39:51.569621585 
+0100
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 

Re: [PATCH 8/8] libstdc++: Do not define lock-free atomic aliases if not fully lock-free [PR114103]

2024-02-27 Thread Jonathan Wakely
Ooops, I forgot to add --no-numbered so these were eight unrelated
patches, not PATCH 1/8 .. PATCH 8/8. Sorry for any confusion.

On Tue, 27 Feb 2024 at 14:33, Jonathan Wakely  wrote:
>
> On Tue, 27 Feb 2024 at 11:49, Jonathan Wakely  wrote:
> >
> > Tested x86_64-linux. I think we should make this change, because
> > otherwise we define the typedefs for platforms with no lock-free
> > atomics, like hppa-hpux. Instead of lying, those typedefs should be
> > absent on that target.
> >
> > -- >8 --
> >
> > libstdc++-v3/ChangeLog:
> >
> > PR libstdc++/114103
> > * include/bits/version.def (atomic_lock_free_type_aliases): Add
> > extra_cond to check for at least one always-lock-free type.
> > * include/bits/version.h: Regenerate.
> > * include/std/atomic (atomic_signed_lock_free)
> > (atomic_unsigned_lock_free): Only use always-lock-free types.
> > ---
> >  libstdc++-v3/include/bits/version.def | 1 +
> >  libstdc++-v3/include/bits/version.h   | 2 +-
> >  libstdc++-v3/include/std/atomic   | 6 +++---
> >  3 files changed, 5 insertions(+), 4 deletions(-)
> >
> > diff --git a/libstdc++-v3/include/bits/version.def 
> > b/libstdc++-v3/include/bits/version.def
> > index 502961eb269..d298420121b 100644
> > --- a/libstdc++-v3/include/bits/version.def
> > +++ b/libstdc++-v3/include/bits/version.def
> > @@ -739,6 +739,7 @@ ftms = {
> >values = {
> >  v = 201907;
> >  cxxmin = 20;
> > +extra_cond = "(__GCC_ATOMIC_INT_LOCK_FREE | 
> > __GCC_ATOMIC_LONG_LOCK_FREE | __GCC_ATOMIC_CHAR_LOCK_FREE) & 2";
>
> Maybe this should be > 1 instead of & 2 in case there are targets that
> define it to 4 or something. I think those are only supposed to be
> defined to 0, 1, or 2 though.
>
>
> >};
> >  };
> >
> > diff --git a/libstdc++-v3/include/bits/version.h 
> > b/libstdc++-v3/include/bits/version.h
> > index 7a6fbd35e2e..9107b45a484 100644
> > --- a/libstdc++-v3/include/bits/version.h
> > +++ b/libstdc++-v3/include/bits/version.h
> > @@ -819,7 +819,7 @@
> >  #undef __glibcxx_want_atomic_float
> >
> >  #if !defined(__cpp_lib_atomic_lock_free_type_aliases)
> > -# if (__cplusplus >= 202002L)
> > +# if (__cplusplus >= 202002L) && ((__GCC_ATOMIC_INT_LOCK_FREE | 
> > __GCC_ATOMIC_LONG_LOCK_FREE | __GCC_ATOMIC_CHAR_LOCK_FREE) & 2)
> >  #  define __glibcxx_atomic_lock_free_type_aliases 201907L
> >  #  if defined(__glibcxx_want_all) || 
> > defined(__glibcxx_want_atomic_lock_free_type_aliases)
> >  #   define __cpp_lib_atomic_lock_free_type_aliases 201907L
> > diff --git a/libstdc++-v3/include/std/atomic 
> > b/libstdc++-v3/include/std/atomic
> > index 559f8370459..1462cf5ec23 100644
> > --- a/libstdc++-v3/include/std/atomic
> > +++ b/libstdc++-v3/include/std/atomic
> > @@ -1774,13 +1774,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> >  = atomic>;
> >using atomic_unsigned_lock_free
> >  = atomic>;
> > -# elif ATOMIC_INT_LOCK_FREE || !(ATOMIC_LONG_LOCK_FREE || 
> > ATOMIC_CHAR_LOCK_FREE)
> > +# elif ATOMIC_INT_LOCK_FREE == 2
>
> Similarly, this could be > 1 but again, I think == 2 is OK.
>
> >using atomic_signed_lock_free = atomic;
> >using atomic_unsigned_lock_free = atomic;
> > -# elif ATOMIC_LONG_LOCK_FREE
> > +# elif ATOMIC_LONG_LOCK_FREE == 2
> >using atomic_signed_lock_free = atomic;
> >using atomic_unsigned_lock_free = atomic;
> > -# elif ATOMIC_CHAR_LOCK_FREE
> > +# elif ATOMIC_CHAR_LOCK_FREE == 2
> >using atomic_signed_lock_free = atomic;
> >using atomic_unsigned_lock_free = atomic;
> >  # else
> > --
> > 2.43.0
> >



RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

2024-02-27 Thread Li, Pan2
Thanks Richard and Tamar for moving this forward.

> That said, I would like to see the bigger picture to be kept in mind
> before altering the GIMPLE IL.

> Adding an internal function for an already present optab is a
> no-brainer.  Adding a vectorizer
> and/or if-conversion pattern to make use of this during vectorization
> is existing practice.
> Adding pattern recognition to ISEL or widening-mul passes for
> instructions the CPU can do
> is existing practice and OK.

Thanks for explaining, got the point here.

> So I'd suggest writing some example of both signed and unsigned saturating 
> add and multiply

> Because signed addition, will likely require a branch and signed 
> multiplication would require a
> larger type.

Ack, will prepare one prototype validation patch for add, sub and mul (both 
unsigned and signed) soon.

Pan

-Original Message-
From: Richard Biener  
Sent: Tuesday, February 27, 2024 9:42 PM
To: Tamar Christina 
Cc: Li, Pan2 ; gcc-patches@gcc.gnu.org; 
juzhe.zh...@rivai.ai; Wang, Yanzhang ; 
kito.ch...@gmail.com; richard.sandiford@arm.com; jeffreya...@gmail.com
Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation 
US_PLUS

On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina  wrote:
>
> > Thanks Tamar.
> >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> >
> > Yes, we eventually would like to vectorize the SAT ALU but we start with 
> > scalar part
> > first.
> > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > well with some additions as below.
> > Feel free to correct me if any misunderstandings.
> >
> > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > usadd$a3(as well as gen_int_libfunc) for int.
> > 2. We need to implement a default implementation of SAT_ADD if
> > direct_binary_optab_supported_p is false.
> > It looks like the default implementation is difficult to make every 
> > backend happy.
> > That is why you suggest just normal
> > DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> >
> > Thanks Richard.
> >
> > > But what I'd like to see is that we do more instruction selection on 
> > > GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not 
> > > adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> >
> > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > to know isel and I am happy to
> > try that once we have conclusion.
> >
>
> So after a lively discussion on IRC, the conclusion is that before we proceed 
> Richi would
> like to see some examples of various operations.  The problem is that 
> unsigned saturating
> addition is the simplest example and it may lead to an implementation 
> strategy that doesn't
> scale.
>
> So I'd suggest writing some example of both signed and unsigned saturating 
> add and multiply
>
> Because signed addition, will likely require a branch and signed 
> multiplication would require a
> larger type.
>
> This would allow us to better understand what kind of gimple we would have to 
> deal with in
> ISEL and VECT if we decide not to lower early.

More specifically before making something like .SAT_ADD a core part of
GIMPLE I'd like
to point out that we have saturating PLUS_EXPR but just for
fixed-point types.  I realize
Joseph thinks that keying this on the type was wrong and it should
have used integer
types and special saturating operations.  Still having both,
type-keyed saturating PLUS_EXPR
and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.

It might be that the way to go is to turn all existing saturating type
*_EXPR into
.SAT_* internal function calls, in the end mapping to the optabs and
eventual RTX codes.

That could work for both integer types and fixed-point types.

I'll also note that "saturating" is just another variant of overflow
behavior of which we have
trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
also (kind-of) sanitized.
We do lack direct IL representation of -ftrapv and -fwrapv, the
semantics on a PLUS_EXPR
depend on per-function flags.  Eventually a common representation
could be found here.
For saturating I was thinking of .ADD_OVERFLOW (a, b,
saturation-value), a "trap" could
be a "trapping" saturation value, "undefined" could be a
"not-a-thing".  But I didn't think much
about this.

That said, I would like to see the bigger picture to be kept in mind
before altering the GIMPLE IL.

Adding an internal function for an already present optab is a
no-brainer.  Adding a vectorizer
and/or if-conversion pattern to make use of this during vectorization
is existing practice.
Adding pattern recognition 

Re: [PATCH 8/8] libstdc++: Do not define lock-free atomic aliases if not fully lock-free [PR114103]

2024-02-27 Thread Jonathan Wakely
On Tue, 27 Feb 2024 at 11:49, Jonathan Wakely  wrote:
>
> Tested x86_64-linux. I think we should make this change, because
> otherwise we define the typedefs for platforms with no lock-free
> atomics, like hppa-hpux. Instead of lying, those typedefs should be
> absent on that target.
>
> -- >8 --
>
> libstdc++-v3/ChangeLog:
>
> PR libstdc++/114103
> * include/bits/version.def (atomic_lock_free_type_aliases): Add
> extra_cond to check for at least one always-lock-free type.
> * include/bits/version.h: Regenerate.
> * include/std/atomic (atomic_signed_lock_free)
> (atomic_unsigned_lock_free): Only use always-lock-free types.
> ---
>  libstdc++-v3/include/bits/version.def | 1 +
>  libstdc++-v3/include/bits/version.h   | 2 +-
>  libstdc++-v3/include/std/atomic   | 6 +++---
>  3 files changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/version.def 
> b/libstdc++-v3/include/bits/version.def
> index 502961eb269..d298420121b 100644
> --- a/libstdc++-v3/include/bits/version.def
> +++ b/libstdc++-v3/include/bits/version.def
> @@ -739,6 +739,7 @@ ftms = {
>values = {
>  v = 201907;
>  cxxmin = 20;
> +extra_cond = "(__GCC_ATOMIC_INT_LOCK_FREE | __GCC_ATOMIC_LONG_LOCK_FREE 
> | __GCC_ATOMIC_CHAR_LOCK_FREE) & 2";

Maybe this should be > 1 instead of & 2 in case there are targets that
define it to 4 or something. I think those are only supposed to be
defined to 0, 1, or 2 though.


>};
>  };
>
> diff --git a/libstdc++-v3/include/bits/version.h 
> b/libstdc++-v3/include/bits/version.h
> index 7a6fbd35e2e..9107b45a484 100644
> --- a/libstdc++-v3/include/bits/version.h
> +++ b/libstdc++-v3/include/bits/version.h
> @@ -819,7 +819,7 @@
>  #undef __glibcxx_want_atomic_float
>
>  #if !defined(__cpp_lib_atomic_lock_free_type_aliases)
> -# if (__cplusplus >= 202002L)
> +# if (__cplusplus >= 202002L) && ((__GCC_ATOMIC_INT_LOCK_FREE | 
> __GCC_ATOMIC_LONG_LOCK_FREE | __GCC_ATOMIC_CHAR_LOCK_FREE) & 2)
>  #  define __glibcxx_atomic_lock_free_type_aliases 201907L
>  #  if defined(__glibcxx_want_all) || 
> defined(__glibcxx_want_atomic_lock_free_type_aliases)
>  #   define __cpp_lib_atomic_lock_free_type_aliases 201907L
> diff --git a/libstdc++-v3/include/std/atomic b/libstdc++-v3/include/std/atomic
> index 559f8370459..1462cf5ec23 100644
> --- a/libstdc++-v3/include/std/atomic
> +++ b/libstdc++-v3/include/std/atomic
> @@ -1774,13 +1774,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>  = atomic>;
>using atomic_unsigned_lock_free
>  = atomic>;
> -# elif ATOMIC_INT_LOCK_FREE || !(ATOMIC_LONG_LOCK_FREE || 
> ATOMIC_CHAR_LOCK_FREE)
> +# elif ATOMIC_INT_LOCK_FREE == 2

Similarly, this could be > 1 but again, I think == 2 is OK.

>using atomic_signed_lock_free = atomic;
>using atomic_unsigned_lock_free = atomic;
> -# elif ATOMIC_LONG_LOCK_FREE
> +# elif ATOMIC_LONG_LOCK_FREE == 2
>using atomic_signed_lock_free = atomic;
>using atomic_unsigned_lock_free = atomic;
> -# elif ATOMIC_CHAR_LOCK_FREE
> +# elif ATOMIC_CHAR_LOCK_FREE == 2
>using atomic_signed_lock_free = atomic;
>using atomic_unsigned_lock_free = atomic;
>  # else
> --
> 2.43.0
>



Re: [r14-9173 Regression] FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr" on Linux/x86_64

2024-02-27 Thread Richard Biener
On Tue, 27 Feb 2024, Jeff Law wrote:

> 
> 
> On 2/27/24 06:53, Richard Biener wrote:
> > On Tue, 27 Feb 2024, Jeff Law wrote:
> > 
> >>
> >>
> >> On 2/27/24 00:43, Richard Biener wrote:
> >>> On Tue, 27 Feb 2024, haochen.jiang wrote:
> >>>
>  On Linux/x86_64,
> 
>  af66ad89e8169f44db723813662917cf4cbb78fc is the first bad commit
>  commit af66ad89e8169f44db723813662917cf4cbb78fc
>  Author: Richard Biener 
>  Date:   Fri Feb 23 16:06:05 2024 +0100
> 
>    middle-end/114070 - folding breaking VEC_COND expansion
> 
>  caused
> 
>  FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"
> >>>
> >>> This shows that the x86 backend is missing vcond_mask_qiqi and friends
> >>> (for AVX512 mask modes).  Either that or both expand_vec_cond_expr_p
> >>> and all the machinery behind it (ISEL pass, lowering) should handle
> >>> pure integer mode VEC_COND_EXPR via bit operations.  I think quite some
> >>> targets now implement patterns for these variants, whatever their
> >>> boolean vector modes are.
> >> There may be more going on than just that.  The andnot-2 test started
> >> regressing on most targets overnight, including on targets without vector
> >> capabilities.
> > 
> > Yes, we fail this generic vector simplification test now (not sure
> > why the test didn't test forwprop1).  As said the problem is that
> > we can't test whether the existing VEC_COND_EXPR is handled
> > (we just can test if it's handled by vcond_mask).
> ACK.  I'll force those targets to regenerate their baselines.
> 
> I hadn't read things too closely and just wanted to raise the issue that this
> impacts additional targets w/o vector support.  It sounds like it's largely
> expected fallout.

No, I didn't expect it (well, kind-of but my own testing came up 
clean...).  I'm going to try the extra patterns.

Richard.


Re: [r14-9173 Regression] FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr" on Linux/x86_64

2024-02-27 Thread Jeff Law




On 2/27/24 06:53, Richard Biener wrote:

On Tue, 27 Feb 2024, Jeff Law wrote:




On 2/27/24 00:43, Richard Biener wrote:

On Tue, 27 Feb 2024, haochen.jiang wrote:


On Linux/x86_64,

af66ad89e8169f44db723813662917cf4cbb78fc is the first bad commit
commit af66ad89e8169f44db723813662917cf4cbb78fc
Author: Richard Biener 
Date:   Fri Feb 23 16:06:05 2024 +0100

  middle-end/114070 - folding breaking VEC_COND expansion

caused

FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"


This shows that the x86 backend is missing vcond_mask_qiqi and friends
(for AVX512 mask modes).  Either that or both expand_vec_cond_expr_p
and all the machinery behind it (ISEL pass, lowering) should handle
pure integer mode VEC_COND_EXPR via bit operations.  I think quite some
targets now implement patterns for these variants, whatever their
boolean vector modes are.

There may be more going on than just that.  The andnot-2 test started
regressing on most targets overnight, including on targets without vector
capabilities.


Yes, we fail this generic vector simplification test now (not sure
why the test didn't test forwprop1).  As said the problem is that
we can't test whether the existing VEC_COND_EXPR is handled
(we just can test if it's handled by vcond_mask).

ACK.  I'll force those targets to regenerate their baselines.

I hadn't read things too closely and just wanted to raise the issue that 
this impacts additional targets w/o vector support.  It sounds like it's 
largely expected fallout.


Jeff




[PATCH v6 5/5] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2024-02-27 Thread Andre Vieira

This patch adds support for MVE Tail-Predicated Low Overhead Loops by using the
doloop functionality added to support predicated vectorized hardware loops.

gcc/ChangeLog:

* config/arm/arm-protos.h (arm_target_bb_ok_for_lob): Change
declaration to pass basic_block.
(arm_attempt_dlstp_transform): New declaration.
* config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): Define targethook.
(TARGET_PREDICT_DOLOOP_P): Likewise.
(arm_target_bb_ok_for_lob): Adapt condition.
(arm_mve_get_vctp_lanes): New function.
(arm_dl_usage_type): New internal enum.
(arm_get_required_vpr_reg): New function.
(arm_get_required_vpr_reg_param): New function.
(arm_get_required_vpr_reg_ret_val): New function.
(arm_mve_get_loop_vctp): New function.
(arm_mve_insn_predicated_by): New function.
(arm_mve_across_lane_insn_p): New function.
(arm_mve_load_store_insn_p): New function.
(arm_mve_impl_pred_on_outputs_p): New function.
(arm_mve_impl_pred_on_inputs_p): New function.
(arm_last_vect_def_insn): New function.
(arm_mve_impl_predicated_p): New function.
(arm_mve_check_reg_origin_is_num_elems): New function.
(arm_mve_dlstp_check_inc_counter): New function.
(arm_mve_dlstp_check_dec_counter): New function.
(arm_mve_loop_valid_for_dlstp): New function.
(arm_predict_doloop_p): New function.
(arm_loop_unroll_adjust): New function.
(arm_emit_mve_unpredicated_insn_to_seq): New function.
(arm_attempt_dlstp_transform): New function.
* config/arm/arm.opt (mdlstp): New option.
* config/arm/iterators.md (dlstp_elemsize, letp_num_lanes,
letp_num_lanes_neg, letp_num_lanes_minus_1): New attributes.
(DLSTP, LETP): New iterators.
(predicated_doloop_end_internal): New pattern.
(dlstp_insn): New pattern.
* config/arm/thumb2.md (doloop_end): Adapt to support tail-predicated
loops.
(doloop_begin): Likewise.
* config/arm/types.md (mve_misc): New mve type to represent
predicated_loop_end insn sequences.
* config/arm/unspecs.md:
(DLSTP8, DLSTP16, DLSTP32, DLSTP64,
LETP8, LETP16, LETP32, LETP64): New unspecs for DLSTP and LETP.

gcc/testsuite/ChangeLog:

* gcc.target/arm/lob.h: Add new helpers.
* gcc.target/arm/lob1.c: Use new helpers.
* gcc.target/arm/lob6.c: Likewise.
* gcc.target/arm/mve/dlstp-compile-asm-1.c: New test.
* gcc.target/arm/mve/dlstp-compile-asm-2.c: New test.
* gcc.target/arm/mve/dlstp-compile-asm-3.c: New test.
* gcc.target/arm/mve/dlstp-int8x16.c: New test.
* gcc.target/arm/mve/dlstp-int8x16-run.c: New test.
* gcc.target/arm/mve/dlstp-int16x8.c: New test.
* gcc.target/arm/mve/dlstp-int16x8-run.c: New test.
* gcc.target/arm/mve/dlstp-int32x4.c: New test.
* gcc.target/arm/mve/dlstp-int32x4-run.c: New test.
* gcc.target/arm/mve/dlstp-int64x2.c: New test.
* gcc.target/arm/mve/dlstp-int64x2-run.c: New test.
* gcc.target/arm/mve/dlstp-invalid-asm.c: New test.

Co-authored-by: Stam Markianos-Wright 
---
 gcc/config/arm/arm-protos.h   |4 +-
 gcc/config/arm/arm.cc | 1249 -
 gcc/config/arm/arm.opt|3 +
 gcc/config/arm/iterators.md   |   15 +
 gcc/config/arm/mve.md |   50 +
 gcc/config/arm/thumb2.md  |  138 +-
 gcc/config/arm/types.md   |6 +-
 gcc/config/arm/unspecs.md |   14 +-
 gcc/testsuite/gcc.target/arm/lob.h|  128 +-
 gcc/testsuite/gcc.target/arm/lob1.c   |   23 +-
 gcc/testsuite/gcc.target/arm/lob6.c   |8 +-
 .../gcc.target/arm/mve/dlstp-compile-asm-1.c  |  146 ++
 .../gcc.target/arm/mve/dlstp-compile-asm-2.c  |  749 ++
 .../gcc.target/arm/mve/dlstp-compile-asm-3.c  |   46 +
 .../gcc.target/arm/mve/dlstp-int16x8-run.c|   44 +
 .../gcc.target/arm/mve/dlstp-int16x8.c|   31 +
 .../gcc.target/arm/mve/dlstp-int32x4-run.c|   45 +
 .../gcc.target/arm/mve/dlstp-int32x4.c|   31 +
 .../gcc.target/arm/mve/dlstp-int64x2-run.c|   48 +
 .../gcc.target/arm/mve/dlstp-int64x2.c|   28 +
 .../gcc.target/arm/mve/dlstp-int8x16-run.c|   44 +
 .../gcc.target/arm/mve/dlstp-int8x16.c|   32 +
 .../gcc.target/arm/mve/dlstp-invalid-asm.c|  521 +++
 23 files changed, 3321 insertions(+), 82 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
 create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
 create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
 create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
 create mode 100644 gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
 create mode 100644 

[PATCH v6 4/5] doloop: Add support for predicated vectorized loops

2024-02-27 Thread Andre Vieira

This patch adds support in the target agnostic doloop pass for the detection of
predicated vectorized hardware loops.  Arm is currently the only target that
will make use of this feature.

gcc/ChangeLog:

* df-core.cc (df_bb_regno_only_def_find): New helper function.
* df.h (df_bb_regno_only_def_find): Declare new function.
* loop-doloop.cc (doloop_condition_get): Add support for detecting
predicated vectorized hardware loops.
(doloop_modify): Add support for GTU condition checks.
(doloop_optimize): Update costing computation to support alterations to
desc->niter_expr by the backend.

Co-authored-by: Stam Markianos-Wright 
---
 gcc/df-core.cc |  15 +
 gcc/df.h   |   1 +
 gcc/loop-doloop.cc | 164 +++--
 3 files changed, 113 insertions(+), 67 deletions(-)

diff --git a/gcc/df-core.cc b/gcc/df-core.cc
index f0eb4c93957..b0e8a88d433 100644
--- a/gcc/df-core.cc
+++ b/gcc/df-core.cc
@@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned int regno)
   return NULL;
 }
 
+/* Return the one and only def of REGNO within BB.  If there is no def or
+   there are multiple defs, return NULL.  */
+
+df_ref
+df_bb_regno_only_def_find (basic_block bb, unsigned int regno)
+{
+  df_ref temp = df_bb_regno_first_def_find (bb, regno);
+  if (!temp)
+return NULL;
+  else if (temp == df_bb_regno_last_def_find (bb, regno))
+return temp;
+  else
+return NULL;
+}
+
 /* Finds the reference corresponding to the definition of REG in INSN.
DF is the dataflow object.  */
 
diff --git a/gcc/df.h b/gcc/df.h
index 84e5aa8b524..c4e690b40cf 100644
--- a/gcc/df.h
+++ b/gcc/df.h
@@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void);
 #endif
 extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int);
 extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int);
+extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int);
 extern df_ref df_find_def (rtx_insn *, rtx);
 extern bool df_reg_defined (rtx_insn *, rtx);
 extern df_ref df_find_use (rtx_insn *, rtx);
diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
index 529e810e530..8953e1de960 100644
--- a/gcc/loop-doloop.cc
+++ b/gcc/loop-doloop.cc
@@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat)
  forms:
 
  1)  (parallel [(set (pc) (if_then_else (condition)
-	  			(label_ref (label))
-(pc)))
-	 (set (reg) (plus (reg) (const_int -1)))
-	 (additional clobbers and uses)])
+	(label_ref (label))
+	(pc)))
+		 (set (reg) (plus (reg) (const_int -1)))
+		 (additional clobbers and uses)])
 
  The branch must be the first entry of the parallel (also required
  by jump.cc), and the second entry of the parallel must be a set of
@@ -96,19 +96,33 @@ doloop_condition_get (rtx_insn *doloop_pat)
  the loop counter in an if_then_else too.
 
  2)  (set (reg) (plus (reg) (const_int -1))
- (set (pc) (if_then_else (reg != 0)
-	 (label_ref (label))
-			 (pc))).  
+	 (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+ (pc))).
 
- Some targets (ARM) do the comparison before the branch, as in the
+ 3) Some targets (Arm) do the comparison before the branch, as in the
  following form:
 
- 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
-   (set (reg) (plus (reg) (const_int -1)))])
-(set (pc) (if_then_else (cc == NE)
-(label_ref (label))
-(pc))) */
-
+ (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0))
+		(set (reg) (plus (reg) (const_int -1)))])
+ (set (pc) (if_then_else (cc == NE)
+			 (label_ref (label))
+			 (pc)))
+
+  4) This form supports a construct that is used to represent a vectorized
+  do loop with predication, however we do not need to care about the
+  details of the predication here.
+  Arm uses this construct to support MVE tail predication.
+
+  (parallel
+   [(set (pc)
+	 (if_then_else (gtu (plus (reg) (const_int -n))
+(const_int n-1))
+			   (label_ref)
+			   (pc)))
+	(set (reg) (plus (reg) (const_int -n)))
+	(additional clobbers and uses)])
+ */
   pattern = PATTERN (doloop_pat);
 
   if (GET_CODE (pattern) != PARALLEL)
@@ -173,15 +187,17 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 
-  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
-  || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !rtx_equal_p (XEXP (inc_src, 0), reg)
+  || 

[PATCH v6 1/5] arm: Add define_attr to create a mapping between MVE predicated and unpredicated insns

2024-02-27 Thread Andre Vieira

This patch adds an attribute to the mve md patterns to be able to identify
predicable MVE instructions and what their predicated and unpredicated variants
are.  This attribute is used to encode the icode of the unpredicated variant of
an instruction in its predicated variant.

This will make it possible for us to transform VPT-predicated insns in
the insn chain into their unpredicated equivalents when transforming the loop
into a MVE Tail-Predicated Low Overhead Loop. For example:
`mve_vldrbq_z_ -> mve_vldrbq_`.

gcc/ChangeLog:

* config/arm/arm.md (mve_unpredicated_insn): New attribute.
* config/arm/arm.h (MVE_VPT_PREDICATED_INSN_P): New define.
(MVE_VPT_UNPREDICATED_INSN_P): Likewise.
(MVE_VPT_PREDICABLE_INSN_P): Likewise.
* config/arm/vec-common.md (mve_vshlq_): Add attribute.
* config/arm/mve.md (arm_vcx1q_p_v16qi): Add attribute.
(arm_vcx1qv16qi): Likewise.
(arm_vcx1qav16qi): Likewise.
(arm_vcx1qv16qi): Likewise.
(arm_vcx2q_p_v16qi): Likewise.
(arm_vcx2qv16qi): Likewise.
(arm_vcx2qav16qi): Likewise.
(arm_vcx2qv16qi): Likewise.
(arm_vcx3q_p_v16qi): Likewise.
(arm_vcx3qv16qi): Likewise.
(arm_vcx3qav16qi): Likewise.
(arm_vcx3qv16qi): Likewise.
(@mve_q_): Likewise.
(@mve_q_int_): Likewise.
(@mve_q_v4si): Likewise.
(@mve_q_n_): Likewise.
(@mve_q_r_): Likewise.
(@mve_q_f): Likewise.
(@mve_q_m_): Likewise.
(@mve_q_m_n_): Likewise.
(@mve_q_m_r_): Likewise.
(@mve_q_m_f): Likewise.
(@mve_q_int_m_): Likewise.
(@mve_q_p_v4si): Likewise.
(@mve_q_p_): Likewise.
(@mve_q_): Likewise.
(@mve_q_f): Likewise.
(@mve_q_m_): Likewise.
(@mve_q_m_f): Likewise.
(mve_vq_f): Likewise.
(mve_q): Likewise.
(mve_q_f): Likewise.
(mve_vadciq_v4si): Likewise.
(mve_vadciq_m_v4si): Likewise.
(mve_vadcq_v4si): Likewise.
(mve_vadcq_m_v4si): Likewise.
(mve_vandq_): Likewise.
(mve_vandq_f): Likewise.
(mve_vandq_m_): Likewise.
(mve_vandq_m_f): Likewise.
(mve_vandq_s): Likewise.
(mve_vandq_u): Likewise.
(mve_vbicq_): Likewise.
(mve_vbicq_f): Likewise.
(mve_vbicq_m_): Likewise.
(mve_vbicq_m_f): Likewise.
(mve_vbicq_m_n_): Likewise.
(mve_vbicq_n_): Likewise.
(mve_vbicq_s): Likewise.
(mve_vbicq_u): Likewise.
(@mve_vclzq_s): Likewise.
(mve_vclzq_u): Likewise.
(@mve_vcmp_q_): Likewise.
(@mve_vcmp_q_n_): Likewise.
(@mve_vcmp_q_f): Likewise.
(@mve_vcmp_q_n_f): Likewise.
(@mve_vcmp_q_m_f): Likewise.
(@mve_vcmp_q_m_n_): Likewise.
(@mve_vcmp_q_m_): Likewise.
(@mve_vcmp_q_m_n_f): Likewise.
(mve_vctpq): Likewise.
(mve_vctpq_m): Likewise.
(mve_vcvtaq_): Likewise.
(mve_vcvtaq_m_): Likewise.
(mve_vcvtbq_f16_f32v8hf): Likewise.
(mve_vcvtbq_f32_f16v4sf): Likewise.
(mve_vcvtbq_m_f16_f32v8hf): Likewise.
(mve_vcvtbq_m_f32_f16v4sf): Likewise.
(mve_vcvtmq_): Likewise.
(mve_vcvtmq_m_): Likewise.
(mve_vcvtnq_): Likewise.
(mve_vcvtnq_m_): Likewise.
(mve_vcvtpq_): Likewise.
(mve_vcvtpq_m_): Likewise.
(mve_vcvtq_from_f_): Likewise.
(mve_vcvtq_m_from_f_): Likewise.
(mve_vcvtq_m_n_from_f_): Likewise.
(mve_vcvtq_m_n_to_f_): Likewise.
(mve_vcvtq_m_to_f_): Likewise.
(mve_vcvtq_n_from_f_): Likewise.
(mve_vcvtq_n_to_f_): Likewise.
(mve_vcvtq_to_f_): Likewise.
(mve_vcvttq_f16_f32v8hf): Likewise.
(mve_vcvttq_f32_f16v4sf): Likewise.
(mve_vcvttq_m_f16_f32v8hf): Likewise.
(mve_vcvttq_m_f32_f16v4sf): Likewise.
(mve_vdwdupq_m_wb_u_insn): Likewise.
(mve_vdwdupq_wb_u_insn): Likewise.
(mve_veorq_s>): Likewise.
(mve_veorq_u>): Likewise.
(mve_veorq_f): Likewise.
(mve_vidupq_m_wb_u_insn): Likewise.
(mve_vidupq_u_insn): Likewise.
(mve_viwdupq_m_wb_u_insn): Likewise.
(mve_viwdupq_wb_u_insn): Likewise.
(mve_vldrbq_): Likewise.
(mve_vldrbq_gather_offset_): Likewise.
(mve_vldrbq_gather_offset_z_): Likewise.
(mve_vldrbq_z_): Likewise.
(mve_vldrdq_gather_base_v2di): Likewise.
(mve_vldrdq_gather_base_wb_v2di_insn): Likewise.
(mve_vldrdq_gather_base_wb_z_v2di_insn): Likewise.
(mve_vldrdq_gather_base_z_v2di): Likewise.
(mve_vldrdq_gather_offset_v2di): Likewise.
(mve_vldrdq_gather_offset_z_v2di): Likewise.
(mve_vldrdq_gather_shifted_offset_v2di): Likewise.
(mve_vldrdq_gather_shifted_offset_z_v2di): Likewise.
(mve_vldrhq_): Likewise.
(mve_vldrhq_fv8hf): Likewise.
(mve_vldrhq_gather_offset_): 

[PATCH v6 2/5] arm: Annotate instructions with mve_safe_imp_xlane_pred

2024-02-27 Thread Andre Vieira

This patch annotates some MVE across lane instructions with a new attribute.
We use this attribute to let the compiler know that these instructions can be
safely implicitly predicated when tail predicating if their operands are
guaranteed to have zeroed tail predicated lanes.  These instructions were
selected because having the value 0 in those lanes or 'tail-predicating' those
lanes have the same effect.

gcc/ChangeLog:

* config/arm/arm.md (mve_safe_imp_xlane_pred): New attribute.
* config/arm/iterators.md (mve_vmaxmin_safe_imp): New iterator
attribute.
* config/arm/mve.md (vaddvq_s, vaddvq_u, vaddlvq_s, vaddlvq_u,
vaddvaq_s, vaddvaq_u, vmaxavq_s, vmaxvq_u, vmladavq_s, vmladavq_u,
vmladavxq_s, vmlsdavq_s, vmlsdavxq_s, vaddlvaq_s, vaddlvaq_u,
vmlaldavq_u, vmlaldavq_s, vmlaldavq_u, vmlaldavxq_s, vmlsldavq_s,
vmlsldavxq_s, vrmlaldavhq_u, vrmlaldavhq_s, vrmlaldavhxq_s,
vrmlsldavhq_s, vrmlsldavhxq_s, vrmlaldavhaq_s, vrmlaldavhaq_u,
vrmlaldavhaxq_s, vrmlsldavhaq_s, vrmlsldavhaxq_s, vabavq_s, vabavq_u,
vmladavaq_u, vmladavaq_s, vmladavaxq_s, vmlsdavaq_s, vmlsdavaxq_s,
vmlaldavaq_s, vmlaldavaq_u, vmlaldavaxq_s, vmlsldavaq_s,
vmlsldavaxq_s): Added mve_safe_imp_xlane_pred.
---
 gcc/config/arm/arm.md   |  6 ++
 gcc/config/arm/iterators.md |  8 
 gcc/config/arm/mve.md   | 12 
 3 files changed, 26 insertions(+)

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 81290e83818..814e871acea 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -130,6 +130,12 @@ (define_attr "predicated" "yes,no" (const_string "no"))
 ; encode that it is a predicable instruction.
 (define_attr "mve_unpredicated_insn" "" (symbol_ref "CODE_FOR_nothing"))
 
+; An attribute used by the loop-doloop pass when determining whether it is
+; safe to predicate a MVE instruction, that operates across lanes, and was
+; previously not predicated.  The pass will still check whether all inputs
+; are predicated by the VCTP predication mask.
+(define_attr "mve_safe_imp_xlane_pred" "yes,no" (const_string "no"))
+
 ; LENGTH of an instruction (in bytes)
 (define_attr "length" ""
   (const_int 4))
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 7600bf62531..22b3ddf5637 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -869,6 +869,14 @@ (define_code_attr mve_addsubmul [
 		 (plus "vadd")
 		 ])
 
+(define_int_attr mve_vmaxmin_safe_imp [
+		 (VMAXVQ_U "yes")
+		 (VMAXVQ_S "no")
+		 (VMAXAVQ_S "yes")
+		 (VMINVQ_U "no")
+		 (VMINVQ_S "no")
+		 (VMINAVQ_S "no")])
+
 (define_int_attr mve_cmp_op1 [
 		 (VCMPCSQ_M_U "cs")
 		 (VCMPEQQ_M_S "eq") (VCMPEQQ_M_U "eq")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 8aa0bded7f0..d7bdcd862f8 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -393,6 +393,7 @@ (define_insn "@mve_q_"
   "TARGET_HAVE_MVE"
   ".%#\t%0, %q1"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_"))
+  (set_attr "mve_safe_imp_xlane_pred" "yes")
   (set_attr "type" "mve_move")
 ])
 
@@ -529,6 +530,7 @@ (define_insn "@mve_q_v4si"
   "TARGET_HAVE_MVE"
   ".32\t%Q0, %R0, %q1"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_v4si"))
+  (set_attr "mve_safe_imp_xlane_pred" "yes")
   (set_attr "type" "mve_move")
 ])
 
@@ -802,6 +804,7 @@ (define_insn "@mve_q_"
   "TARGET_HAVE_MVE"
   ".%#\t%0, %q2"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_"))
+  (set_attr "mve_safe_imp_xlane_pred" "yes")
   (set_attr "type" "mve_move")
 ])
 
@@ -1014,6 +1017,7 @@ (define_insn "@mve_q_"
   "TARGET_HAVE_MVE"
   ".%#\t%0, %q2"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_"))
+  (set_attr "mve_safe_imp_xlane_pred" "<mve_vmaxmin_safe_imp>")
   (set_attr "type" "mve_move")
 ])
 
@@ -1033,6 +1037,7 @@ (define_insn "@mve_q_"
   "TARGET_HAVE_MVE"
   ".%#\t%0, %q1, %q2"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_"))
+  (set_attr "mve_safe_imp_xlane_pred" "yes")
   (set_attr "type" "mve_move")
 ])
 
@@ -1219,6 +1224,7 @@ (define_insn "@mve_q_v4si"
   "TARGET_HAVE_MVE"
   ".32\t%Q0, %R0, %q2"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_v4si"))
+  (set_attr "mve_safe_imp_xlane_pred" "yes")
   (set_attr "type" "mve_move")
 ])
 
@@ -1450,6 +1456,7 @@ (define_insn "@mve_q_"
   "TARGET_HAVE_MVE"
   ".%#\t%Q0, %R0, %q1, %q2"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_"))
+  (set_attr "mve_safe_imp_xlane_pred" "yes")
   (set_attr "type" "mve_move")
 ])
 
@@ -1588,6 +1595,7 @@ (define_insn "@mve_q_v4si"
   "TARGET_HAVE_MVE"
   ".32\t%Q0, %R0, %q1, %q2"
  [(set (attr "mve_unpredicated_insn") (symbol_ref "CODE_FOR_mve_q_v4si"))
+  (set_attr "mve_safe_imp_xlane_pred" "yes")
   (set_attr "type" "mve_move")
 ])
 
@@ -1725,6 +1733,7 @@ (define_insn "@mve_q_v4si"
   "TARGET_HAVE_MVE"
   ".32\t%Q0, %R0, %q2, %q3"
  [(set 

[PATCH v6 3/5] arm: Fix a wrong attribute use and remove unused unspecs and iterators

2024-02-27 Thread Andre Vieira

This patch fixes the erroneous use of a mode attribute without a mode iterator
in the pattern and removes unused unspecs and iterators.

gcc/ChangeLog:

* config/arm/iterators.md (supf): Remove VMLALDAVXQ_U, VMLALDAVXQ_P_U,
VMLALDAVAXQ_U cases.
(VMLALDAVXQ): Remove iterator.
(VMLALDAVXQ_P): Likewise.
(VMLALDAVAXQ): Likewise.
* config/arm/mve.md (mve_vstrwq_p_fv4sf): Replace use of 
mode iterator attribute with V4BI mode.
* config/arm/unspecs.md (VMLALDAVXQ_U, VMLALDAVXQ_P_U,
VMLALDAVAXQ_U): Remove unused unspecs.
---
 gcc/config/arm/iterators.md | 9 +++--
 gcc/config/arm/mve.md   | 2 +-
 gcc/config/arm/unspecs.md   | 3 ---
 3 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 22b3ddf5637..3206bcab4cf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2370,7 +2370,7 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		   (VSUBQ_S "s") (VSUBQ_U "u") (VADDVAQ_S "s")
 		   (VADDVAQ_U "u") (VADDLVAQ_S "s") (VADDLVAQ_U "u")
 		   (VBICQ_N_S "s") (VBICQ_N_U "u") (VMLALDAVQ_U "u")
-		   (VMLALDAVQ_S "s") (VMLALDAVXQ_U "u") (VMLALDAVXQ_S "s")
+		   (VMLALDAVQ_S "s") (VMLALDAVXQ_S "s")
 		   (VMOVNBQ_U "u") (VMOVNBQ_S "s") (VMOVNTQ_U "u")
 		   (VMOVNTQ_S "s") (VORRQ_N_S "s") (VORRQ_N_U "u")
 		   (VQMOVNBQ_U "u") (VQMOVNBQ_S "s") (VQMOVNTQ_S "s")
@@ -2412,8 +2412,8 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
 		   (VREV16Q_M_S "s") (VREV16Q_M_U "u")
 		   (VQRSHRNTQ_N_U "u") (VMOVNTQ_M_U "u") (VMOVLBQ_M_U "u")
 		   (VMLALDAVAQ_U "u") (VQSHRNBQ_N_U "u") (VSHRNBQ_N_U "u")
-		   (VRSHRNBQ_N_U "u") (VMLALDAVXQ_P_U "u")
-		   (VMVNQ_M_N_U "u") (VQSHRNTQ_N_U "u") (VMLALDAVAXQ_U "u")
+		   (VRSHRNBQ_N_U "u")
+		   (VMVNQ_M_N_U "u") (VQSHRNTQ_N_U "u")
 		   (VQMOVNTQ_M_U "u") (VSHRNTQ_N_U "u") (VCVTMQ_M_S "s")
 		   (VCVTMQ_M_U "u") (VCVTNQ_M_S "s") (VCVTNQ_M_U "u")
 		   (VCVTPQ_M_S "s") (VCVTPQ_M_U "u") (VADDLVAQ_P_S "s")
@@ -2762,7 +2762,6 @@ (define_int_iterator VSUBQ_N [VSUBQ_N_S VSUBQ_N_U])
 (define_int_iterator VADDLVAQ [VADDLVAQ_S VADDLVAQ_U])
 (define_int_iterator VBICQ_N [VBICQ_N_S VBICQ_N_U])
 (define_int_iterator VMLALDAVQ [VMLALDAVQ_U VMLALDAVQ_S])
-(define_int_iterator VMLALDAVXQ [VMLALDAVXQ_U VMLALDAVXQ_S])
 (define_int_iterator VMOVNBQ [VMOVNBQ_U VMOVNBQ_S])
 (define_int_iterator VMOVNTQ [VMOVNTQ_S VMOVNTQ_U])
 (define_int_iterator VORRQ_N [VORRQ_N_U VORRQ_N_S])
@@ -2817,11 +2816,9 @@ (define_int_iterator VMLALDAVAQ [VMLALDAVAQ_S VMLALDAVAQ_U])
 (define_int_iterator VQSHRNBQ_N [VQSHRNBQ_N_U VQSHRNBQ_N_S])
 (define_int_iterator VSHRNBQ_N [VSHRNBQ_N_U VSHRNBQ_N_S])
 (define_int_iterator VRSHRNBQ_N [VRSHRNBQ_N_S VRSHRNBQ_N_U])
-(define_int_iterator VMLALDAVXQ_P [VMLALDAVXQ_P_U VMLALDAVXQ_P_S])
 (define_int_iterator VQMOVNTQ_M [VQMOVNTQ_M_U VQMOVNTQ_M_S])
 (define_int_iterator VMVNQ_M_N [VMVNQ_M_N_U VMVNQ_M_N_S])
 (define_int_iterator VQSHRNTQ_N [VQSHRNTQ_N_U VQSHRNTQ_N_S])
-(define_int_iterator VMLALDAVAXQ [VMLALDAVAXQ_S VMLALDAVAXQ_U])
 (define_int_iterator VSHRNTQ_N [VSHRNTQ_N_S VSHRNTQ_N_U])
 (define_int_iterator VCVTMQ_M [VCVTMQ_M_S VCVTMQ_M_U])
 (define_int_iterator VCVTNQ_M [VCVTNQ_M_S VCVTNQ_M_U])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index d7bdcd862f8..9fe51298cdc 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -4605,7 +4605,7 @@ (define_insn "mve_vstrwq_p_fv4sf"
   [(set (match_operand:V4SI 0 "mve_memory_operand" "=Ux")
 	(unspec:V4SI
 	 [(match_operand:V4SF 1 "s_register_operand" "w")
-	  (match_operand:<MVE_VPRED> 2 "vpr_register_operand" "Up")
+	  (match_operand:V4BI 2 "vpr_register_operand" "Up")
 	  (match_dup 0)]
 	 VSTRWQ_F))]
   "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index b9db306c067..46ac8b37157 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -717,7 +717,6 @@ (define_c_enum "unspec" [
   VCVTBQ_F16_F32
   VCVTTQ_F16_F32
   VMLALDAVQ_U
-  VMLALDAVXQ_U
   VMLALDAVXQ_S
   VMLALDAVQ_S
   VMLSLDAVQ_S
@@ -934,7 +933,6 @@ (define_c_enum "unspec" [
   VSHRNBQ_N_S
   VRSHRNBQ_N_S
   VRSHRNBQ_N_U
-  VMLALDAVXQ_P_U
   VMLALDAVXQ_P_S
   VQMOVNTQ_M_U
   VQMOVNTQ_M_S
@@ -943,7 +941,6 @@ (define_c_enum "unspec" [
   VQSHRNTQ_N_U
   VQSHRNTQ_N_S
   VMLALDAVAXQ_S
-  VMLALDAVAXQ_U
   VSHRNTQ_N_S
   VSHRNTQ_N_U
   VCVTBQ_M_F16_F32


[PATCH v5 0/5] arm: Add support for MVE Tail-Predicated Low Overhead Loops

2024-02-27 Thread Andre Vieira
Hi,

Re-ordered patches, our latest plan is to only commit patches 1-3, and leave
4-5 for GCC 15, as we believe it is too late in Stage 4 to be making changes to
target agnostic parts, especially since these affect so many ports that we can
not easily test.

[1/5] arm: Add define_attr to create a mapping between MVE predicated and
unpredicated insns
[2/5] arm: Annotate instructions with mve_safe_imp_xlane_pred
[3/5] arm: Fix a wrong attribute use and remove unused unspecs and iterators
[4/5] doloop: Add support for predicated vectorized loops
[5/5] arm: Add support for MVE Tail-Predicated Low Overhead Loops

Original cover letter:
This patch adds support for Arm's MVE Tail Predicated Low Overhead Loop
feature.

The M-class Arm-ARM:
https://developer.arm.com/documentation/ddi0553/bu/?lang=en
Section B5.5.1 "Loop tail predication" describes the feature
we are adding support for with this patch (although
we only add codegen for DLSTP/LETP instruction loops).

Previously with commit d2ed233cb94 we'd added support for
non-MVE DLS/LE loops through the loop-doloop pass, which, given
a standard MVE loop like:

```
void  __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int 
n)
{
  while (n > 0)
{
  mve_pred16_t p = vctp16q (n);
  int16x8_t va = vldrhq_z_s16 (a, p);
  int16x8_t vb = vldrhq_z_s16 (b, p);
  int16x8_t vc = vaddq_x_s16 (va, vb, p);
  vstrhq_p_s16 (c, vc, p);
  c+=8;
  a+=8;
  b+=8;
  n-=8;
}
}
```
.. would output:

```

dls lr, lr
.L3:
vctp.16 r3
vmrs ip, P0  @ movhi
sxth ip, ip
vmsr P0, ip @ movhi
mov r4, r0
vpst
vldrht.16   q2, [r4]
mov r4, r1
vmov q3, q0
vpst
vldrht.16   q1, [r4]
mov r4, r2
vpst
vaddt.i16   q3, q2, q1
subs r3, r3, #8
vpst
vstrht.16   q3, [r4]
adds r0, r0, #16
adds r1, r1, #16
adds r2, r2, #16
le  lr, .L3
```

where the LE instruction will decrement LR by 1, compare and
branch if needed.

(there are also other inefficiencies with the above code, like the
pointless vmrs/sxth/vmsr on the VPR and the adds not being merged
into the vldrht/vstrht as a #16 offsets and some random movs!
But that's different problems...)

The MVE version is similar, except that:
* Instead of DLS/LE the instructions are DLSTP/LETP.
* Instead of pre-calculating the number of iterations of the
  loop, we place the number of elements to be processed by the
  loop into LR.
* Instead of decrementing the LR by one, LETP will decrement it
  by FPSCR.LTPSIZE, which is the number of elements being
  processed in each iteration: 16 for 8-bit elements, 8 for 16-bit
  elements, etc.
* On the final iteration, automatic Loop Tail Predication is
  performed, as if the instructions within the loop had been VPT
  predicated with a VCTP generating the VPR predicate in every
  loop iteration.

The dlstp/letp loop now looks like:

```

dlstp.16 lr, r3
.L14:
mov r3, r0
vldrh.16 q3, [r3]
mov r3, r1
vldrh.16 q2, [r3]
mov r3, r2
vadd.i16  q3, q3, q2
adds r0, r0, #16
vstrh.16 q3, [r3]
adds r1, r1, #16
adds r2, r2, #16
letp lr, .L14

```

Since the loop tail predication is automatic, we have eliminated
the VCTP that had been specified by the user in the intrinsic
and converted the VPT-predicated instructions into their
unpredicated equivalents (which also saves us from VPST insns).

The LETP instruction here decrements LR by 8 in each iteration.

Stam Markianos-Wright (1):
  arm: Add define_attr to create a mapping between MVE predicated and
unpredicated insns

Andre Vieira (4):
  arm: Annotate instructions with mve_safe_imp_xlane_pred
  arm: Fix a wrong attribute use and remove unused unspecs and iterators
  doloop: Add support for predicated vectorized loops
  arm: Add support for MVE Tail-Predicated Low Overhead Loops


-- 
2.17.1


Re: [r14-9173 Regression] FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr" on Linux/x86_64

2024-02-27 Thread Richard Biener
On Tue, 27 Feb 2024, Jeff Law wrote:

> 
> 
> On 2/27/24 00:43, Richard Biener wrote:
> > On Tue, 27 Feb 2024, haochen.jiang wrote:
> > 
> >> On Linux/x86_64,
> >>
> >> af66ad89e8169f44db723813662917cf4cbb78fc is the first bad commit
> >> commit af66ad89e8169f44db723813662917cf4cbb78fc
> >> Author: Richard Biener 
> >> Date:   Fri Feb 23 16:06:05 2024 +0100
> >>
> >>  middle-end/114070 - folding breaking VEC_COND expansion
> >>
> >> caused
> >>
> >> FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"
> > 
> > This shows that the x86 backend is missing vcond_mask_qiqi and friends
> > (for AVX512 mask modes).  Either that or both expand_vec_cond_expr_p
> > and all the machinery behind it (ISEL pass, lowering) should handle
> > pure integer mode VEC_COND_EXPR via bit operations.  I think quite some
> > targets now implement patterns for these variants, whatever their
> > boolean vector modes are.
> There may be more going on than just that.  The andnot-2 test started
> regressing on most targets overnight, including on targets without vector
> capabilities.

Yes, we fail this generic vector simplification test now (not sure
why the test didn't test forwprop1).  As said the problem is that
we can't test whether the existing VEC_COND_EXPR is handled
(we just can test if it's handled by vcond_mask).

As can be seen from x86 we don't want to turn the existing working
vcond into unsupported vcond_mask as that will fail to expand
(that's what the patch fixed for SPARC) or similarly bad, would
be lowered during vector lowering into inefficient scalar code.

In the end this is fallout from splitting out the condition from
VEC_COND_EXPR but not getting rid of all vcond{,u,eq} expanders,
rewriting target support to vcmp{,u,eq} + vcond_mask ... (those partial
transitions...).

Grepping shows there's unfortunately plenty of targets with
vcond{,u,eq} patterns, maybe most of them have vcmp{,u,eq}
patterns as well but only cutting those off hard by simply
removing the expansion path will tell who's affected.

One could try to fix this by adding a 2nd set of patterns where
the defining conditionals are visible, so we can check for
vcond[u] expansion support (and compare before/after state),
and allow the patterns w/o visible compares only when vcond_mask
is available.

Richard.

> fr30-elf for example:
> 
> 
> > Tests that now fail, but worked before (2 tests):
> > 
> > fr30-sim: gcc: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3
> > "_expr"
> > fr30-sim: gcc: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3
> > "_expr"
> 
> 
> Jeff


Re: [patch] OpenACC: Add Fortran routines acc_{alloc,free,hostptr,deviceptr,memcpy_{to,from}_device*}

2024-02-27 Thread Thomas Schwinge
Hi Tobias!

On 2024-02-27T13:29:33+0100, Tobias Burnus  wrote:
> Thomas Schwinge:
>>>   @table @asis
>>>   @item @emph{Description}
>>> -This function allocates @var{len} bytes of device memory. It returns
>>> +This function allocates @var{bytes} of device memory. It returns

>> Not '@var{bytes} {+bytes+}' or similar?
>
> I think either works – depending how one parses @var{} mentally, 
> one of the variants sounds smooth and the other very odd. But I can/will 
> change it.

Yeah, I see.  Not the strongest argument ("upstream vs. local" style),
but I see that while OpenACC 3.3 doesn't for 'acc_malloc', it does, for
example, for 'acc_copyin' talk about "'bytes' bytes" (or, avoiding the
issue: "'bytes' specifies the data size in bytes").


>>> --- a/libgomp/openacc.f90
>>> +++ b/libgomp/openacc.f90

>> Assuming that 'module openacc_internal' currently is sorted per
>> appearance in the OpenACC specification (?), I suggest we continue to do
>> so.  (..., like in 'openacc_lib.h', too.)

> I will check – it looks only block-wise sorted but I might be wrong.

OK, but please don't sink too much time into that.

> I 
> followed location of the comments, placing it before the routines that 
> followed the comment, assuming that the comments were at the right spot.


>>> @@ -794,6 +881,9 @@ module openacc
>>> ...
>>> +  public :: acc_malloc, acc_free, acc_map_data, acc_unmap_data, 
>>> acc_deviceptr
>>> +  public :: acc_hostptr, acc_memcpy_to_device, acc_memcpy_to_device_async
>>> +  public :: acc_memcpy_from_device, acc_memcpy_from_device_async
>>>   ...
>>> -  ! acc_malloc: Only available in C/C++
>>> -  ! acc_free: Only available in C/C++
>>> -
>>> ...
>>> interface acc_is_present
>>>   procedure :: acc_is_present_32_h
>>>   procedure :: acc_is_present_64_h
>>>   procedure :: acc_is_present_array_h
>>> end interface

>> Is that now a different style that we're not listing the new interfaces
>> in 'module openacc' here?
>
> As there is no precedent for this type of interface, the style is by 
> nature different. But the question is which style is better. The 
> current 'openacc' is very short – and contains not a single specific 
> interface, but only generic interfaces. The actual specific-procedure 
> declarations are only in 'openacc_internal'.
>
> Those new procedures are the first ones that do not have a generic 
> interface and only a specific one. Thus, one can either put the specific 
> one into 'openacc_internal' and refer it from 'openacc' (via 'use 
> openacc_internal' + 'public :: acc_') – or place the 
> interface directly into 'openacc' (and not touching 'openacc_internal' 
> at all).
>
> During development, I accidentally had a mixture of both - and 
> then settled for the current variant. – Possibly, moving the interface 
> to 'openacc' is clearer?
>
> Thoughts?

No, sorry.  As I said: "I don't know much about Fortran interfaces".  :-|


>>> --- /dev/null
>>> +++ b/libgomp/testsuite/libgomp.fortran/acc_host_device_ptr.f90

>>> +  ! The following assumes sizeof(void*) being the same on host and device:

>> That's generally required anyway.
>
> I have to admit that I don't know OpenACC well enough to see whether 
> that's the case or not.

My thinking, "simply", is that this follows implicitly from the fact that
data layout has to match between host and device, and if pointers have
different sizes, that breaks?

For example, OpenACC 3.3, 2.6.4 "Data Structures with Pointers":

| [...]
| When a data object is copied to device memory, the values are copied exactly. 
If the data is a data
| structure that includes a pointer, or is just a pointer, the pointer value 
copied to device memory
| will be the host pointer value. [...]

> And, while I am not very consistent, I do try to 
> document stricter requirements / implementation-specific parts in a 
> testcases.

ACK, that's always good practice.

> I know that OpenMP permits that the pointer size differs

Oh, really!?

> and 'void *p = 
> omp_target_alloc (...);' might in this case not return the device 
> pointer but a handle to the device ptr. (For instance, it could be a 
> pointer to an uint128_t variable for a 128bit device pointer; I think 
> such a hardware exists in real - and uses several bits for other 
> purposes like flags.)

I do see in OpenMP 5.2, 1.2.6 "Data Terminology":

| *device address*  An address of an object that may be referenced on a _target 
device_.

| *device pointer*  An _implementation-defined handle_ that refers to a _device 
address_.

..., and I'm now -- at least vaguely -- curious how OpenMP handles
different-sized pointers for host vs. device in host/device-shared data
layout.  ("Fortunately" ;-) I have too many higher-priority items to look
after, so not able to spend more time on that question...)

> In that case, host-side pointer arithmetic won't work and 
> 'is_device_ptr' clauses etc. need to do transfer work.
>
> But, admittedly, in GCC there it is assumed at many places that both 

Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

2024-02-27 Thread Richard Biener
On Tue, Feb 27, 2024 at 1:57 PM Tamar Christina  wrote:
>
> > Thanks Tamar.
> >
> > > Those two cases also *completely* stop vectorization because of either the
> > > control flow or the fact the vectorizer can't handle complex types.
> >
> > Yes, we eventually would like to vectorize the SAT ALU but we start with 
> > scalar part
> > first.
> > I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> > well with some additions as below.
> > Feel free to correct me if any misunderstandings.
> >
> > 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> > usadd$a3(as well as gen_int_libfunc) for int.
> > 2. We need to implement a default implementation of SAT_ADD if
> > direct_binary_optab_supported_p is false.
> > It looks like the default implementation is difficult to make every 
> > backend happy.
> > That is why you suggest just normal
> > DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> >
> > Thanks Richard.
> >
> > > But what I'd like to see is that we do more instruction selection on 
> > > GIMPLE
> > > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > > passes doing what I'd call instruction selection).  But that means not 
> > > adding
> > > match.pd patterns for that or at least have a separate isel-match.pd
> > > machinery for that.
> >
> > > So as a start I would go for a direct optab and see to recognize it during
> > > ISEL?
> >
> > Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> > to know isel and I am happy to
> > try that once we have conclusion.
> >
>
> So after a lively discussion on IRC, the conclusion is that before we proceed 
> Richi would
> like to see some examples of various operations.  The problem is that 
> unsigned saturating
> addition is the simplest example and it may lead to an implementation 
> strategy that doesn't
> scale.
>
> So I'd suggest writing some example of both signed and unsigned saturating 
> add and multiply
>
> Because signed addition, will likely require a branch and signed 
> multiplication would require a
> larger type.
>
> This would allow us to better understand what kind of gimple we would have to 
> deal with in
> ISEL and VECT if we decide not to lower early.

More specifically before making something like .SAT_ADD a core part of
GIMPLE I'd like
to point out that we have saturating PLUS_EXPR but just for
fixed-point types.  I realize
Joseph thinks that keying this on the type was wrong and it should
have used integer
types and special saturating operations.  Still having both,
type-keyed saturating PLUS_EXPR
and "code"-keyed .SAT_ADD (on integer types only?) looks like a mess.

It might be that the way to go is to turn all existing saturating type
*_EXPR into
.SAT_* internal function calls, in the end mapping to the optabs and
eventual RTX codes.

That could work for both integer types and fixed-point types.

I'll also note that "saturating" is just another variant of overflow
behavior of which we have
trapping (-ftrapv), wrapping (-fwrapv), signed-undefined (default) and
also (kind-of) sanitized.
We do lack direct IL representation of -ftrapv and -fwrapv, the
semantics on a PLUS_EXPR
depend on per-function flags.  Eventually a common representation
could be found here.
For saturating I was thinking of .ADD_OVERFLOW (a, b,
saturation-value), a "trap" could
be a "trapping" saturation value, "undefined" could be a
"not-a-thing".  But I didn't think much
about this.

That said, I would like to see the bigger picture to be kept in mind
before altering the GIMPLE IL.

Adding an internal function for an already present optab is a
no-brainer.  Adding a vectorizer
and/or if-conversion pattern to make use of this during vectorization
is existing practice.
Adding pattern recognition to ISEL or widening-mul passes for
instructions the CPU can do
is existing practice and OK.

Thanks,
Richard.


> Thanks,
> Tamar
>
> > Pan
> >
> > -Original Message-
> > From: Tamar Christina 
> > Sent: Tuesday, February 27, 2024 5:57 PM
> > To: Richard Biener 
> > Cc: Li, Pan2 ; gcc-patches@gcc.gnu.org; 
> > juzhe.zh...@rivai.ai;
> > Wang, Yanzhang ; kito.ch...@gmail.com;
> > richard.sandiford@arm.com2; jeffreya...@gmail.com
> > Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > US_PLUS
> >
> > > -Original Message-
> > > From: Richard Biener 
> > > Sent: Tuesday, February 27, 2024 9:44 AM
> > > To: Tamar Christina 
> > > Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai;
> > > yanzhang.w...@intel.com; kito.ch...@gmail.com;
> > > richard.sandiford@arm.com2; jeffreya...@gmail.com
> > > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn 
> > > saturation
> > > US_PLUS
> > >
> > > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> > >  wrote:
> > > >
> > > > Hi Pan,
> > > >
> > > > > From: Pan Li 
> > > > >
> > > > > Hi Richard & Tamar,
> > > > >
> > > > > Try the DEF_INTERNAL_INT_EXT_FN as your 

[pushed] analyzer: fix ICE on floating-point bounds [PR111881]

2024-02-27 Thread David Malcolm
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Successful run of analyzer integration tests on x86_64-pc-linux-gnu.
Pushed to trunk as r14-9195-g43ad6ce60108ac.

gcc/analyzer/ChangeLog:
PR analyzer/111881
* constraint-manager.cc (bound::ensure_closed): Assert that
m_constant has integral type.
(range::add_bound): Bail out on floating point constants.

gcc/testsuite/ChangeLog:
PR analyzer/111881
* c-c++-common/analyzer/conditionals-pr111881.c: New test.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/constraint-manager.cc| 6 ++
 .../c-c++-common/analyzer/conditionals-pr111881.c | 8 
 2 files changed, 14 insertions(+)
 create mode 100644 gcc/testsuite/c-c++-common/analyzer/conditionals-pr111881.c

diff --git a/gcc/analyzer/constraint-manager.cc 
b/gcc/analyzer/constraint-manager.cc
index e8bcabeb0cd5..a380b95315ee 100644
--- a/gcc/analyzer/constraint-manager.cc
+++ b/gcc/analyzer/constraint-manager.cc
@@ -124,10 +124,12 @@ bound::ensure_closed (enum bound_kind bound_kind)
 For example, convert 3 < x into 4 <= x,
 and convert x < 5 into x <= 4.  */
   gcc_assert (CONSTANT_CLASS_P (m_constant));
+  gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (m_constant)));
   m_constant = fold_build2 (bound_kind == BK_UPPER ? MINUS_EXPR : 
PLUS_EXPR,
TREE_TYPE (m_constant),
m_constant, integer_one_node);
   gcc_assert (CONSTANT_CLASS_P (m_constant));
+  gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (m_constant)));
   m_closed = true;
 }
 }
@@ -306,6 +308,10 @@ range::above_upper_bound (tree rhs_const) const
 bool
 range::add_bound (bound b, enum bound_kind bound_kind)
 {
+  /* Bail out on floating point constants.  */
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (b.m_constant)))
+return true;
+
   b.ensure_closed (bound_kind);
 
   switch (bound_kind)
diff --git a/gcc/testsuite/c-c++-common/analyzer/conditionals-pr111881.c 
b/gcc/testsuite/c-c++-common/analyzer/conditionals-pr111881.c
new file mode 100644
index ..ecf165feeec2
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/analyzer/conditionals-pr111881.c
@@ -0,0 +1,8 @@
+/* Verify we don't ICE on certain float conditionals.  */
+
+/* { dg-additional-options "-Ofast" } */
+
+int test_pr111881 (float sf1)
+{
+  return sf1 <= 0 || sf1 >= 7 ? 0 : sf1;
+}
-- 
2.26.3



Re: [PATCH 2/2] aarch64: Add support for _BitInt

2024-02-27 Thread Andre Vieira (lists)

Hey,

Dropped the first patch and dealt with the comments above, hopefully I 
didn't miss any this time.


--

This patch adds support for C23's _BitInt for the AArch64 port when 
compiling

for little endianness.  Big Endianness requires further target-agnostic
support and we therefore disable it for now.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (TARGET_C_BITINT_TYPE_INFO): Declare MACRO.
(aarch64_bitint_type_info): New function.
(aarch64_return_in_memory_1): Return large _BitInt's in memory.
(aarch64_function_arg_alignment): Adapt to correctly return the ABI
mandated alignment of _BitInt(N) where N > 128 as the alignment of
TImode.
(aarch64_composite_type_p): Return true for _BitInt(N), where N > 128.

libgcc/ChangeLog:

* config/aarch64/t-softfp (softfp_extras): Add floatbitinthf,
floatbitintbf, floatbitinttf and fixtfbitint.
* config/aarch64/libgcc-softfp.ver (GCC_14.0.0): Add __floatbitinthf,
__floatbitintbf, __floatbitinttf and __fixtfbitint.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/bitint-alignments.c: New test.
* gcc.target/aarch64/bitint-args.c: New test.
* gcc.target/aarch64/bitint-sizes.c: New test.


On 02/02/2024 14:46, Jakub Jelinek wrote:

On Thu, Jan 25, 2024 at 05:45:01PM +, Andre Vieira wrote:

This patch adds support for C23's _BitInt for the AArch64 port when compiling
for little endianness.  Big Endianness requires further target-agnostic
support and we therefore disable it for now.

gcc/ChangeLog:

* config/aarch64/aarch64.cc (TARGET_C_BITINT_TYPE_INFO): Declare MACRO.
(aarch64_bitint_type_info): New function.
(aarch64_return_in_memory_1): Return large _BitInt's in memory.
(aarch64_function_arg_alignment): Adapt to correctly return the ABI
mandated alignment of _BitInt(N) where N > 128 as the alignment of
TImode.
(aarch64_composite_type_p): Return true for _BitInt(N), where N > 128.

libgcc/ChangeLog:

* config/aarch64/t-softfp: Add fixtfbitint, floatbitinttf and
floatbitinthf to the softfp_extras variable to ensure the
runtime support is available for _BitInt.


I think this lacks some config/aarch64/t-whatever.ver
additions.
See PR113700 for some more details.
We want the support routines for binary floating point <-> _BitInt
conversions in both libgcc.a and libgcc_s.so.1 and exported from the latter
too at GCC_14.0.0 symver, while decimal floating point <-> _BitInt solely in
libgcc.a (as with all the huge dfp/bid stuff).

Jakub
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
16318bf925883ecedf9345e53fc0824a553b2747..9bd8d22f6edd9f6c77907ec383f9e8bf055cfb8b
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6583,6 +6583,7 @@ aarch64_return_in_memory_1 (const_tree type)
   int count;
 
   if (!AGGREGATE_TYPE_P (type)
+  && TREE_CODE (type) != BITINT_TYPE
   && TREE_CODE (type) != COMPLEX_TYPE
   && TREE_CODE (type) != VECTOR_TYPE)
 /* Simple scalar types always returned in registers.  */
@@ -21895,6 +21896,11 @@ aarch64_composite_type_p (const_tree type,
   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
 return true;
 
+  if (type
+  && TREE_CODE (type) == BITINT_TYPE
+  && int_size_in_bytes (type) > 16)
+return true;
+
   if (mode == BLKmode
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
@@ -28400,6 +28406,42 @@ aarch64_excess_precision (enum excess_precision_type 
type)
   return FLT_EVAL_METHOD_UNPREDICTABLE;
 }
 
+/* Implement TARGET_C_BITINT_TYPE_INFO.
+   Return true if _BitInt(N) is supported and fill its details into *INFO.  */
+bool
+aarch64_bitint_type_info (int n, struct bitint_info *info)
+{
+  if (TARGET_BIG_END)
+return false;
+
+  if (n <= 8)
+info->limb_mode = QImode;
+  else if (n <= 16)
+info->limb_mode = HImode;
+  else if (n <= 32)
+info->limb_mode = SImode;
+  else if (n <= 64)
+info->limb_mode = DImode;
+  else if (n <= 128)
+info->limb_mode = TImode;
+  else
+/* The AAPCS for AArch64 defines _BitInt(N > 128) as an array with
+   type {signed,unsigned} __int128[M] where M*128 >= N.  However, to be
+   able to use libgcc's implementation to support large _BitInt's we need
+   to use a LIMB_MODE that is no larger than 'long long'.  This is why we
+   use DImode for our internal LIMB_MODE and we define the ABI_LIMB_MODE to
+   be TImode to ensure we are ABI compliant.  */
+info->limb_mode = DImode;
+
+  if (n > 128)
+info->abi_limb_mode = TImode;
+  else
+info->abi_limb_mode = info->limb_mode;
+  info->big_endian = TARGET_BIG_END;
+  info->extended = false;
+  return true;
+}
+
 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
scheduled for speculative 

Re: [PATCH] Fix internal error in GIMPLE DSE

2024-02-27 Thread Richard Biener
On Tue, Feb 27, 2024 at 1:50 PM Eric Botcazou  wrote:
>
> Hi,
>
> this is a regression present on the mainline, 13 and 12 branches.  For the
> attached Ada case, it's a tree checking failure on the mainline at -O:
>
> +===GNAT BUG DETECTED==+
> | 14.0.1 20240226 (experimental) [master r14-9171-g4972f97a265]  GCC error:|
> | tree check: expected tree that contains 'decl common' structure, |
> | have 'component_ref' in tree_could_trap_p, at tree-eh.cc:2733|
> | Error detected around /home/eric/cvs/gcc/gcc/testsuite/gnat.dg/opt104.adb:
>
> Time is a 10-byte record and Packed_Rec.T is placed at bit-offset 65 because
> of the packing, so tree-ssa-dse.cc:setup_live_bytes_from_ref has computed a
> const_size of 88 from ref->offset of 65 and ref->max_size of 80.
>
> Then in tree-ssa-dse.cc:compute_trims:
>
> 411   int last_live = bitmap_last_set_bit (live);
> (gdb) next
> 412   if (ref->size.is_constant (&const_size))
> (gdb)
> 414   int last_orig = (const_size / BITS_PER_UNIT) - 1;
> (gdb)
> 418   *trim_tail = last_orig - last_live;
>
> (gdb) call debug_bitmap (live)
> n_bits = 256, set = {0 1 2 3 4 5 6 7 8 9 10 }
> (gdb) p last_live
> $33 = 10
> (gdb) p const_size
> $34 = 80
> (gdb) p last_orig
> $35 = 9
> (gdb) p *trim_tail
> $36 = -1
>
> In other words, compute_trims is overlooking the alignment adjustments applied
> earlier by setup_live_bytes_from_ref.  Moreover it reads:
>
>   /* We use sbitmaps biased such that ref->offset is bit zero and the bitmap
>  extends through ref->size.  So we know that in the original bitmap
>  bits 0..ref->size were true.  We don't actually need the bitmap, just
>  the REF to compute the trims.  */
>
> But setup_live_bytes_from_ref used ref->max_size instead of ref->size.
>
> It appears that all the callers of compute_trims assume that ref->offset is
> byte aligned and that the trimmed bytes are relative to ref->size, so the
> patch simply adds an early return if either condition is not fulfilled.
>
> Tested on x86-64/Linux, OK for all the affected branches?

OK.

Thanks,
Richard.

>
> 2024-02-27  Eric Botcazou  
>
> * tree-ssa-dse.cc (compute_trims): Fix description.  Return early
> if ref->offset is not byte aligned or ref->size is not known to be
> equal to ref->max_size.
> (maybe_trim_complex_store): Fix description.
> (maybe_trim_constructor_store): Likewise.
> (maybe_trim_partially_dead_store): Likewise.
>
>
> 2024-02-27  Eric Botcazou  
>
> * gnat.dg/opt104.ads, gnat.dg/opt104.adb: New test.
>
> --
> Eric Botcazou


Re: [r14-9173 Regression] FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr" on Linux/x86_64

2024-02-27 Thread Jeff Law




On 2/27/24 00:43, Richard Biener wrote:

On Tue, 27 Feb 2024, haochen.jiang wrote:


On Linux/x86_64,

af66ad89e8169f44db723813662917cf4cbb78fc is the first bad commit
commit af66ad89e8169f44db723813662917cf4cbb78fc
Author: Richard Biener 
Date:   Fri Feb 23 16:06:05 2024 +0100

 middle-end/114070 - folding breaking VEC_COND expansion

caused

FAIL: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"


This shows that the x86 backend is missing vcond_mask_qiqi and friends
(for AVX512 mask modes).  Either that or both expand_vec_cond_expr_p
and all the machinery behind it (ISEL pass, lowering) should handle
pure integer mode VEC_COND_EXPR via bit operations.  I think quite some
targets now implement patterns for these variants, whatever their
boolean vector modes are.
There may be more going on than just that.  The andnot-2 test started 
regressing on most targets overnight, including on targets without 
vector capabilities.


fr30-elf for example:



Tests that now fail, but worked before (2 tests):

fr30-sim: gcc: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"
fr30-sim: gcc: gcc.dg/tree-ssa/andnot-2.c scan-tree-dump-not forwprop3 "_expr"



Jeff


[PING 4][PATCH v3] rs6000/p8swap: Fix incorrect lane extraction by vec_extract() [PR106770]

2024-02-27 Thread Surya Kumari Jangala
Ping

On 08/01/24 11:19 am, Surya Kumari Jangala wrote:
> Ping
> 
> On 28/11/23 6:24 pm, Surya Kumari Jangala wrote:
>> Ping
>>
>> On 10/11/23 12:27 pm, Surya Kumari Jangala wrote:
>>> Ping
>>>
>>> On 03/11/23 1:14 pm, Surya Kumari Jangala wrote:
 Hi Segher,
 I have incorporated changes in the code as per the review comments 
 provided by you 
 for version 2 of the patch. Please review.

 Regards,
 Surya


 rs6000/p8swap: Fix incorrect lane extraction by vec_extract() [PR106770]

 In the routine rs6000_analyze_swaps(), special handling of swappable
 instructions is done even if the webs that contain the swappable 
 instructions
 are not optimized, i.e., the webs do not contain any permuting load/store
 instructions along with the associated register swap instructions. Doing 
 special
 handling in such webs will result in the extracted lane being adjusted
 unnecessarily for vec_extract.

 Another issue is that existing code treats non-permuting loads/stores as 
 special
 swappables. Non-permuting loads/stores (that have not yet been split into a
 permuting load/store and a swap) are handled by converting them into a 
 permuting
 load/store (which effectively removes the swap). As a result, if special
 swappables are handled only in webs containing permuting loads/stores, then
 non-optimal code is generated for non-permuting loads/stores.

 Hence, in this patch, all webs containing either permuting loads/ stores or
 non-permuting loads/stores are marked as requiring special handling of
 swappables. Swaps associated with permuting loads/stores are marked for 
 removal,
 and non-permuting loads/stores are converted to permuting loads/stores. 
 Then the
 special swappables in the webs are fixed up.

 This patch also ensures that swappable instructions are not modified in the
 following webs as it is incorrect to do so:
  - webs containing permuting load/store instructions and associated swap
instructions that are transformed by converting the permuting memory
instructions into non-permuting instructions and removing the swap
instructions.
  - webs where swap(load(vector constant)) instructions are replaced with
load(swapped vector constant).

 2023-09-10  Surya Kumari Jangala  

 gcc/
PR rtl-optimization/PR106770
* config/rs6000/rs6000-p8swap.cc (non_permuting_mem_insn): New function.
(handle_non_permuting_mem_insn): New function.
(rs6000_analyze_swaps): Handle swappable instructions only in certain
webs.
(web_requires_special_handling): New instance variable.
(handle_special_swappables): Remove handling of non-permuting load/store
instructions.

 gcc/testsuite/
PR rtl-optimization/PR106770
* gcc.target/powerpc/pr106770.c: New test.
 ---

 diff --git a/gcc/config/rs6000/rs6000-p8swap.cc 
 b/gcc/config/rs6000/rs6000-p8swap.cc
 index 0388b9bd736..02ea299bc3d 100644
 --- a/gcc/config/rs6000/rs6000-p8swap.cc
 +++ b/gcc/config/rs6000/rs6000-p8swap.cc
 @@ -179,6 +179,13 @@ class swap_web_entry : public web_entry_base
unsigned int special_handling : 4;
/* Set if the web represented by this entry cannot be optimized.  */
unsigned int web_not_optimizable : 1;
 +  /* Set if the swappable insns in the web represented by this entry
 + have to be fixed. Swappable insns have to be fixed in:
 +   - webs containing permuting loads/stores and the swap insns
 +   in such webs have been marked for removal
 +   - webs where non-permuting loads/stores have been converted
 +   to permuting loads/stores  */
 +  unsigned int web_requires_special_handling : 1;
/* Set if this insn should be deleted.  */
unsigned int will_delete : 1;
  };
 @@ -1468,14 +1475,6 @@ handle_special_swappables (swap_web_entry 
 *insn_entry, unsigned i)
if (dump_file)
fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
break;
 -case SH_NOSWAP_LD:
 -  /* Convert a non-permuting load to a permuting one.  */
 -  permute_load (insn);
 -  break;
 -case SH_NOSWAP_ST:
 -  /* Convert a non-permuting store to a permuting one.  */
 -  permute_store (insn);
 -  break;
  case SH_EXTRACT:
/* Change the lane on an extract operation.  */
adjust_extract (insn);
 @@ -2401,6 +2400,25 @@ recombine_lvx_stvx_patterns (function *fun)
free (to_delete);
  }
  
 +/* Return true if insn is a non-permuting load/store.  */
 +static bool
 +non_permuting_mem_insn (swap_web_entry *insn_entry, unsigned int i)
 +{
 +  return insn_entry[i].special_handling == SH_NOSWAP_LD
 +   || 

RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

2024-02-27 Thread Tamar Christina
> Thanks Tamar.
> 
> > Those two cases also *completely* stop vectorization because of either the
> > control flow or the fact the vectorizer can't handle complex types.
> 
> Yes, we eventually would like to vectorize the SAT ALU but we start with 
> scalar part
> first.
> I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works
> well with some additions as below.
> Feel free to correct me if any misunderstandings.
> 
> 1. usadd$Q$a3 are restricted to fixed point and we need to change it to
> usadd$a3(as well as gen_int_libfunc) for int.
> 2. We need to implement a default implementation of SAT_ADD if
> direct_binary_optab_supported_p is false.
> It looks like the default implementation is difficult to make every 
> backend happy.
> That is why you suggest just normal
> DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.
> 
> Thanks Richard.
> 
> > But what I'd like to see is that we do more instruction selection on GIMPLE
> > but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> > passes doing what I'd call instruction selection).  But that means not 
> > adding
> > match.pd patterns for that or at least have a separate isel-match.pd
> > machinery for that.
> 
> > So as a start I would go for a direct optab and see to recognize it during
> > ISEL?
> 
> Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good
> to know isel and I am happy to
> try that once we have conclusion.
> 

So after a lively discussion on IRC, the conclusion is that before we proceed 
Richi would
like to see some examples of various operations.  The problem is that unsigned 
saturating
addition is the simplest example and it may lead to an implementation strategy 
that doesn't
scale.

So I'd suggest writing some examples of both signed and unsigned saturating add 
and multiply,

because signed addition will likely require a branch and signed multiplication 
would require a
larger type.

This would allow us to better understand what kind of gimple we would have to 
deal with in
ISEL and VECT if we decide not to lower early.

Thanks,
Tamar

> Pan
> 
> -Original Message-
> From: Tamar Christina 
> Sent: Tuesday, February 27, 2024 5:57 PM
> To: Richard Biener 
> Cc: Li, Pan2 ; gcc-patches@gcc.gnu.org; 
> juzhe.zh...@rivai.ai;
> Wang, Yanzhang ; kito.ch...@gmail.com;
> richard.sandiford@arm.com2; jeffreya...@gmail.com
> Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> US_PLUS
> 
> > -Original Message-
> > From: Richard Biener 
> > Sent: Tuesday, February 27, 2024 9:44 AM
> > To: Tamar Christina 
> > Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai;
> > yanzhang.w...@intel.com; kito.ch...@gmail.com;
> > richard.sandiford@arm.com2; jeffreya...@gmail.com
> > Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> > US_PLUS
> >
> > On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
> >  wrote:
> > >
> > > Hi Pan,
> > >
> > > > From: Pan Li 
> > > >
> > > > Hi Richard & Tamar,
> > > >
> > > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > > >
> > > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > > >
> > >
> > > I think a couple of things are being confused here.  So lets break it 
> > > down:
> > >
> > > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > > we only want one internal function for both signed and unsigned SAT_ADD.
> > > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> > >
> > > Now the downside of this is that this is a direct internal optab.  This 
> > > means
> > > that for the representation to be used the target *must* have the optab
> > > implemented.   This is a bit annoying because it doesn't allow us to 
> > > generically
> > > assume that all targets use SAT_ADD for saturating add and thus only have 
> > > to
> > > write optimization for this representation.
> > >
> > > This is why Richi said we may need to use a new tree_code because we can
> > > override tree code expansions.  However the same can be done with the
> _EXT_FN
> > > internal functions.
> > >
> > > So what I meant was that we want to have a combination of the two. i.e. a
> > > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> >
> > Whether we want/need _EXT or only direct depends mainly on how we want to
> > leverage support.  If it's only during vectorization and possibly 
> > instruction
> > selection a direct optab is IMO the way to go.  Generic optimization only
> > marginally improves when you explode the number of basic operations you
> > expose - in fact it gets quite 

[PATCH] Fix internal error in GIMPLE DSE

2024-02-27 Thread Eric Botcazou
Hi,

this is a regression present on the mainline, 13 and 12 branches.  For the 
attached Ada case, it's a tree checking failure on the mainline at -O:

+===GNAT BUG DETECTED==+
| 14.0.1 20240226 (experimental) [master r14-9171-g4972f97a265]  GCC error:|
| tree check: expected tree that contains 'decl common' structure, |
| have 'component_ref' in tree_could_trap_p, at tree-eh.cc:2733|
| Error detected around /home/eric/cvs/gcc/gcc/testsuite/gnat.dg/opt104.adb:

Time is a 10-byte record and Packed_Rec.T is placed at bit-offset 65 because 
of the packing, so tree-ssa-dse.cc:setup_live_bytes_from_ref has computed a 
const_size of 88 from ref->offset of 65 and ref->max_size of 80.

Then in tree-ssa-dse.cc:compute_trims:

411   int last_live = bitmap_last_set_bit (live);
(gdb) next
412   if (ref->size.is_constant (&const_size))
(gdb) 
414   int last_orig = (const_size / BITS_PER_UNIT) - 1;
(gdb) 
418   *trim_tail = last_orig - last_live;

(gdb) call debug_bitmap (live)
n_bits = 256, set = {0 1 2 3 4 5 6 7 8 9 10 }
(gdb) p last_live
$33 = 10
(gdb) p const_size
$34 = 80
(gdb) p last_orig
$35 = 9
(gdb) p *trim_tail
$36 = -1

In other words, compute_trims is overlooking the alignment adjustments applied 
earlier by setup_live_bytes_from_ref.  Moreover it reads:

  /* We use sbitmaps biased such that ref->offset is bit zero and the bitmap
 extends through ref->size.  So we know that in the original bitmap
 bits 0..ref->size were true.  We don't actually need the bitmap, just
 the REF to compute the trims.  */

But setup_live_bytes_from_ref used ref->max_size instead of ref->size.

It appears that all the callers of compute_trims assume that ref->offset is 
byte aligned and that the trimmed bytes are relative to ref->size, so the 
patch simply adds an early return if either condition is not fulfilled.

Tested on x86-64/Linux, OK for all the affected branches?


2024-02-27  Eric Botcazou  

* tree-ssa-dse.cc (compute_trims): Fix description.  Return early
if ref->offset is not byte aligned or ref->size is not known to be
equal to ref->max_size.
(maybe_trim_complex_store): Fix description.
(maybe_trim_constructor_store): Likewise.
(maybe_trim_partially_dead_store): Likewise.


2024-02-27  Eric Botcazou  

* gnat.dg/opt104.ads, gnat.dg/opt104.adb: New test.

-- 
Eric Botcazou
diff --git a/gcc/tree-ssa-dse.cc b/gcc/tree-ssa-dse.cc
index 81b65125409..5869010287c 100644
--- a/gcc/tree-ssa-dse.cc
+++ b/gcc/tree-ssa-dse.cc
@@ -403,11 +403,11 @@ setup_live_bytes_from_ref (ao_ref *ref, sbitmap live_bytes)
   return false;
 }
 
-/* Compute the number of elements that we can trim from the head and
-   tail of ORIG resulting in a bitmap that is a superset of LIVE.
+/* Compute the number of stored bytes that we can trim from the head and
+   tail of REF.  LIVE is the bitmap of stores to REF that are still live.
 
-   Store the number of elements trimmed from the head and tail in
-   TRIM_HEAD and TRIM_TAIL.
+   Store the number of bytes trimmed from the head and tail in TRIM_HEAD
+   and TRIM_TAIL respectively.
 
STMT is the statement being trimmed and is used for debugging dump
output only.  */
@@ -416,10 +416,16 @@ static void
 compute_trims (ao_ref *ref, sbitmap live, int *trim_head, int *trim_tail,
 	   gimple *stmt)
 {
-  /* We use sbitmaps biased such that ref->offset is bit zero and the bitmap
- extends through ref->size.  So we know that in the original bitmap
- bits 0..ref->size were true.  We don't actually need the bitmap, just
- the REF to compute the trims.  */
+  *trim_head = 0;
+  *trim_tail = 0;
+
+  /* We use bitmaps biased such that ref->offset is contained in bit zero and
+ the bitmap extends through ref->max_size and we know that in the original
+ bitmap bits 0 .. ref->max_size were true.  But we need to check that this
+ covers exactly the bytes of REF.  */
+  const unsigned int align = known_alignment (ref->offset);
+  if ((align && align < BITS_PER_UNIT) || !known_eq (ref->size, ref->max_size))
+return;
 
   /* Now identify how much, if any of the tail we can chop off.  */
   HOST_WIDE_INT const_size;
@@ -444,8 +450,6 @@ compute_trims (ao_ref *ref, sbitmap live, int *trim_head, int *trim_tail,
 			   last_orig) <= 0)
 	*trim_tail = 0;
 }
-  else
-*trim_tail = 0;
 
   /* Identify how much, if any of the head we can chop off.  */
   int first_orig = 0;
@@ -503,8 +507,7 @@ compute_trims (ao_ref *ref, sbitmap live, int *trim_head, int *trim_tail,
 	}
 }
 
-  if ((*trim_head || *trim_tail)
-  && dump_file && (dump_flags & TDF_DETAILS))
+  if ((*trim_head || *trim_tail) && dump_file && (dump_flags & TDF_DETAILS))
 {
   fprintf (dump_file, "  Trimming statement (head = %d, tail = %d): ",
 	   *trim_head, *trim_tail);
@@ -513,9 +516,9 @@ compute_trims (ao_ref *ref, sbitmap live, int 

RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation US_PLUS

2024-02-27 Thread Li, Pan2
Thanks Tamar.

> Those two cases also *completely* stop vectorization because of either the
> control flow or the fact the vectorizer can't handle complex types.

Yes, we eventually would like to vectorize the SAT ALU but we start with scalar 
part first.
I tried the DEF_INTERNAL_SIGNED_OPTAB_EXT_FN as your suggestion. It works well 
with some additions as below.
Feel free to correct me if any misunderstandings.

1. usadd$Q$a3 are restricted to fixed point and we need to change it to 
usadd$a3(as well as gen_int_libfunc) for int.
2. We need to implement a default implementation of SAT_ADD if 
direct_binary_optab_supported_p is false.
It looks like the default implementation is difficult to make every backend 
happy. That is why you suggest just normal
DEF_INTERNAL_SIGNED_OPTAB_FN in another thread.

Thanks Richard.

> But what I'd like to see is that we do more instruction selection on GIMPLE
> but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> passes doing what I'd call instruction selection).  But that means not adding
> match.pd patterns for that or at least have a separate isel-match.pd
> machinery for that.

> So as a start I would go for a direct optab and see to recognize it during
> ISEL?

Looks we have sorts of SAT alu like PLUS/MINUS/MULT/DIV/SHIFT/NEG/ABS, good to 
know isel and I am happy to
try that once we have conclusion.

Pan

-Original Message-
From: Tamar Christina  
Sent: Tuesday, February 27, 2024 5:57 PM
To: Richard Biener 
Cc: Li, Pan2 ; gcc-patches@gcc.gnu.org; 
juzhe.zh...@rivai.ai; Wang, Yanzhang ; 
kito.ch...@gmail.com; richard.sandiford@arm.com2; jeffreya...@gmail.com
Subject: RE: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation 
US_PLUS

> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, February 27, 2024 9:44 AM
> To: Tamar Christina 
> Cc: pan2...@intel.com; gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai;
> yanzhang.w...@intel.com; kito.ch...@gmail.com;
> richard.sandiford@arm.com2; jeffreya...@gmail.com
> Subject: Re: [PATCH v2] Draft|Internal-fn: Introduce internal fn saturation
> US_PLUS
> 
> On Sun, Feb 25, 2024 at 10:01 AM Tamar Christina
>  wrote:
> >
> > Hi Pan,
> >
> > > From: Pan Li 
> > >
> > > Hi Richard & Tamar,
> > >
> > > Try the DEF_INTERNAL_INT_EXT_FN as your suggestion.  By mapping
> > > us_plus$a3 to the RTL representation (us_plus:m x y) in optabs.def.
> > > And then expand_US_PLUS in internal-fn.cc.  Not very sure if my
> > > understanding is correct for DEF_INTERNAL_INT_EXT_FN.
> > >
> > > I am not sure if we still need DEF_INTERNAL_SIGNED_OPTAB_FN here, given
> > > the RTL representation has (ss_plus:m x y) and (us_plus:m x y) already.
> > >
> >
> > I think a couple of things are being confused here.  So lets break it down:
> >
> > The reason for DEF_INTERNAL_SIGNED_OPTAB_FN is because in GIMPLE
> > we only want one internal function for both signed and unsigned SAT_ADD.
> > with this definition we don't need SAT_UADD and SAT_SADD but instead
> > we will only have SAT_ADD, which will expand to us_plus or ss_plus.
> >
> > Now the downside of this is that this is a direct internal optab.  This 
> > means
> > that for the representation to be used the target *must* have the optab
> > implemented.   This is a bit annoying because it doesn't allow us to 
> > generically
> > assume that all targets use SAT_ADD for saturating add and thus only have to
> > write optimization for this representation.
> >
> > This is why Richi said we may need to use a new tree_code because we can
> > override tree code expansions.  However the same can be done with the 
> > _EXT_FN
> > internal functions.
> >
> > So what I meant was that we want to have a combination of the two. i.e. a
> > DEF_INTERNAL_SIGNED_OPTAB_EXT_FN.
> 
> Whether we want/need _EXT or only direct depends mainly on how we want to
> leverage support.  If it's only during vectorization and possibly instruction
> selection a direct optab is IMO the way to go.  Generic optimization only
> marginally improves when you explode the number of basic operations you
> expose - in fact it gets quite unwieldly to support all of them in
> simplifications
> and/or canonicalization and you possibly need to translate them back to what
> the target CPU supports.
> 
> We already do have too many (IMO) "special" operations exposed "early"
> in the GIMPLE pipeline.
> 
> But what I'd like to see is that we do more instruction selection on GIMPLE
> but _late_ (there's the pass_optimize_widening_mul and pass_gimple_isel
> passes doing what I'd call instruction selection).  But that means not adding
> match.pd patterns for that or at least have a separate isel-match.pd
> machinery for that.
> 
> So as a start I would go for a direct optab and see to recognize it during
> ISEL?
> 

The problem with ISEL and the reason I suggested an indirect IFN is that there
are benefits to be had from recognizing it early.  Saturating arithmetic can be 
optimized
Differently from 

Re: [PATCH V2] rs6000: Don't allow immediate value in the vsx_splat pattern [PR113950]

2024-02-27 Thread Segher Boessenkool
Hi!

On Tue, Feb 27, 2024 at 02:02:38AM +0530, jeevitha wrote:
> There is no immediate value splatting instruction in Power. Currently, those
> values need to be stored in a register or memory. To address this issue, I
> have updated the predicate for the second operand in vsx_splat to
> splat_input_operand and corrected the assignment of op1 to operands[1].
> These changes ensure that operand1 is stored in a register.

input_operand allows a lot of things that splat_input_operand does not,
not just immediate operands.  NAK.

(For example, *all* memory is okay for input_operand, always).

I'm not saying we do not want to restrict these things, but a commit
that doesn't discuss this at all is not okay.  Sorry.


Segher


[PATCH 0/3, RFC] fsra: Add final gimple sra before expander

2024-02-27 Thread Jiufu Guo
Hi,

As known there are a few PRs (meta-bug PR101926) about
accessing aggregate param/returns which are passed through registers.

Given the suggestion from: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637935.html:
We could even use the actual SRA pass in a special mode right before
RTL expansion for the incoming/outgoing part.

Compared to other solutions (e.g. previous light-sra-in-expander), this
method could decouple the different parts (gimple-sra and rtl-expand),
and could leverage the current SRA maximum.

The following patches implement a prototype of this idea.

In this prototype, only "parameters and returns" are treated as 'sra'
candidates.  If a 'parameter' is scalarized, then an IFN_ARG_PART is 
generated for the access at the beginning of the function, and if an
access of a 'return' is scalarized, then IFN_SET_RET is generated 
for it.  Those IFNs are expanded according to the incoming/outgoing
registers for the accesses.

Bootstrapped/regtested on ppc64{,le} and x86_64.


In this prototype, there are still a few areas which can be enhanced,
like:
- Access multi-registers in one stmt,
- Arg access across function calls,
- More special target/ABI behavior,
...

I would like to ask for comments/suggestions before jumping into depth,
to ensure this is in the correct direction, and to avoid missing some
important things.

One thing/concern in this implementation:
For an aggregate parameter, if it is not passed through registers,
there is no need to scalarize in this sra.
For example like i386/pr101908-3.c,
Without sra, the stmts look like this:
 bar ();
 vect__1.5_10 = MEM  [(double *)];

With sra, the stmts look like this:
 x_1 = .ARG_PART (x, 0, 128);
 bar ();
 vect__1.5_10 = x_1;

The issue is that there are no instructions before invoking
'bar ()' without the patch; with the patch, instructions may be generated
before 'bar ()' and those insns would not be easy for RTL passes to
optimize.
This would not be hard to fix (but maybe hacky):
- Let the 'sra' pass know whether the access is in a
  register, then avoid generating 'ARG_PART'. This would introduce coupling
  between gimple sra and rtl.
- When expanding 'ARG_PART', if the access is in mem, then defer it.
  This would mean 'ARG_PART' may expand to nothing and make the dumped rtl
  a little confusing.
Any comments?

This prototype is split into three patches for review.
1/3: Add final gimple sra just before expander
2/3: Add support for ARG_PARTS
3/3: Add support for RET_PARTS

Thanks for your comments and suggestions!

BR,
Jeff (Jiufu Guo)


Re: [patch] OpenACC: Add Fortran routines acc_{alloc,free,hostptr,deviceptr,memcpy_{to,from}_device*}

2024-02-27 Thread Tobias Burnus

Hi Thomas,

(Regarding 'call acc_attach(x)' – the problem is that one needs the 
address of '&x' and 'x'; while 'x' is readily available, for '&x' no 
temporary variable has to get involved – and there are plenty of ways 
temporaries can get introduced; for most cases, an interface exists that 
prevents this but they are mutually exclusive. Hence, this needs support 
in the FE. The simplest workaround for a user is to use '!$acc attach' 
instead ...)


Thomas Schwinge:

  @table @asis
  @item @emph{Description}
-This function allocates @var{len} bytes of device memory. It returns
+This function allocates @var{bytes} of device memory. It returns

Not '@var{bytes} {+bytes+}' or similar?


I think either works – depending how one parses @var{} mentally, 
one of the variants sounds smooth and the other very odd. But I can/will 
change it.



--- a/libgomp/openacc.f90
+++ b/libgomp/openacc.f90

Assuming that 'module openacc_internal' currently is sorted per
appearance in the OpenACC specification (?), I suggest we continue to do
so.  (..., like in 'openacc_lib.h', too.)
I will check – it looks only block-wise sorted but I might be wrong. I 
followed the location of the comments, placing it before the routines that 
followed the comment, assuming that the comments were at the right spot.

@@ -794,6 +881,9 @@ module openacc
...
+  public :: acc_malloc, acc_free, acc_map_data, acc_unmap_data, acc_deviceptr
+  public :: acc_hostptr, acc_memcpy_to_device, acc_memcpy_to_device_async
+  public :: acc_memcpy_from_device, acc_memcpy_from_device_async
  ...
-  ! acc_malloc: Only available in C/C++
-  ! acc_free: Only available in C/C++
-
...
interface acc_is_present
  procedure :: acc_is_present_32_h
  procedure :: acc_is_present_64_h
  procedure :: acc_is_present_array_h
end interface

Is that now a different style that we're not listing the new interfaces
in 'module openacc' here?


As there is no precedent for this type of interface, the style is by 
nature different. But the question is which style is better. The 
current 'openacc' is very short – and contains not a single specific 
interface, but only generic interfaces. The actual specific-procedure 
declarations are only in 'openacc_internal'.


Those new procedures are the first ones that do not have a generic 
interface and only a specific one. Thus, one can either put the specific 
one into 'openacc_internal' and refer it from 'openacc' (via 'use 
openacc_internal' + 'public :: acc_') – or place the 
interface directly into 'openacc' (and not touching 'openacc_internal' 
at all).


During development, I accidentally had a mixture of both - and 
then settled for the current variant. – Possibly, moving the interface 
to 'openacc' is clearer?


Thoughts?


--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/acc_host_device_ptr.f90
[...]
+! Fortran version of libgomp.oacc-c-c++-common/lib-59.c

I like to also put a cross reference into the originating C/C++ test
case, so that anyone adjusting either one also is aware that another one
may need adjusting, too.

OK - I will do so.

+  ! The following assumes sizeof(void*) being the same on host and device:

That's generally required anyway.


I have to admit that I don't know OpenACC well enough to see whether 
that's the case or not. And, while I am not very consistent, I do try to 
document stricter requirements / implementation-specific parts in 
testcases.


I know that OpenMP permits that the pointer size differs and 'void *p = 
omp_target_alloc (...);' might in this case not return the device 
pointer but a handle to the device ptr. (For instance, it could be a 
pointer to a uint128_t variable for a 128bit device pointer; I think 
such hardware exists in reality - and uses several bits for other 
purposes like flags.)


In that case, host-side pointer arithmetic won't work and 
'is_device_ptr' clauses etc. need to do transfer work.


But, admittedly, in GCC it is assumed in many places that both 
sides use the same pointer size* and also during specification 
development, everyone implicitly assumes that routines and clauses yield 
bare device pointers and not some opaque pointer to the actual data (a 
handle); hence, one has to keep reminding oneself that the spec permits 
systems where that's not the case.


Tobias

(* There are a few spots which handle a smaller device pointer than the 
host pointer or consider a different size but that's not done very 
consistently and largely lacking.)





Re: [PATCH] i386: For noreturn functions save at least the bp register if it is used [PR114116]

2024-02-27 Thread Jakub Jelinek
On Tue, Feb 27, 2024 at 10:13:14AM +0100, Jakub Jelinek wrote:
> For __libc_start_main, glibc surely just could use no_callee_saved_registers
> attribute, because that is typically the outermost frame in backtrace,
> there is no need to save those there.
> And for kernel if it really wants it and nothing will use the backtraces,
> perhaps the patch wouldn't need to be reverted completely but just guarded
> the implicit no_callee_saved_registers treatment of noreturn
> functions on -mcmodel=kernel or -fno-asynchronous-unwind-tables.

Guarding on -fno-asynchronous-unwind-tables isn't a good idea,
with just -g we emit in that case unwind info in .debug_frame section
and even that shouldn't break, and we shouldn't generate different code for
-g vs. -g0.
The problem with the changes is that it breaks the unwinding and debugging
experience not just in the functions on which the optimization triggers,
but on all functions in the backtrace as well.

So, IMHO either revert the changes altogether, or guard on -mcmodel=kernel
(but talk to kernel people on linux-toolchains if that is what they actually
want).

Jakub



Re: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU

2024-02-27 Thread Georg-Johann Lay




Am 27.02.24 um 12:15 schrieb Tamar Christina:

Am 19.02.24 um 08:36 schrieb Richard Biener:

On Sat, Feb 17, 2024 at 11:30 AM  wrote:


From: Pan Li 

This patch would like to add the middle-end presentation for the
unsigned saturation add.  Aka set the result of add to the max
when overflow.  It will take the pattern similar as below.

SAT_ADDU (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))


Does this even try to work out the costs?

For example, with the following example


#define T __UINT16_TYPE__

T sat_add1 (T x, T y)
{
return (x + y) | (- (T)((T)(x + y) < x));
}

T sat_add2 (T x, T y)
{
  T z = x + y;
  if (z < x)
  z = (T) -1;
  return z;
}

And then "avr-gcc -S -Os -dp" the code is


sat_add1:
add r22,r24  ;  7   [c=8 l=2]  *addhi3/0
adc r23,r25
ldi r18,lo8(1)   ;  8   [c=4 l=2]  *movhi/4
ldi r19,0
cp r22,r24   ;  9   [c=8 l=2]  cmphi3/2
cpc r23,r25
brlo .L2 ;  10  [c=16 l=1]  branch
ldi r19,0;  31  [c=4 l=1]  movqi_insn/0
ldi r18,0;  32  [c=4 l=1]  movqi_insn/0
.L2:
clr r24  ;  13  [c=12 l=4]  neghi2/1
clr r25
sub r24,r18
sbc r25,r19
or r24,r22   ;  29  [c=4 l=1]  iorqi3/0
or r25,r23   ;  30  [c=4 l=1]  iorqi3/0
ret  ;  35  [c=0 l=1]  return

sat_add2:
add r22,r24  ;  8   [c=8 l=2]  *addhi3/0
adc r23,r25
cp r22,r24   ;  9   [c=8 l=2]  cmphi3/2
cpc r23,r25
brsh .L3 ;  10  [c=16 l=1]  branch
ldi r22,lo8(-1)  ;  5   [c=4 l=2]  *movhi/4
ldi r23,lo8(-1)
.L3:
mov r25,r23  ;  21  [c=4 l=1]  movqi_insn/0
mov r24,r22  ;  22  [c=4 l=1]  movqi_insn/0
ret  ;  25  [c=0 l=1]  return

i.e. the conditional jump is better than overly smart arithmetic
(smaller and faster code with less register pressure).
With larger types the difference is even more pronounced.



*on AVR. https://godbolt.org/z/7jaExbTa8  shows the branchless code is better.
And the branchy code will vectorize worse if at all 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51492


AVR is a GCC backend

https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=gcc/config/avr

and likely not the only backend where tricky arithmetic is more
expensive than branching more often than not.

Johann




But looking at that output it just seems like it's your expansion that's 
inefficient.

But fair point, perhaps it should be just a normal DEF_INTERNAL_SIGNED_OPTAB_FN 
so that we
provide the additional optimization only for targets that want it.

Tamar




[PATCH 3/3, RFC] fsra: support SET_RET_PART

2024-02-27 Thread Jiufu Guo
This patch adds IFN_SET_RET_PARTS, and generates this IFN for the accesses of
the 'returns' in the fsra pass.  The IFN is expanded according to the outgoing
registers of the 'return'.  "fsra" is tuned for the access analysis of
'returns'.

'IFN_SET_RET_LAST_PARTS' is just for this prototype, it helps to
reuse the decl information of the 'return var'.  With enhancing the
implementation, this IFN may be removed.

PR target/65421
PR target/69143

gcc/ChangeLog:

* cfgexpand.cc (expand_value_return): Update.
(expand_return): Update for returns expand.
* internal-fn.cc (store_outgoing_element): New function.
(expand_SET_RET_PARTS): New IFN expand function.
(expand_SET_RET_LAST_PARTS): New IFN expand function.
* internal-fn.def (SET_RET_PARTS): New IFN.
(SET_RET_LAST_PARTS): New IFN.
* tree-sra.cc (analyze_access_subtree): Update for returns in fsra.
(generate_subtree_copies): Generate IFN for returns.

gcc/testsuite/ChangeLog:

* gcc.target/powerpc/pr65421.c: New test.
* gcc.target/powerpc/pr69143.c: New test.

---
 gcc/cfgexpand.cc   |  6 +-
 gcc/internal-fn.cc | 84 ++
 gcc/internal-fn.def|  6 ++
 gcc/tree-sra.cc| 39 --
 gcc/testsuite/gcc.target/powerpc/pr65421.c | 10 +++
 gcc/testsuite/gcc.target/powerpc/pr69143.c | 23 ++
 6 files changed, 163 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr69143.c

diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
index eef565eddb5..1ec6c2d8102 100644
--- a/gcc/cfgexpand.cc
+++ b/gcc/cfgexpand.cc
@@ -3759,7 +3759,7 @@ expand_value_return (rtx val)
 
   tree decl = DECL_RESULT (current_function_decl);
   rtx return_reg = DECL_RTL (decl);
-  if (return_reg != val)
+  if (!rtx_equal_p (return_reg, val))
 {
   tree funtype = TREE_TYPE (current_function_decl);
   tree type = TREE_TYPE (decl);
@@ -3832,6 +3832,10 @@ expand_return (tree retval)
  been stored into it, so we don't have to do anything special.  */
   if (TREE_CODE (retval_rhs) == RESULT_DECL)
 expand_value_return (result_rtl);
+  /* return is scalarized by fsra: TODO use FLAG. */
+  else if (VAR_P (retval_rhs)
+  && rtx_equal_p (result_rtl, DECL_RTL (retval_rhs)))
+expand_null_return_1 ();
 
   /* If the result is an aggregate that is being returned in one (or more)
  registers, load the registers here.  */
diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index ee19e155628..be06dc3a16c 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -3557,6 +3557,90 @@ expand_ARG_PARTS (internal_fn, gcall *stmt)
 }
 }
 
+static bool
+store_outgoing_element (rtx regs, HOST_WIDE_INT bitpos, HOST_WIDE_INT bitsize,
+   tree rhs)
+{
+  if (GET_CODE (regs) != PARALLEL)
+return false;
+
+  int start_index = -1;
+  int end_index = -1;
+  HOST_WIDE_INT left_bits = 0;
+  HOST_WIDE_INT right_bits = 0;
+  query_position_in_parallel (bitpos, bitsize, regs, start_index, end_index,
+ left_bits, right_bits);
+
+  if (start_index < 0 || end_index < 0)
+return false;
+
+  if (end_index != start_index)
+return false;
+
+  if (!((left_bits == 0 && !BITS_BIG_ENDIAN)
+   || (right_bits == 0 && BITS_BIG_ENDIAN)))
+return false;
+
+  /* Just need one reg for the access.  */
+  rtx dest = XEXP (XVECEXP (regs, 0, start_index), 0);
+  machine_mode mode = GET_MODE (dest);
+
+  if (left_bits != 0 || right_bits != 0)
+{
+  machine_mode small_mode;
+  if (!SCALAR_INT_MODE_P (mode)
+ || !mode_for_size (bitsize, GET_MODE_CLASS (mode), 0)
+   .exists (_mode))
+   return false;
+
+  dest = gen_lowpart (small_mode, dest);
+  mode = small_mode;
+}
+
+  rtx src = expand_expr (rhs, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  if (!src)
+return false;
+
+  machine_mode src_mode = GET_MODE (src);
+  if (mode != src_mode)
+src = gen_lowpart (mode, src);
+
+  emit_move_insn (dest, src);
+
+  return true;
+}
+
+static void
+expand_SET_RET_PARTS (internal_fn, gcall *stmt)
+{
+  HOST_WIDE_INT offset = tree_to_shwi (gimple_call_arg (stmt, 1));
+  HOST_WIDE_INT size = tree_to_shwi (gimple_call_arg (stmt, 2));
+  tree decl = DECL_RESULT (current_function_decl);
+  rtx dest_regs = decl->decl_with_rtl.rtl; // DECL_RTL (base);
+  tree rhs = gimple_call_arg (stmt, 3);
+  bool res = store_outgoing_element (dest_regs, offset, size, rhs);
+  if (!res)
+{
+  tree base = gimple_call_arg (stmt, 0);
+  tree lhs = gimple_call_lhs (stmt);
+  expand_assignment (base, decl, false);
+  expand_assignment (lhs, rhs, false);
+  expand_assignment (decl, base, false);
+}
+}
+
+static void
+expand_SET_RET_LAST_PARTS (internal_fn, gcall *stmt)
+{
+  expand_SET_RET_PARTS 

[PATCH 8/8] libstdc++: Do not define lock-free atomic aliases if not fully lock-free [PR114103]

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. I think we should make this change, because
otherwise we define the typedefs for platforms with no lock-free
atomics, like hppa-hpux. Instead of lying, those typedefs should be
absent on that target.

-- >8 --

libstdc++-v3/ChangeLog:

PR libstdc++/114103
* include/bits/version.def (atomic_lock_free_type_aliases): Add
extra_cond to check for at least one always-lock-free type.
* include/bits/version.h: Regenerate.
* include/std/atomic (atomic_signed_lock_free)
(atomic_unsigned_lock_free): Only use always-lock-free types.
---
 libstdc++-v3/include/bits/version.def | 1 +
 libstdc++-v3/include/bits/version.h   | 2 +-
 libstdc++-v3/include/std/atomic   | 6 +++---
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/libstdc++-v3/include/bits/version.def 
b/libstdc++-v3/include/bits/version.def
index 502961eb269..d298420121b 100644
--- a/libstdc++-v3/include/bits/version.def
+++ b/libstdc++-v3/include/bits/version.def
@@ -739,6 +739,7 @@ ftms = {
   values = {
 v = 201907;
 cxxmin = 20;
+extra_cond = "(__GCC_ATOMIC_INT_LOCK_FREE | __GCC_ATOMIC_LONG_LOCK_FREE | 
__GCC_ATOMIC_CHAR_LOCK_FREE) & 2";
   };
 };
 
diff --git a/libstdc++-v3/include/bits/version.h 
b/libstdc++-v3/include/bits/version.h
index 7a6fbd35e2e..9107b45a484 100644
--- a/libstdc++-v3/include/bits/version.h
+++ b/libstdc++-v3/include/bits/version.h
@@ -819,7 +819,7 @@
 #undef __glibcxx_want_atomic_float
 
 #if !defined(__cpp_lib_atomic_lock_free_type_aliases)
-# if (__cplusplus >= 202002L)
+# if (__cplusplus >= 202002L) && ((__GCC_ATOMIC_INT_LOCK_FREE | 
__GCC_ATOMIC_LONG_LOCK_FREE | __GCC_ATOMIC_CHAR_LOCK_FREE) & 2)
 #  define __glibcxx_atomic_lock_free_type_aliases 201907L
 #  if defined(__glibcxx_want_all) || 
defined(__glibcxx_want_atomic_lock_free_type_aliases)
 #   define __cpp_lib_atomic_lock_free_type_aliases 201907L
diff --git a/libstdc++-v3/include/std/atomic b/libstdc++-v3/include/std/atomic
index 559f8370459..1462cf5ec23 100644
--- a/libstdc++-v3/include/std/atomic
+++ b/libstdc++-v3/include/std/atomic
@@ -1774,13 +1774,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 = atomic>;
   using atomic_unsigned_lock_free
 = atomic>;
-# elif ATOMIC_INT_LOCK_FREE || !(ATOMIC_LONG_LOCK_FREE || 
ATOMIC_CHAR_LOCK_FREE)
+# elif ATOMIC_INT_LOCK_FREE == 2
   using atomic_signed_lock_free = atomic;
   using atomic_unsigned_lock_free = atomic;
-# elif ATOMIC_LONG_LOCK_FREE
+# elif ATOMIC_LONG_LOCK_FREE == 2
   using atomic_signed_lock_free = atomic;
   using atomic_unsigned_lock_free = atomic;
-# elif ATOMIC_CHAR_LOCK_FREE
+# elif ATOMIC_CHAR_LOCK_FREE == 2
   using atomic_signed_lock_free = atomic;
   using atomic_unsigned_lock_free = atomic;
 # else
-- 
2.43.0



[PATCH 1/3, RFC] fsra: Add final gimple sra just before expander

2024-02-27 Thread Jiufu Guo
This patch adds a new mode for sra pass: "fsra".
This 'fsra' pass handles function parameters and returns as candidates.
It runs at the end of the GIMPLE pass sequence.

gcc/ChangeLog:

* passes.def: Add pass pass_sra_final.
* tree-pass.h (make_pass_sra_final): Declare make_pass_sra_final.
* tree-sra.cc (enum sra_mode): New enum item SRA_MODE_FINAL_INTRA.
(build_accesses_from_assign): Accept SRA_MODE_FINAL_INTRA.
(find_var_candidates): Collect candidates for SRA_MODE_FINAL_INTRA.
(final_intra_sra): New function.
(class pass_sra_final): New pass class.
(make_pass_sra_final): New function.

---
 gcc/passes.def  |  2 ++
 gcc/tree-pass.h |  1 +
 gcc/tree-sra.cc | 81 +
 3 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/gcc/passes.def b/gcc/passes.def
index 1cbbd413097..183c1becd65 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -449,6 +449,8 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_harden_conditional_branches);
   NEXT_PASS (pass_harden_compares);
   NEXT_PASS (pass_warn_access, /*early=*/false);
+  NEXT_PASS (pass_sra_final);
+
   NEXT_PASS (pass_cleanup_cfg_post_optimizing);
   NEXT_PASS (pass_warn_function_noreturn);
 
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 29267589eeb..2d0e12bd1bb 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -366,6 +366,7 @@ extern gimple_opt_pass *make_pass_early_tree_profile 
(gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_cleanup_eh (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_sra (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_sra_early (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_sra_final (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tail_recursion (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tail_calls (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_fix_loops (gcc::context *ctxt);
diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc
index f8e71ec48b9..aacc76f58b5 100644
--- a/gcc/tree-sra.cc
+++ b/gcc/tree-sra.cc
@@ -21,14 +21,16 @@ along with GCC; see the file COPYING3.  If not see
 .  */
 
 /* This file implements Scalar Reduction of Aggregates (SRA).  SRA is run
-   twice, once in the early stages of compilation (early SRA) and once in the
-   late stages (late SRA).  The aim of both is to turn references to scalar
-   parts of aggregates into uses of independent scalar variables.
+   three times, once in the early stages of compilation (early SRA) and once
+   in the late stages (late SRA).  The aim of them is to turn references to
+   scalar parts of aggregates into uses of independent scalar variables.
 
-   The two passes are nearly identical, the only difference is that early SRA
+   The three passes are nearly identical, the difference are that early SRA
does not scalarize unions which are used as the result in a GIMPLE_RETURN
statement because together with inlining this can lead to weird type
-   conversions.
+   conversions.  The third pass is more care about parameters and returns,
+   it would be helpful for the parameters and returns which are passed through
+   registers.
 
Both passes operate in four stages:
 
@@ -104,6 +106,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Enumeration of all aggregate reductions we can do.  */
 enum sra_mode { SRA_MODE_EARLY_IPA,   /* early call regularization */
SRA_MODE_EARLY_INTRA, /* early intraprocedural SRA */
+   SRA_MODE_FINAL_INTRA, /* final gimple intraprocedural SRA */
SRA_MODE_INTRA }; /* late intraprocedural SRA */
 
 /* Global variable describing which aggregate reduction we are performing at
@@ -1437,7 +1440,8 @@ build_accesses_from_assign (gimple *stmt)
 }
 
   if (lacc && racc
-  && (sra_mode == SRA_MODE_EARLY_INTRA || sra_mode == SRA_MODE_INTRA)
+  && (sra_mode == SRA_MODE_EARLY_INTRA || sra_mode == SRA_MODE_INTRA
+ || sra_mode == SRA_MODE_FINAL_INTRA)
   && !lacc->grp_unscalarizable_region
   && !racc->grp_unscalarizable_region
   && AGGREGATE_TYPE_P (TREE_TYPE (lhs))
@@ -2149,6 +2153,24 @@ find_var_candidates (void)
parm = DECL_CHAIN (parm))
 ret |= maybe_add_sra_candidate (parm);
 
+  /* fsra only care about parameters and returns */
+  if (sra_mode == SRA_MODE_FINAL_INTRA)
+{
+  if (!DECL_RESULT (current_function_decl))
+   return ret;
+
+  edge_iterator ei;
+  edge e;
+  FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+   if (greturn *r = safe_dyn_cast (*gsi_last_bb (e->src)))
+ {
+   tree val = gimple_return_retval (r);
+   if (val && VAR_P (val))
+ ret |= maybe_add_sra_candidate (val);
+ }
+  return ret;
+}
+
   FOR_EACH_LOCAL_DECL (cfun, i, var)
 {
   if (!VAR_P (var))
@@ -5017,6 +5039,14 @@ late_intra_sra (void)
   

[PATCH 3/8] libstdc++: Fix std::print for Cygwin

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. I am unable to test this on Cygwin myself. Testing
and reviews invited.

-- >8 --

Cygwin should use std::fwrite, not WriteConsoleW. And the -lstdc++exp
library is only needed when running the tests on *-*-mingw*.

libstdc++-v3/ChangeLog:

* include/std/ostream (vprint_unicode) [__CYGWIN__]: Use POSIX
code path for Cygwin instead of Windows.
* include/std/print (vprint_unicode) [__CYGWIN__]: Likewise.
* testsuite/27_io/basic_ostream/print/1.cc: Only add -lstdc++exp
for *-*-mingw* targets.
* testsuite/27_io/print/1.cc: Likewise.
---
 libstdc++-v3/include/std/ostream  | 4 ++--
 libstdc++-v3/include/std/print| 2 +-
 libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc | 2 +-
 libstdc++-v3/testsuite/27_io/print/1.cc   | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/libstdc++-v3/include/std/ostream b/libstdc++-v3/include/std/ostream
index 7d501d67489..a136399ad0b 100644
--- a/libstdc++-v3/include/std/ostream
+++ b/libstdc++-v3/include/std/ostream
@@ -906,7 +906,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   inline void
   vprint_unicode(ostream& __os, string_view __fmt, format_args __args)
   {
-#ifndef _WIN32
+#if !defined(_WIN32) || defined(__CYGWIN__)
 // For most targets we don't need to do anything special to write
 // Unicode to a terminal.
 std::vprint_nonunicode(__os, __fmt, __args);
@@ -923,7 +923,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
// If stream refers to a terminal, write a Unicode string to it.
if (auto __term = __open_terminal(__os.rdbuf()))
  {
-#ifndef _WIN32
+#if !defined(_WIN32) || defined(__CYGWIN__)
// For POSIX, __open_terminal(streambuf*) uses fdopen to open a
// new file, so we would need to close it here. This code is not
// actually compiled because it's inside an #ifdef _WIN32 group,
diff --git a/libstdc++-v3/include/std/print b/libstdc++-v3/include/std/print
index 492f333dfa6..d44033469de 100644
--- a/libstdc++-v3/include/std/print
+++ b/libstdc++-v3/include/std/print
@@ -64,7 +64,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   inline void
   vprint_unicode(FILE* __stream, string_view __fmt, format_args __args)
   {
-#ifndef _WIN32
+#if !defined(_WIN32) || defined(__CYGWIN__)
 // For most targets we don't need to do anything special to write
 // Unicode to a terminal.
 std::vprint_nonunicode(__stream, __fmt, __args);
diff --git a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc 
b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
index b3abc570d1e..71a4daa04c9 100644
--- a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
+++ b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
@@ -1,4 +1,4 @@
-// { dg-options "-lstdc++exp" }
+// { dg-additional-options "-lstdc++exp" { target { *-*-mingw* } } }
 // { dg-do run { target c++23 } }
 // { dg-require-fileio "" }
 
diff --git a/libstdc++-v3/testsuite/27_io/print/1.cc 
b/libstdc++-v3/testsuite/27_io/print/1.cc
index 3cfdac1bb74..6a294e0454b 100644
--- a/libstdc++-v3/testsuite/27_io/print/1.cc
+++ b/libstdc++-v3/testsuite/27_io/print/1.cc
@@ -1,4 +1,4 @@
-// { dg-options "-lstdc++exp" }
+// { dg-additional-options "-lstdc++exp" { target { *-*-mingw* } } }
 // { dg-do run { target c++23 } }
 // { dg-require-fileio "" }
 
-- 
2.43.0



[PATCH 5/8] libstdc++: Consistently use noexcept, constexpr, nodiscard on bitmask ops

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. Reviews invited.

-- >8 --

The bitwise operators for combining bitmask types such as std::launch
are not consistently annotated with noexcept, constexpr, and nodiscard.

This is the subject of LWG 3977, although the proposed resolution
doesn't work. We can make the changes in libstdc++ anyway though.

libstdc++-v3/ChangeLog:

* include/bits/atomic_base.h (operator|, operator&): Add
noexcept.
* include/bits/fs_fwd.h (operator&, operator|, operator^)
(operator~): Add nodiscard to overloads for copy_options, perms,
perm_options, and directory_options.
* include/bits/ios_base.h (operator&, operator|, operator^)
(operator~): Add nodiscard and noexcept to overloads for
_Ios_Fmtflags, _Ios_Openmode, and _Ios_Iostate.
(operator|=, operator&=, operator^=): Add constexpr for C++14.
* include/bits/regex_constants.h (operator&, operator|, operator^)
(operator~): Add nodiscard and noexcept to overloads for
syntax_option_type and match_flag_type.
(operator|=, operator&=, operator^=): Add noexcept.
* include/std/charconv (operator&, operator|, operator^)
(operator~): Add nodiscard to overloads for chars_format.
* include/std/future (operator&, operator|, operator^)
(operator~): Add nodiscard for overloads for launch.
(operator&=, operator|=, operator^=): Add constexpr for C++14.
* include/experimental/bits/fs_fwd.h  (operator&, operator|)
(operator^, operator~): Add nodiscard to overloads for
copy_options, perms, and directory_options.
* testsuite/27_io/ios_base/types/fmtflags/bitmask_operators.cc:
Add dg-warning for nodiscard warnings.
* testsuite/27_io/ios_base/types/iostate/bitmask_operators.cc:
Likewise.
* testsuite/27_io/ios_base/types/openmode/bitmask_operators.cc:
Likewise.
* testsuite/27_io/filesystem/operations/bitmask_types.cc:
New test.
---
 libstdc++-v3/include/bits/atomic_base.h   |  4 +-
 libstdc++-v3/include/bits/fs_fwd.h| 16 
 libstdc++-v3/include/bits/ios_base.h  | 89 ---
 libstdc++-v3/include/bits/regex_constants.h   | 52 ++-
 .../include/experimental/bits/fs_fwd.h| 12 +++
 libstdc++-v3/include/std/charconv |  4 +
 libstdc++-v3/include/std/future   |  7 ++
 .../filesystem/operations/bitmask_types.cc| 56 
 .../types/fmtflags/bitmask_operators.cc   |  4 +
 .../types/iostate/bitmask_operators.cc|  4 +
 .../types/openmode/bitmask_operators.cc   |  4 +
 11 files changed, 194 insertions(+), 58 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/27_io/filesystem/operations/bitmask_types.cc

diff --git a/libstdc++-v3/include/bits/atomic_base.h 
b/libstdc++-v3/include/bits/atomic_base.h
index d3a2c4f3805..b857b441169 100644
--- a/libstdc++-v3/include/bits/atomic_base.h
+++ b/libstdc++-v3/include/bits/atomic_base.h
@@ -100,13 +100,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   /// @endcond
 
   constexpr memory_order
-  operator|(memory_order __m, __memory_order_modifier __mod)
+  operator|(memory_order __m, __memory_order_modifier __mod) noexcept
   {
 return memory_order(int(__m) | int(__mod));
   }
 
   constexpr memory_order
-  operator&(memory_order __m, __memory_order_modifier __mod)
+  operator&(memory_order __m, __memory_order_modifier __mod) noexcept
   {
 return memory_order(int(__m) & int(__mod));
   }
diff --git a/libstdc++-v3/include/bits/fs_fwd.h 
b/libstdc++-v3/include/bits/fs_fwd.h
index 6208e799b84..7e2bc30df30 100644
--- a/libstdc++-v3/include/bits/fs_fwd.h
+++ b/libstdc++-v3/include/bits/fs_fwd.h
@@ -91,6 +91,7 @@ _GLIBCXX_END_NAMESPACE_CXX11
 
   /// @{
   /// @relates copy_options
+  [[nodiscard]]
   constexpr copy_options
   operator&(copy_options __x, copy_options __y) noexcept
   {
@@ -99,6 +100,7 @@ _GLIBCXX_END_NAMESPACE_CXX11
static_cast<__utype>(__x) & static_cast<__utype>(__y));
   }
 
+  [[nodiscard]]
   constexpr copy_options
   operator|(copy_options __x, copy_options __y) noexcept
   {
@@ -107,6 +109,7 @@ _GLIBCXX_END_NAMESPACE_CXX11
static_cast<__utype>(__x) | static_cast<__utype>(__y));
   }
 
+  [[nodiscard]]
   constexpr copy_options
   operator^(copy_options __x, copy_options __y) noexcept
   {
@@ -115,6 +118,7 @@ _GLIBCXX_END_NAMESPACE_CXX11
static_cast<__utype>(__x) ^ static_cast<__utype>(__y));
   }
 
+  [[nodiscard]]
   constexpr copy_options
   operator~(copy_options __x) noexcept
   {
@@ -161,6 +165,7 @@ _GLIBCXX_END_NAMESPACE_CXX11
 
   /// @{
   /// @relates perms
+  [[nodiscard]]
   constexpr perms
   operator&(perms __x, perms __y) noexcept
   {
@@ -169,6 +174,7 @@ _GLIBCXX_END_NAMESPACE_CXX11
static_cast<__utype>(__x) & static_cast<__utype>(__y));
   }
 
+  [[nodiscard]]
   constexpr perms
   operator|(perms __x, perms __y) noexcept
   {
@@ -177,6 +183,7 @@ 

[PATCH 6/8] libstdc++: Add more nodiscard uses in <vector>

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. Reviews invited.

-- >8 --

libstdc++-v3/ChangeLog:

* include/bits/stl_bvector.h (vector::at): Add
nodiscard.
* include/bits/stl_vector.h (vector::at): Likewise.
(operator==, operator<=>, operator<, operator!=, operator>)
(operator<=, operator>=): Add nodiscard.
* testsuite/23_containers/vector/nodiscard.cc: New test.
---
 libstdc++-v3/include/bits/stl_bvector.h   |   4 +-
 libstdc++-v3/include/bits/stl_vector.h|  18 +--
 .../23_containers/vector/nodiscard.cc | 153 ++
 3 files changed, 164 insertions(+), 11 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/23_containers/vector/nodiscard.cc

diff --git a/libstdc++-v3/include/bits/stl_bvector.h 
b/libstdc++-v3/include/bits/stl_bvector.h
index aa5644b4a0e..2c8b892b07a 100644
--- a/libstdc++-v3/include/bits/stl_bvector.h
+++ b/libstdc++-v3/include/bits/stl_bvector.h
@@ -1101,7 +1101,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
   }
 
 public:
-  _GLIBCXX20_CONSTEXPR
+  _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
   reference
   at(size_type __n)
   {
@@ -1109,7 +1109,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
return (*this)[__n];
   }
 
-  _GLIBCXX20_CONSTEXPR
+  _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
   const_reference
   at(size_type __n) const
   {
diff --git a/libstdc++-v3/include/bits/stl_vector.h 
b/libstdc++-v3/include/bits/stl_vector.h
index 6a9543eefce..a8d387f40a1 100644
--- a/libstdc++-v3/include/bits/stl_vector.h
+++ b/libstdc++-v3/include/bits/stl_vector.h
@@ -1172,7 +1172,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
*  is first checked that it is in the range of the vector.  The
*  function throws out_of_range if the check fails.
*/
-  _GLIBCXX20_CONSTEXPR
+  _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
   reference
   at(size_type __n)
   {
@@ -1191,7 +1191,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
*  is first checked that it is in the range of the vector.  The
*  function throws out_of_range if the check fails.
*/
-  _GLIBCXX20_CONSTEXPR
+  _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
   const_reference
   at(size_type __n) const
   {
@@ -2042,7 +2042,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
*  and if corresponding elements compare equal.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline bool
 operator==(const vector<_Tp, _Alloc>& __x, const vector<_Tp, _Alloc>& __y)
 { return (__x.size() == __y.size()
@@ -2061,7 +2061,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
*  `<` and `>=` etc.
   */
   template
-_GLIBCXX20_CONSTEXPR
+[[nodiscard]] _GLIBCXX20_CONSTEXPR
 inline __detail::__synth3way_t<_Tp>
 operator<=>(const vector<_Tp, _Alloc>& __x, const vector<_Tp, _Alloc>& __y)
 {
@@ -2082,32 +2082,32 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
*  See std::lexicographical_compare() for how the determination is made.
   */
   template
-inline bool
+_GLIBCXX_NODISCARD inline bool
 operator<(const vector<_Tp, _Alloc>& __x, const vector<_Tp, _Alloc>& __y)
 { return std::lexicographical_compare(__x.begin(), __x.end(),
  __y.begin(), __y.end()); }
 
   /// Based on operator==
   template
-inline bool
+_GLIBCXX_NODISCARD inline bool
 operator!=(const vector<_Tp, _Alloc>& __x, const vector<_Tp, _Alloc>& __y)
 { return !(__x == __y); }
 
   /// Based on operator<
   template
-inline bool
+_GLIBCXX_NODISCARD inline bool
 operator>(const vector<_Tp, _Alloc>& __x, const vector<_Tp, _Alloc>& __y)
 { return __y < __x; }
 
   /// Based on operator<
   template
-inline bool
+_GLIBCXX_NODISCARD inline bool
 operator<=(const vector<_Tp, _Alloc>& __x, const vector<_Tp, _Alloc>& __y)
 { return !(__y < __x); }
 
   /// Based on operator<
   template
-inline bool
+_GLIBCXX_NODISCARD inline bool
 operator>=(const vector<_Tp, _Alloc>& __x, const vector<_Tp, _Alloc>& __y)
 { return !(__x < __y); }
 #endif // three-way comparison
diff --git a/libstdc++-v3/testsuite/23_containers/vector/nodiscard.cc 
b/libstdc++-v3/testsuite/23_containers/vector/nodiscard.cc
new file mode 100644
index 000..3b5480d16d4
--- /dev/null
+++ b/libstdc++-v3/testsuite/23_containers/vector/nodiscard.cc
@@ -0,0 +1,153 @@
+// { dg-do compile { target c++17 } }
+
+#include 
+
+void
+test_observers(std::vector v)
+{
+  v.size(); // { dg-warning "ignoring return value" }
+  v.capacity(); // { dg-warning "ignoring return value" }
+  v.empty(); // { dg-warning "ignoring return value" }
+}
+
+void
+test_element_access(std::vector v)
+{
+  v.front(); // { dg-warning "ignoring return value" }
+  v.back();  // { dg-warning "ignoring return value" }
+  v[1];  // { dg-warning "ignoring return value" }
+  v.at(1);   // { dg-warning "ignoring return value" }
+  v.data();  // { 

[PATCH 7/8] libstdc++: Add nodiscard in <algorithm>

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. Reviews invited.

-- >8 --

libstdc++-v3/ChangeLog:

* include/bits/stl_algo.h: Add _GLIBCXX_NODISCARD to algorithm
function templates.
* testsuite/25_algorithms/unique/1.cc: Add dg-warning.
* testsuite/25_algorithms/unique/11480.cc: Likewise.
* testsuite/25_algorithms/unique_copy/26133.cc: Likewise.
---
 libstdc++-v3/include/bits/stl_algo.h  | 38 +--
 .../testsuite/25_algorithms/unique/1.cc   |  1 +
 .../testsuite/25_algorithms/unique/11480.cc   |  2 +-
 .../25_algorithms/unique_copy/26133.cc|  2 +
 4 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/libstdc++-v3/include/bits/stl_algo.h 
b/libstdc++-v3/include/bits/stl_algo.h
index 7a0cf6b6737..bbc130d3e71 100644
--- a/libstdc++-v3/include/bits/stl_algo.h
+++ b/libstdc++-v3/include/bits/stl_algo.h
@@ -320,7 +320,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  [__first1,__last1-(__last2-__first2))
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _ForwardIterator1
 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
 _ForwardIterator2 __first2, _ForwardIterator2 __last2)
@@ -370,7 +370,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _ForwardIterator1
 find_end(_ForwardIterator1 __first1, _ForwardIterator1 __last1,
 _ForwardIterator2 __first2, _ForwardIterator2 __last2,
@@ -405,7 +405,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  @p [__first,__last), and false otherwise.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline bool
 all_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
 { return __last == std::find_if_not(__first, __last, __pred); }
@@ -423,7 +423,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  @p [__first,__last), and false otherwise.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline bool
 none_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
 { return __last == _GLIBCXX_STD_A::find_if(__first, __last, __pred); }
@@ -442,7 +442,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  otherwise.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline bool
 any_of(_InputIterator __first, _InputIterator __last, _Predicate __pred)
 { return !std::none_of(__first, __last, __pred); }
@@ -458,7 +458,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  such that @p __pred(*i) is false, or @p __last if no such iterator 
exists.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _InputIterator
 find_if_not(_InputIterator __first, _InputIterator __last,
_Predicate __pred)
@@ -483,7 +483,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  do not.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline bool
 is_partitioned(_InputIterator __first, _InputIterator __last,
   _Predicate __pred)
@@ -505,7 +505,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*   and @p none_of(mid, __last, __pred) are both true.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 _ForwardIterator
 partition_point(_ForwardIterator __first, _ForwardIterator __last,
_Predicate __pred)
@@ -572,7 +572,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  are copied is unchanged.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _OutputIterator
 remove_copy(_InputIterator __first, _InputIterator __last,
_OutputIterator __result, const _Tp& __value)
@@ -606,7 +606,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _OutputIterator
 remove_copy_if(_InputIterator __first, _InputIterator __last,
   _OutputIterator __result, _Predicate __pred)
@@ -783,7 +783,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  are still present, but their value is unspecified.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _ForwardIterator
 remove(_ForwardIterator __first, _ForwardIterator __last,
   const _Tp& __value)
@@ -817,7 +817,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  are still present, but their value is unspecified.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _ForwardIterator
 remove_if(_ForwardIterator __first, _ForwardIterator __last,
  _Predicate __pred)
@@ -886,7 +886,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
*  are still present, but their value is unspecified.
   */
   template
-_GLIBCXX20_CONSTEXPR
+_GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
 inline _ForwardIterator
 

[PATCH 1/8] libstdc++: Add more [[nodiscard]] to <stacktrace>

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. Reviews invited.

-- >8 --

libstdc++-v3/ChangeLog:

* include/std/stacktrace: Add nodiscard attribute to all
functions without side effects.
---
 libstdc++-v3/include/std/stacktrace | 36 +
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/libstdc++-v3/include/std/stacktrace 
b/libstdc++-v3/include/std/stacktrace
index f570745fe51..92a69a53d98 100644
--- a/libstdc++-v3/include/std/stacktrace
+++ b/libstdc++-v3/include/std/stacktrace
@@ -75,12 +75,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 // [stacktrace.entry.obs], observers
 
+[[nodiscard]]
 constexpr native_handle_type
 native_handle() const noexcept { return _M_pc; }
 
 constexpr explicit operator bool() const noexcept { return _M_pc != -1; }
 
 // [stacktrace.entry.query], query
+[[nodiscard]]
 string
 description() const
 {
@@ -89,6 +91,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   return __s;
 }
 
+[[nodiscard]]
 string
 source_file() const
 {
@@ -97,6 +100,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   return __s;
 }
 
+[[nodiscard]]
 uint_least32_t
 source_line() const
 {
@@ -106,11 +110,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 }
 
 // [stacktrace.entry.cmp], comparison
+[[nodiscard]]
 friend constexpr bool
 operator==(const stacktrace_entry& __x,
   const stacktrace_entry& __y) noexcept
 { return __x._M_pc == __y._M_pc; }
 
+[[nodiscard]]
 friend constexpr strong_ordering
 operator<=>(const stacktrace_entry& __x,
const stacktrace_entry& __y) noexcept
@@ -384,36 +390,49 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   }
 
   // [stacktrace.basic.obs], observers
+  [[nodiscard]]
   allocator_type get_allocator() const noexcept { return _M_alloc; }
 
+  [[nodiscard]]
   const_iterator
   begin() const noexcept
   { return const_iterator{_M_impl._M_frames}; }
 
+  [[nodiscard]]
   const_iterator
   end() const noexcept
   { return begin() + size(); }
 
+  [[nodiscard]]
   const_reverse_iterator
   rbegin() const noexcept
   { return std::make_reverse_iterator(end()); }
 
+  [[nodiscard]]
   const_reverse_iterator
   rend() const noexcept
   { return std::make_reverse_iterator(begin()); }
 
-  const_iterator cbegin() const noexcept { return begin(); }
-  const_iterator cend() const noexcept { return end(); }
-  const_reverse_iterator crbegin() const noexcept { return rbegin(); };
-  const_reverse_iterator crend() const noexcept { return rend(); };
+  [[nodiscard]] const_iterator cbegin() const noexcept { return begin(); }
+  [[nodiscard]] const_iterator cend() const noexcept { return end(); }
+
+  [[nodiscard]]
+  const_reverse_iterator
+  crbegin() const noexcept { return rbegin(); };
+
+  [[nodiscard]]
+  const_reverse_iterator
+  crend() const noexcept { return rend(); };
 
   [[nodiscard]] bool empty() const noexcept { return size() == 0; }
-  size_type size() const noexcept { return _M_impl._M_size; }
+  [[nodiscard]] size_type size() const noexcept { return _M_impl._M_size; }
 
+  [[nodiscard]]
   size_type
   max_size() const noexcept
   { return _Impl::_S_max_size(_M_impl._M_alloc); }
 
+  [[nodiscard]]
   const_reference
   operator[](size_type __n) const noexcept
   {
@@ -421,6 +440,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return begin()[__n];
   }
 
+  [[nodiscard]]
   const_reference
   at(size_type __n) const
   {
@@ -431,12 +451,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   // [stacktrace.basic.cmp], comparisons
   template
+   [[nodiscard]]
friend bool
operator==(const basic_stacktrace& __x,
   const basic_stacktrace<_Allocator2>& __y) noexcept
{ return std::equal(__x.begin(), __x.end(), __y.begin(), __y.end()); }
 
   template
+   [[nodiscard]]
friend strong_ordering
operator<=>(const basic_stacktrace& __x,
const basic_stacktrace<_Allocator2>& __y) noexcept
@@ -677,6 +699,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   return __os;
 }
 
+  [[nodiscard]]
   inline string
   to_string(const stacktrace_entry& __f)
   {
@@ -686,6 +709,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   }
 
   template
+[[nodiscard]]
 string
 to_string(const basic_stacktrace<_Allocator>& __st)
 {
@@ -785,6 +809,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<>
 struct hash
 {
+  [[nodiscard]]
   size_t
   operator()(const stacktrace_entry& __f) const noexcept
   {
@@ -796,6 +821,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template
 struct hash>
 {
+  [[nodiscard]]
   size_t
   operator()(const basic_stacktrace<_Allocator>& __st) const noexcept
   {
-- 
2.43.0



[PATCH 4/8] libstdc++: Fix error handling in std::print

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. Reviews invited.

-- >8 --

The standard requires an exception if std::print fails to write to a
std::ostream.

libstdc++-v3/ChangeLog:

* include/std/ostream (vprint_nonunicode): Throw if stream state
indicates writing failed.
* testsuite/27_io/basic_ostream/print/1.cc: Check for exception.
* testsuite/27_io/print/1.cc: Likewise.
---
 libstdc++-v3/include/std/ostream|  5 +
 .../testsuite/27_io/basic_ostream/print/1.cc| 17 +
 libstdc++-v3/testsuite/27_io/print/1.cc | 16 
 3 files changed, 38 insertions(+)

diff --git a/libstdc++-v3/include/std/ostream b/libstdc++-v3/include/std/ostream
index a136399ad0b..3740ad6edfa 100644
--- a/libstdc++-v3/include/std/ostream
+++ b/libstdc++-v3/include/std/ostream
@@ -901,6 +901,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__catch(...)
  { __os._M_setstate(ios_base::badbit); }
   }
+
+if (!__os)
+  __throw_system_error(EIO);
   }
 
   inline void
@@ -974,6 +977,8 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__catch(...)
  { __os._M_setstate(ios_base::badbit); }
   }
+if (!__os)
+  __throw_system_error(EIO);
 #endif // _WIN32
   }
 
diff --git a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc 
b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
index 71a4daa04c9..14bfb14d556 100644
--- a/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
+++ b/libstdc++-v3/testsuite/27_io/basic_ostream/print/1.cc
@@ -103,6 +103,22 @@ test_locale()
   }
 }
 
+void
+test_errors()
+{
+#ifdef __cpp_exceptions
+  std::stringstream in(std::ios::in);
+  try
+  {
+std::print(in, "{}", "nope");
+VERIFY(false);
+  }
+  catch (const std::system_error&)
+  {
+  }
+#endif
+}
+
 int main()
 {
   test_print_ostream();
@@ -111,4 +127,5 @@ int main()
   test_print_no_padding();
   test_vprint_nonunicode();
   test_locale();
+  test_errors();
 }
diff --git a/libstdc++-v3/testsuite/27_io/print/1.cc 
b/libstdc++-v3/testsuite/27_io/print/1.cc
index 6a294e0454b..d570f7938be 100644
--- a/libstdc++-v3/testsuite/27_io/print/1.cc
+++ b/libstdc++-v3/testsuite/27_io/print/1.cc
@@ -74,6 +74,21 @@ test_vprint_nonunicode()
   // { dg-output "garbage in . garbage out" }
 }
 
+void
+test_errors()
+{
+#ifdef __cpp_exceptions
+  try
+  {
+std::print(stdin, "{}", "nope");
+VERIFY(false);
+  }
+  catch (const std::system_error&)
+  {
+  }
+#endif
+}
+
 int main()
 {
   test_print_default();
@@ -82,4 +97,5 @@ int main()
   test_println_file();
   test_print_raw();
   test_vprint_nonunicode();
+  test_errors();
 }
-- 
2.43.0



[PATCH 2/8] libstdc++: Include <bits/stl_iterator.h> in <bits/alloc_traits.h>

2024-02-27 Thread Jonathan Wakely
Tested x86_64-linux. Reviews invited.

-- >8 --

libstdc++-v3/ChangeLog:

* include/bits/alloc_traits.h: Include <bits/stl_iterator.h> for
__make_move_if_noexcept_iterator.
---
 libstdc++-v3/include/bits/alloc_traits.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libstdc++-v3/include/bits/alloc_traits.h 
b/libstdc++-v3/include/bits/alloc_traits.h
index 5e661e04741..82fc79c7b9f 100644
--- a/libstdc++-v3/include/bits/alloc_traits.h
+++ b/libstdc++-v3/include/bits/alloc_traits.h
@@ -38,6 +38,9 @@
 # if _GLIBCXX_HOSTED
 #  include 
 # endif
+# if __cpp_exceptions
+#  include <bits/stl_iterator.h> // __make_move_if_noexcept_iterator
+# endif
 #endif
 
 namespace std _GLIBCXX_VISIBILITY(default)
-- 
2.43.0



Re: [patch] OpenACC: Add Fortran routines acc_{alloc,free,hostptr,deviceptr,memcpy_{to,from}_device*}

2024-02-27 Thread Thomas Schwinge
Hi Tobias!

On 2024-02-19T22:36:51+0100, Tobias Burnus  wrote:
> While waiting for some testing to finish, I got distracted and added the
> very low hanging OpenACC 3.3 fruits, i.e. those Fortran routines that directly
> map to their C counter part.
>
> Comments, remarks?

Thanks, that largely looks straightforward.  I've not done an in-depth
review, just a few comments.  Resolve these as you think is necessary,
and then 'git push'.

I don't know much about Fortran interfaces -- I trust you've got that
under control.  ;-)

Thanks for the test cases.  Would be nice to have test cases covering all
interfaces -- but I don't think we're currently complete in that regard,
so shall not hold your contribution to higher standards.

> OpenACC: Add Fortran routines 
> acc_{alloc,free,hostptr,deviceptr,memcpy_{to,from}_device*}
>
> These routines map simply to the C counterpart and are meanwhile
> defined in OpenACC 3.3. (There are additional routine changes,
> including the Fortran addition of acc_attach/acc_detach, that
> require more work than a simple addition of an interface and
> are therefore excluded.)

I saw:

  -  "Bogus 'Warning: Interface mismatch in 
global procedure' with C binding"
  -  "[OpenACC][OpenACC 3.3] Add 
'acc_attach'/'acc_detach' routine"

> --- a/libgomp/libgomp.texi
> +++ b/libgomp/libgomp.texi

>  @section @code{acc_malloc} -- Allocate device memory.
>  @table @asis
>  @item @emph{Description}
> -This function allocates @var{len} bytes of device memory. It returns
> +This function allocates @var{bytes} of device memory. It returns

Not '@var{bytes} {+bytes+}' or similar?

>  @section @code{acc_memcpy_to_device} -- Copy host memory to device memory.

>  @item @emph{C/C++}:
>  @multitable @columnfractions .20 .80
> -@item @emph{Prototype}: @tab @code{acc_memcpy_to_device(d_void *dest, h_void 
> *src, size_t bytes);}
> +@item @emph{Prototype}: @tab @code{void acc_memcpy_to_device(d_void* 
> data_dev_dest,}
> +@item   @tab @code{h_void* data_host_src, size_t bytes);}
> +@item @emph{Prototype}: @tab @code{void acc_memcpy_to_device_async(d_void* 
> data_dev_dest,}
> +@item   @tab @code{h_void* data_host_src, size_t bytes, int 
> async_arg);}
> +@end multitable
> +
> +@item @emph{Fortran}:
> +@multitable @columnfractions .20 .80
> +@item @emph{Interface}: @tab @code{subroutine 
> acc_memcpy_to_device(data_dev_dest, &}
> +@item   @tab @code{data_host_src, bytes)}
> +@item @emph{Interface}: @tab @code{subroutine 
> acc_memcpy_to_device_async(data_dev_dest, &}
> +@item   @tab @code{data_host_src, bytes, async_arg)}
> +@item   @tab @code{type(c_ptr), value :: data_dev_dest}
> +@item   @tab @code{type(*), dimension(*) :: data_host_src}
> +@item   @tab @code{integer(c_size_t), value :: bytes}
> +@item   @tab @code{integer(acc_handle_kind), value :: 
> async_arg}
>  @end multitable

I did wonder whether we should (here, and elsewhere) also update the
'@menu' in "OpenACC Runtime Library Routines" to list the 'async'
routines -- but the OpenACC specification also doesn't, so it shall be
fine as is here, too.

>  @item @emph{Reference}:
>  @uref{https://www.openacc.org, OpenACC specification v2.6}, section
> -3.2.31.
> +3.2.31  @uref{https://www.openacc.org, OpenACC specification v3.3}, section

(Fine as is, of course, but could -- generally -- simplify the 'diff' by
starting the new '@uref' on its own line.)

> +3.2.26..

Double '.'.

> --- a/libgomp/openacc.f90
> +++ b/libgomp/openacc.f90
> @@ -758,6 +758,93 @@ module openacc_internal
>integer (c_int), value :: async
>  end subroutine
>end interface
> +
> +  interface
> +type(c_ptr) function acc_malloc (bytes) bind(C)
> +[...]
> +end subroutine
> +  end interface
>  end module openacc_internal

Assuming that 'module openacc_internal' currently is sorted per
appearance in the OpenACC specification (?), I suggest we continue to do
so.  (..., like in 'openacc_lib.h', too.)

> @@ -794,6 +881,9 @@ module openacc
>public :: acc_copyin_async, acc_create_async, acc_copyout_async
>public :: acc_delete_async, acc_update_device_async, acc_update_self_async
>public :: acc_copyout_finalize, acc_delete_finalize
> +  public :: acc_malloc, acc_free, acc_map_data, acc_unmap_data, acc_deviceptr
> +  public :: acc_hostptr, acc_memcpy_to_device, acc_memcpy_to_device_async
> +  public :: acc_memcpy_from_device, acc_memcpy_from_device_async

Likewise.

> @@ -871,9 +961,6 @@ module openacc
>  procedure :: acc_on_device_h
>end interface
>  
> -  ! acc_malloc: Only available in C/C++
> -  ! acc_free: Only available in C/C++
> -
>! As vendor extension, the following code supports both 32bit and 64bit
>! arguments for "size"; the OpenACC standard only permits default-kind
>! integers, which are of kind 4 (i.e. 32 bits).
> @@ -953,20 

RE: [PATCH v1] Internal-fn: Add new internal function SAT_ADDU

2024-02-27 Thread Tamar Christina
> Am 19.02.24 um 08:36 schrieb Richard Biener:
> > On Sat, Feb 17, 2024 at 11:30 AM  wrote:
> >>
> >> From: Pan Li 
> >>
> >> This patch would like to add the middle-end representation for the
> >> unsigned saturating add, i.e. set the result of the add to the max
> >> on overflow.  It will take a pattern similar to the one below.
> >>
> >> SAT_ADDU (x, y) => (x + y) | (-(TYPE)((TYPE)(x + y) < x))
> 
> Does this even try to work out the costs?
> 
> For example, with the following example
> 
> 
> #define T __UINT16_TYPE__
> 
> T sat_add1 (T x, T y)
> {
>return (x + y) | (- (T)((T)(x + y) < x));
> }
> 
> T sat_add2 (T x, T y)
> {
>  T z = x + y;
>  if (z < x)
>  z = (T) -1;
>  return z;
> }
> 
> And then "avr-gcc -S -Os -dp" the code is
> 
> 
> sat_add1:
>   add r22,r24  ;  7   [c=8 l=2]  *addhi3/0
>   adc r23,r25
>   ldi r18,lo8(1)   ;  8   [c=4 l=2]  *movhi/4
>   ldi r19,0
>   cp r22,r24   ;  9   [c=8 l=2]  cmphi3/2
>   cpc r23,r25
>   brlo .L2 ;  10  [c=16 l=1]  branch
>   ldi r19,0;  31  [c=4 l=1]  movqi_insn/0
>   ldi r18,0;  32  [c=4 l=1]  movqi_insn/0
> .L2:
>   clr r24  ;  13  [c=12 l=4]  neghi2/1
>   clr r25
>   sub r24,r18
>   sbc r25,r19
>   or r24,r22   ;  29  [c=4 l=1]  iorqi3/0
>   or r25,r23   ;  30  [c=4 l=1]  iorqi3/0
>   ret  ;  35  [c=0 l=1]  return
> 
> sat_add2:
>   add r22,r24  ;  8   [c=8 l=2]  *addhi3/0
>   adc r23,r25
>   cp r22,r24   ;  9   [c=8 l=2]  cmphi3/2
>   cpc r23,r25
>   brsh .L3 ;  10  [c=16 l=1]  branch
>   ldi r22,lo8(-1)  ;  5   [c=4 l=2]  *movhi/4
>   ldi r23,lo8(-1)
> .L3:
>   mov r25,r23  ;  21  [c=4 l=1]  movqi_insn/0
>   mov r24,r22  ;  22  [c=4 l=1]  movqi_insn/0
>   ret  ;  25  [c=0 l=1]  return
> 
> i.e. the conditional jump is better than overly smart arithmetic
> (smaller and faster code with less register pressure).
> With larger types the difference is even more pronounced.
> 

*on AVR. https://godbolt.org/z/7jaExbTa8  shows the branchless code is better.
And the branchy code will vectorize worse if at all 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112600
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=51492

But looking at that output it just seems like it's your expansion that's 
inefficient.

But fair point, perhaps it should be just a normal DEF_INTERNAL_SIGNED_OPTAB_FN 
so that we
provide the additional optimization only for targets that want it.

Tamar

> >> Take uint8_t as example, we will have:
> >>
> >> * SAT_ADDU (1, 254)   => 255.
> >> * SAT_ADDU (1, 255)   => 255.
> >> * SAT_ADDU (2, 255)   => 255.
> >> * SAT_ADDU (255, 255) => 255.
> >>
> >> The patch also implements SAT_ADDU in the riscv backend as
> >> an example.  Given the example below:
> >>
> >> uint64_t sat_add_u64 (uint64_t x, uint64_t y)
> >> {
> >>return (x + y) | (- (uint64_t)((uint64_t)(x + y) < x));
> >> }
> >>
> >> Before this patch:
> >>
> >> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> >> {
> >>long unsigned int _1;
> >>_Bool _2;
> >>long unsigned int _3;
> >>long unsigned int _4;
> >>uint64_t _7;
> >>long unsigned int _10;
> >>__complex__ long unsigned int _11;
> >>
> >> ;;   basic block 2, loop depth 0
> >> ;;pred:   ENTRY
> >>_11 = .ADD_OVERFLOW (x_5(D), y_6(D));
> >>_1 = REALPART_EXPR <_11>;
> >>_10 = IMAGPART_EXPR <_11>;
> >>_2 = _10 != 0;
> >>_3 = (long unsigned int) _2;
> >>_4 = -_3;
> >>_7 = _1 | _4;
> >>return _7;
> >> ;;succ:   EXIT
> >>
> >> }
> >>
> >> After this patch:
> >>
> >> uint64_t sat_add_uint64_t (uint64_t x, uint64_t y)
> >> {
> >>uint64_t _7;
> >>
> >> ;;   basic block 2, loop depth 0
> >> ;;pred:   ENTRY
> >>_7 = .SAT_ADDU (x_5(D), y_6(D)); [tail call]
> >>return _7;
> >> ;;succ:   EXIT
> >>
> >> }
> >>
> >> Then we will have the middle-end representation like .SAT_ADDU after
> >> this patch.
> >
> > I'll note that on RTL we already have SS_PLUS/US_PLUS and friends and
> > the corresponding ssadd/usadd optabs.  There's not much documentation
> > unfortunately besides the use of gen_*_fixed_libfunc usage where the comment
> > suggests this is used for fixed-point operations.  It looks like arm uses
> > fractional/accumulator modes for this but for example bfin has ssaddsi3.
> >
> > So the question is whether the fixed-point case can be distinguished from
> > the integer case based on mode.
> >
> > There's also FIXED_POINT_TYPE on the GENERIC/GIMPLE side and
> > no special tree operator codes for them.  So compared to what appears
> > to be the case on RTL we'd need a way to represent saturating integer
> > operations on GIMPLE.
> >
> > The natural thing is to use direct optab internal functions (that's what you
> > basically did, but you added a new optab, IMO without good reason).
> > More 

  1   2   >