Re: [PATCH 2/4] xtensa: Consider the Loop Option when setmemsi is expanded to small loop

2022-06-10 Thread Takayuki 'January June' Suwa via Gcc-patches

On 2022/06/11 9:12, Max Filippov wrote:

Hi Suwa-san,

hi!


This change results in a bunch of ICEs in tests that look like this:

gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c: In function 'main':
gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: error:
unrecognizable insn:
(insn 7 6 8 2 (set (reg:SI 45)
(plus:SI (reg:SI 44)
(const_int 262144 [0x40000])))

oh, what a mistake of mine... it's so RISCy!

int array[65535];
void test(void) {
  __builtin_memset(array, 0, sizeof(array));
}

.literal_position
.literal .LC0, array
.literal .LC2, 65535
test:
l32r    a3, .LC0
l32r    a2, .LC2
movi.n  a4, 0
loop    a2, .L2_LEND
.L2:
s32i.n  a4, a3, 0
addi.n  a3, a3, 4
.L2_LEND:
ret.n
---
  gcc/config/xtensa/xtensa.cc | 71 ++---
  1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index c7b54babc37..bc3330f836f 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -1483,7 +1483,7 @@ xtensa_expand_block_set_unrolled_loop (rtx *operands)
  int
  xtensa_expand_block_set_small_loop (rtx *operands)
  {
-  HOST_WIDE_INT bytes, value, align;
+  HOST_WIDE_INT bytes, value, align, count;
int expand_len, funccall_len;
rtx x, dst, end, reg;
machine_mode unit_mode;
@@ -1503,17 +1503,25 @@ xtensa_expand_block_set_small_loop (rtx *operands)
/* Totally-aligned block only.  */
if (bytes % align != 0)
  return 0;
+  count = bytes / align;

-  /* If 4-byte aligned, small loop substitution is almost optimal, thus
- limited to only offset to the end address for ADDI/ADDMI
instruction.  */
-  if (align == 4
-  && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0)))
-return 0;
+  /* If the Loop Option (zero-overhead looping) is configured and active,
+ almost no restrictions about the length of the block.  */
+  if (! (TARGET_LOOPS && optimize))
+{
+  /* If 4-byte aligned, small loop substitution is almost optimal,
+thus limited to only offset to the end address for ADDI/ADDMI
+instruction.  */
+  if (align == 4
+ && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0)))
+   return 0;

-  /* If no 4-byte aligned, loop count should be treated as the
constraint.  */
-  if (align != 4
-  && bytes / align > ((optimize > 1 && !optimize_size) ? 8 : 15))
-return 0;
+  /* If no 4-byte aligned, loop count should be treated as the
+constraint.  */
+  if (align != 4
+ && count > ((optimize > 1 && !optimize_size) ? 8 : 15))
+   return 0;
+}

/* Insn expansion: holding the init value.
   Either MOV(.N) or L32R w/litpool.  */
@@ -1523,16 +1531,33 @@ xtensa_expand_block_set_small_loop (rtx *operands)
  expand_len = TARGET_DENSITY ? 2 : 3;
else
  expand_len = 3 + 4;
-  /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
-  expand_len += bytes > 127 ? 3
-   : (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
-
-  /* Insn expansion: the loop body and branch instruction.
- For store, one of S8I, S16I or S32I(.N).
- For advance, ADDI(.N).
- For branch, BNE.  */
-  expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
-   + (TARGET_DENSITY ? 2 : 3) + 3;
+  if (TARGET_LOOPS && optimize) /* zero-overhead looping */
+{
+  /* Insn translation: Either MOV(.N) or L32R w/litpool for the
+loop count.  */
+  expand_len += xtensa_simm12b (count) ? xtensa_sizeof_MOVI (count)
+  : 3 + 4;
+  /* Insn translation: LOOP, the zero-overhead looping setup
+instruction.  */
+  expand_len += 3;
+  /* Insn expansion: the loop body instructions.
+   For store, one of S8I, S16I or S32I(.N).
+   For advance, ADDI(.N).  */
+  expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
+   + (TARGET_DENSITY ? 2 : 3);
+}
+  else /* NO zero-overhead looping */
+{
+  /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
+  expand_len += bytes > 127 ? 3
+   : (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
+  /* Insn expansion: the loop body and branch instruction.
+   For store, one of S8I, S16I or S32I(.N).
+   For advance, ADDI(.N).
+   For branch, BNE.  */
+  expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
+   + (TARGET_DENSITY ? 2 : 3) + 3;
+}

/* Function call: preparing two arguments.  */
funccall_len = xtensa_sizeof_MOVI (value);
@@ -1555,7 +1580,11 @@ xtensa_expand_block_set_small_loop (rtx *operands)
dst = gen_reg_rtx (SImode);
emit_move_insn (dst, x);
end = gen_reg_rtx (SImode);
-  emit_insn (gen_addsi3 (end, dst, operands[1] /* the length */));
+  if (TARGET_LOOPS && optimize)
+x = force_reg (SImode, 

Re: [PATCH 2/4] xtensa: Consider the Loop Option when setmemsi is expanded to small loop

2022-06-10 Thread Max Filippov via Gcc-patches
Hi Suwa-san,

On Thu, Jun 9, 2022 at 9:26 PM Takayuki 'January June' Suwa
 wrote:
>
> Now apply to almost any size of aligned block under such circumstances.
>
> gcc/ChangeLog:
>
> * config/xtensa/xtensa.cc (xtensa_expand_block_set_small_loop):
> Pass through the block length / loop count conditions if
> zero-overhead looping is configured and active.
> ---
>   gcc/config/xtensa/xtensa.cc | 65 +
>   1 file changed, 45 insertions(+), 20 deletions(-)

This change results in a bunch of ICEs in tests that look like this:

gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c: In function 'main':
gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: error:
unrecognizable insn:
(insn 7 6 8 2 (set (reg:SI 45)
   (plus:SI (reg:SI 44)
   (const_int 262144 [0x40000])))
"gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c":23:3 -1
(nil))
during RTL pass: vregs
gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: internal
compiler error: in extract_insn, at recog.cc:2791
0x6a21cf _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
   gcc/gcc/rtl-error.cc:108
0x6a2252 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
   gcc/gcc/rtl-error.cc:116
0x693824 extract_insn(rtx_insn*)
   gcc/gcc/recog.cc:2791
0xb27647 instantiate_virtual_regs_in_insn
   gcc/gcc/function.cc:1611
0xb27647 instantiate_virtual_regs
   gcc/gcc/function.cc:1985
0xb27647 execute
   gcc/gcc/function.cc:2034

-- 
Thanks.
-- Max


Re: [PATCH] libstdc++: Rename __null_terminated to avoid collision with Apple SDK

2022-06-10 Thread Mark Mentovai
Thanks, Jonathan. I am, in fact, so certifying.

I do believe that bringing up support for new OS versions is in scope for
open branches, and it makes sense to merge, particularly for a trivial and
uncontentious patch like this one.

Jonathan Wakely wrote:

> On Fri, 10 Jun 2022 at 21:12, Mark Mentovai  wrote:
> >
> > The macOS 13 SDK (and equivalent-version iOS and other Apple OS SDKs)
> > contain this definition in :
> >
> > 863  #define __null_terminated
> >
> > This collides with the use of __null_terminated in libstdc++'s
> > experimental fs_path.h.
> >
> > As libstdc++'s use of this token is entirely internal to fs_path.h, the
> > simplest workaround, renaming it, is most appropriate. Here, it's
> > renamed to __nul_terminated, referencing the NUL ('\0') value that is
> > used to terminate the strings in the context in which this tag structure
> > is used.
> >
> > libstdc++-v3/ChangeLog:
> >
> > * include/experimental/bits/fs_path.h: Rename __null_terminated
> > to __nul_terminated to avoid colliding with a macro in Apple's SDK.
> >
> > Signed-off-by: Mark Mentovai 
>
> Thanks for the patch. The change makes sense so I'll get it committed.
> Is this change needed on the release branches too?
>
> Just to be sure, could you please confirm that your Signed-off-by: tag
> is to certify you agree with the DCO at https://gcc.gnu.org/dco.html
> (and not just something you're doing because you've seen others doing
> it :-)
>
> Thanks again.
>
>


Re: [PATCH] libstdc++: Rename __null_terminated to avoid collision with Apple SDK

2022-06-10 Thread Jonathan Wakely via Gcc-patches
On Fri, 10 Jun 2022 at 21:12, Mark Mentovai  wrote:
>
> The macOS 13 SDK (and equivalent-version iOS and other Apple OS SDKs)
> contain this definition in :
>
> 863  #define __null_terminated
>
> This collides with the use of __null_terminated in libstdc++'s
> experimental fs_path.h.
>
> As libstdc++'s use of this token is entirely internal to fs_path.h, the
> simplest workaround, renaming it, is most appropriate. Here, it's
> renamed to __nul_terminated, referencing the NUL ('\0') value that is
> used to terminate the strings in the context in which this tag structure
> is used.
>
> libstdc++-v3/ChangeLog:
>
> * include/experimental/bits/fs_path.h: Rename __null_terminated
> to __nul_terminated to avoid colliding with a macro in Apple's SDK.
>
> Signed-off-by: Mark Mentovai 

Thanks for the patch. The change makes sense so I'll get it committed.
Is this change needed on the release branches too?

Just to be sure, could you please confirm that your Signed-off-by: tag
is to certify you agree with the DCO at https://gcc.gnu.org/dco.html
(and not just something you're doing because you've seen others doing
it :-)

Thanks again.



[PATCH] libstdc++: Rename __null_terminated to avoid collision with Apple SDK

2022-06-10 Thread Mark Mentovai
The macOS 13 SDK (and equivalent-version iOS and other Apple OS SDKs)
contain this definition in :

863  #define __null_terminated

This collides with the use of __null_terminated in libstdc++'s
experimental fs_path.h.

As libstdc++'s use of this token is entirely internal to fs_path.h, the
simplest workaround, renaming it, is most appropriate. Here, it's
renamed to __nul_terminated, referencing the NUL ('\0') value that is
used to terminate the strings in the context in which this tag structure
is used.

libstdc++-v3/ChangeLog:

* include/experimental/bits/fs_path.h: Rename __null_terminated
to __nul_terminated to avoid colliding with a macro in Apple's SDK.

Signed-off-by: Mark Mentovai 
---
 libstdc++-v3/include/experimental/bits/fs_path.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libstdc++-v3/include/experimental/bits/fs_path.h 
b/libstdc++-v3/include/experimental/bits/fs_path.h
index b0825ba76e80..19d246100cb5 100644
--- a/libstdc++-v3/include/experimental/bits/fs_path.h
+++ b/libstdc++-v3/include/experimental/bits/fs_path.h
@@ -140,10 +140,10 @@ namespace __detail
 inline _Source
 _S_range_begin(_Source __begin) { return __begin; }
 
-  struct __null_terminated { };
+  struct __nul_terminated { };
 
   template
-inline __null_terminated
+inline __nul_terminated
 _S_range_end(_Source) { return {}; }
 
   template
@@ -459,11 +459,11 @@ namespace __detail
   struct _Cvt;
 
 static string_type
-_S_convert(value_type* __src, __detail::__null_terminated)
+_S_convert(value_type* __src, __detail::__nul_terminated)
 { return string_type(__src); }
 
 static string_type
-_S_convert(const value_type* __src, __detail::__null_terminated)
+_S_convert(const value_type* __src, __detail::__nul_terminated)
 { return string_type(__src); }
 
 template
@@ -477,7 +477,7 @@ namespace __detail
 
 template
   static string_type
-  _S_convert(_InputIterator __src, __detail::__null_terminated)
+  _S_convert(_InputIterator __src, __detail::__nul_terminated)
   {
auto __s = _S_string_from_iter(__src);
return _S_convert(__s.c_str(), __s.c_str() + __s.size());
@@ -504,7 +504,7 @@ namespace __detail
 
 template
   static string_type
-  _S_convert_loc(_InputIterator __src, __detail::__null_terminated,
+  _S_convert_loc(_InputIterator __src, __detail::__nul_terminated,
 const std::locale& __loc)
   {
const std::string __s = _S_string_from_iter(__src);
-- 
2.36.1



Re: [PATCH] regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041]

2022-06-10 Thread Jeff Law via Gcc-patches




On 6/10/2022 9:40 AM, Segher Boessenkool wrote:

Hi!

On Fri, Jun 10, 2022 at 07:52:57PM +0530, Surya Kumari Jangala wrote:

In check_new_reg_p, the nregs of a du chain is computed by obtaining the MODE
of the first element in the chain, and then calling hard_regno_nregs() with the
MODE. But the first element of the chain can be a DEBUG_INSN whose mode need
not be the same as the rest of the elements in the du chain. This
was resulting in fcompare-debug failure as check_new_reg_p was returning a
different result with -g for the same candidate register. We can instead obtain
nregs from the du chain itself.

Great, thanks for finding and fixing this!  I cannot approve it, you'll
have to wait for someone who can.  It looks fine to me, but that does
not mean so much in regrename.c :-)
I'll go ahead and ACK the regrename bits.  So as soon as you're happy 
with the testsuite bits, this is good to go.


jeff



Re: [PATCH] c++: Add support for __real__/__imag__ modifications in constant expressions [PR88174]

2022-06-10 Thread Jakub Jelinek via Gcc-patches
On Fri, Jun 10, 2022 at 01:27:28PM -0400, Jason Merrill wrote:
> > --- gcc/cp/constexpr.cc.jj  2022-06-08 08:21:02.973448193 +0200
> > +++ gcc/cp/constexpr.cc 2022-06-08 17:13:04.986040449 +0200
> > @@ -5707,6 +5707,20 @@ cxx_eval_store_expression (const constex
> >   }
> >   break;
> > +   case REALPART_EXPR:
> > + gcc_assert (probe == target);
> 
> Doesn't this assert mean that complex_expr will always be == valp?

No, even when handling the pushed *PART_EXPR, it will set
valp = &TREE_OPERAND (*valp, index != integer_zero_node);
So, valp will be either &TREE_OPERAND (*complex_expr, 0)
or &TREE_OPERAND (*complex_expr, 1).
As *valp = init; is what is usually then stored and we want to store there
the scalar.

> > @@ -5946,6 +5984,24 @@ cxx_eval_store_expression (const constex
> > = get_or_insert_ctor_field (*valp, indexes[i], index_pos_hints[i]);
> >   valp = >value;
> > }
> > +  if (complex_part != -1)
> > +   {
> > + if (TREE_CODE (*valp) == COMPLEX_CST)
> > +   *valp = build2 (COMPLEX_EXPR, TREE_TYPE (*valp),
> > +   TREE_REALPART (*valp),
> > +   TREE_IMAGPART (*valp));
> > + else if (TREE_CODE (*valp) == CONSTRUCTOR
> > +  && CONSTRUCTOR_NELTS (*valp) == 0
> > +  && CONSTRUCTOR_NO_CLEARING (*valp))
> > +   {
> > + tree r = build_constructor (TREE_TYPE (TREE_TYPE (*valp)), NULL);
> > + CONSTRUCTOR_NO_CLEARING (r) = 1;
> > + *valp = build2 (COMPLEX_EXPR, TREE_TYPE (*valp), r, r);
> > +   }
> > + gcc_assert (TREE_CODE (*valp) == COMPLEX_EXPR);
> > + complex_expr = valp;
> > + valp = &TREE_OPERAND (*valp, complex_part);
> 
> I don't understand this block; shouldn't valp point to the real or imag part
> of the complex number at this point?  How could complex_part be set without
> us handling the complex case in the loop already?

Because for most references, the code will do:
  vec_safe_push (ctors, *valp);
  vec_safe_push (indexes, index);
I chose not to do this for *PART_EXPR, because the COMPLEX_EXPR isn't a
CONSTRUCTOR and code later on e.g. walks all the ctors and accesses
CONSTRUCTOR_NO_CLEARING on them etc.  As the *PART_EXPR is asserted to
be outermost only, complex_expr is a variant of that ctors push and
complex_part of the indexes.
The reason for the above if is just in case the evaluation of the rhs
of the store would store to the complex and could e.g. make it a COMPLEX_CST
again.

> > +   }
> >   }
> > if (*non_constant_p)
> > @@ -6016,6 +6072,22 @@ cxx_eval_store_expression (const constex
> > if (TREE_CODE (TREE_TYPE (elt)) == UNION_TYPE)
> >   CONSTRUCTOR_NO_CLEARING (elt) = false;
> > }
> > +  if (complex_expr)
> 
> I might have added the COMPLEX_EXPR to ctors instead of a separate variable,
> but this is fine too.

See above.
The COMPLEX_EXPR needs special handling (conversion into COMPLEX_CST if it
is constant) anyway.

Jakub



c++: Add a late-writing step for modules

2022-06-10 Thread Nathan Sidwell via Gcc-patches


To add a module initializer optimization, we need to defer finishing 
writing out the module file until the end of determining the dynamic 
initializers. This is achieved by passing some saved-state from the main 
module writing to a new function that completes it.


This patch merely adds the skeleton of that state and move things 
around, allowing the finalization of the ELF file to be postponed.  None 
of the contents writing is moved, or the init optimization added.


nathan

--
Nathan SidwellFrom e6d369bbdb4eb5f03eec233ef9905013a735fd71 Mon Sep 17 00:00:00 2001
From: Nathan Sidwell 
Date: Thu, 9 Jun 2022 08:14:31 -0700
Subject: [PATCH] c++: Add a late-writing step for modules

To add a module initializer optimization, we need to defer finishing writing
out the module file until the end of determining the dynamic initializers.
This is achieved by passing some saved-state from the main module writing
to a new function that completes it.

This patch merely adds the skeleton of that state and move things around,
allowing the finalization of the ELF file to be postponed.  None of the
contents writing is moved, or the init optimization added.

	gcc/cp/
	* cp-tree.h (fini_modules): Add some parameters.
	(finish_module_processing): Return an opaque pointer.
	* decl2.cc (c_parse_final_cleanups): Propagate a cookie from
	finish_module_processing to fini_modules.
	* module.cc (struct module_processing_cookie): New.
	(finish_module_processing): Return a heap-allocated cookie.
	(late_finish_module): New.  Finish out the module writing.
	(fini_modules): Adjust.
---
 gcc/cp/cp-tree.h |   4 +-
 gcc/cp/decl2.cc  |   4 +-
 gcc/cp/module.cc | 145 ++-
 3 files changed, 98 insertions(+), 55 deletions(-)

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index f1294dac7d5..60d7b201595 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -7209,9 +7209,9 @@ extern void import_module (module_state *, location_t, bool export_p,
 extern void declare_module (module_state *, location_t, bool export_p,
 			tree attr, cpp_reader *);
 extern void init_modules (cpp_reader *);
-extern void fini_modules ();
+extern void fini_modules (cpp_reader *, void *cookie);
 extern void maybe_check_all_macros (cpp_reader *);
-extern void finish_module_processing (cpp_reader *);
+extern void *finish_module_processing (cpp_reader *);
 extern char const *module_name (unsigned, bool header_ok);
 extern bitmap get_import_bitmap ();
 extern bitmap visible_instantiation_path (bitmap *);
diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index ff1c36745cf..cc0b41324b3 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -5154,7 +5154,7 @@ c_parse_final_cleanups (void)
 	reconsider = true;
 }
 
-  finish_module_processing (parse_in);
+  void *module_cookie = finish_module_processing (parse_in);
 
   lower_var_init ();
 
@@ -5238,7 +5238,7 @@ c_parse_final_cleanups (void)
   }
   pop_lang_context ();
 
-  fini_modules ();
+  fini_modules (parse_in, module_cookie);
 
   /* Generate any missing aliases.  */
   maybe_apply_pending_pragma_weaks ();
diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc
index 2b1877ea82e..51d774ae608 100644
--- a/gcc/cp/module.cc
+++ b/gcc/cp/module.cc
@@ -19854,11 +19854,32 @@ maybe_check_all_macros (cpp_reader *reader)
   dump.pop (n);
 }
 
+// State propagated from finish_module_processing to fini_modules
+struct module_processing_cookie
+{
+  elf_out out;
+  char *cmi_name;
+  char *tmp_name;
+  bool began;
+
+  module_processing_cookie (char *cmi, char *tmp, int fd, int e)
+: out (fd, e), cmi_name (cmi), tmp_name (tmp), began (false)
+  {
+  }
+  ~module_processing_cookie ()
+  {
+XDELETEVEC (tmp_name);
+XDELETEVEC (cmi_name);
+  }
+};
+
 /* Write the CMI, if we're a module interface.  */
 
-void
+void *
 finish_module_processing (cpp_reader *reader)
 {
+  module_processing_cookie *cookie = nullptr;
+
   if (header_module_p ())
 module_kind &= ~MK_EXPORTING;
 
@@ -19870,7 +19891,7 @@ finish_module_processing (cpp_reader *reader)
   else if (!flag_syntax_only)
 {
   int fd = -1;
-  int e = ENOENT;
+  int e = -1;
 
   timevar_start (TV_MODULE_EXPORT);
 
@@ -19879,7 +19900,7 @@ finish_module_processing (cpp_reader *reader)
   linemap_add (line_table, LC_ENTER, false, "", 0);
 
   /* We write to a tmpname, and then atomically rename.  */
-  const char *path = NULL;
+  char *cmi_name = NULL;
   char *tmp_name = NULL;
   module_state *state = (*modules)[0];
 
@@ -19888,9 +19909,9 @@ finish_module_processing (cpp_reader *reader)
   if (state->filename)
 	{
 	  size_t len = 0;
-	  path = maybe_add_cmi_prefix (state->filename, &len);
+	  cmi_name = xstrdup (maybe_add_cmi_prefix (state->filename, &len));
 	  tmp_name = XNEWVEC (char, len + 3);
-	  memcpy (tmp_name, path, len);
+	  memcpy (tmp_name, cmi_name, len);
 	  strcpy (&tmp_name[len], "~");
 
 	  if (!errorcount)
@@ -19905,57 +19926,23 @@ finish_module_processing (cpp_reader 

[PATCH] i386: Fix up *3_doubleword_mask [PR105911

2022-06-10 Thread Jakub Jelinek via Gcc-patches
Hi!

Another regression caused by my recent patch.

This time because define_insn_and_split only requires that the
constant mask is const_int_operand.  When it was only SImode,
that wasn't a problem, HImode neither, but for DImode if we need
to and the shift count we might run into a problem that it isn't
a representable signed 32-bit immediate.

But, we don't really care about the upper bits of the mask, so
we can just mask the CONST_INT with the mode mask.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-06-10  Jakub Jelinek  

PR target/105911
* config/i386/i386.md (*ashl<dwi>3_doubleword_mask,
*<insn><dwi>3_doubleword_mask): Use operands[3] masked with
(<MODE_SIZE> * BITS_PER_UNIT) - 1 as AND operand instead of
operands[3] unmodified.

* gcc.dg/pr105911.c: New test.

--- gcc/config/i386/i386.md.jj  2022-06-08 08:21:26.0 +0200
+++ gcc/config/i386/i386.md 2022-06-10 11:37:21.931171567 +0200
@@ -11937,7 +11937,8 @@ (define_insn_and_split "*ashl3_doub
   rtx xops[3];
   xops[0] = gen_reg_rtx (GET_MODE (operands[2]));
   xops[1] = operands[2];
-  xops[2] = operands[3];
+  xops[2] = GEN_INT (INTVAL (operands[3])
+		     & ((<MODE_SIZE> * BITS_PER_UNIT) - 1));
   ix86_expand_binary_operator (AND, GET_MODE (operands[2]), xops);
   operands[2] = xops[0];
 }
@@ -12905,7 +12906,8 @@ (define_insn_and_split "*3_do
   rtx xops[3];
   xops[0] = gen_reg_rtx (GET_MODE (operands[2]));
   xops[1] = operands[2];
-  xops[2] = operands[3];
+  xops[2] = GEN_INT (INTVAL (operands[3])
+		     & ((<MODE_SIZE> * BITS_PER_UNIT) - 1));
   ix86_expand_binary_operator (AND, GET_MODE (operands[2]), xops);
   operands[2] = xops[0];
 }
--- gcc/testsuite/gcc.dg/pr105911.c.jj  2022-06-10 11:45:38.314044503 +0200
+++ gcc/testsuite/gcc.dg/pr105911.c 2022-06-10 11:45:18.068253633 +0200
@@ -0,0 +1,16 @@
+/* PR target/105911 */
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+__int128 v, x;
+unsigned __int128 w;
+
+void bar (__int128, __int128);
+
+void
+foo (void)
+{
+  bar (v /= v, v >> (v &= 0x10001));
+  bar (w /= w, w >> (w &= 0x30003));
+  bar (x /= x, x << (x &= 0x70007));
+}

Jakub



Re: [committed] openmp: Add support for HBW or large capacity or interleaved memory through the libmemkind.so library

2022-06-10 Thread Jakub Jelinek via Gcc-patches
On Thu, Jun 09, 2022 at 01:57:52PM +0200, Jakub Jelinek via Gcc-patches wrote:
> On Thu, Jun 09, 2022 at 12:11:28PM +0200, Thomas Schwinge wrote:
> > On 2022-06-09T10:19:03+0200, Jakub Jelinek via Gcc-patches 
> >  wrote:
> > > This patch adds support for dlopening libmemkind.so
> > 
> > Instead of 'dlopen'ing literally 'libmemkind.so':
> > 
> > > --- libgomp/allocator.c.jj2022-06-08 08:21:03.099446883 +0200
> > > +++ libgomp/allocator.c   2022-06-08 13:41:45.647133610 +0200
> > 
> > > +  void *handle = dlopen ("libmemkind.so", RTLD_LAZY);
> > 
> > ..., shouldn't this instead 'dlopen' 'libmemkind.so.0'?  At least for
> > Debian/Ubuntu, the latter ('libmemkind.so.0') is shipped in the "library"
> > package:
> 
> I agree and I've actually noticed it too right before committing, but I 
> thought
> I'll investigate and tweak incrementally because "libmemkind.so"
> is what I've actually tested (it is what llvm libomp uses).

And here is what I've committed after bootstrapping/regtesting it on
x86_64-linux and i686-linux.

2022-06-10  Jakub Jelinek  

* allocator.c (gomp_init_memkind): Call dlopen with "libmemkind.so.0"
rather than "libmemkind.so".

--- libgomp/allocator.c.jj  2022-06-09 10:14:33.470973961 +0200
+++ libgomp/allocator.c 2022-06-09 14:05:33.665803457 +0200
@@ -99,7 +99,7 @@ static pthread_once_t memkind_data_once
 static void
 gomp_init_memkind (void)
 {
-  void *handle = dlopen ("libmemkind.so", RTLD_LAZY);
+  void *handle = dlopen ("libmemkind.so.0", RTLD_LAZY);
   struct gomp_memkind_data *data;
   int i;
   static const char *kinds[] = {


Jakub



[PING][PATCH] Add instruction level discriminator support.

2022-06-10 Thread Eugene Rozenfeld via Gcc-patches
Hello,

I'd like to ping this patch: 
https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596065.html

Thanks,

Eugene

-Original Message-
From: Gcc-patches  On 
Behalf Of Eugene Rozenfeld via Gcc-patches
Sent: Thursday, June 02, 2022 12:22 AM
To: gcc-patches@gcc.gnu.org; Andi Kleen ; Jan Hubicka 

Subject: [EXTERNAL] [PATCH] Add instruction level discriminator support.

This is the first in a series of patches to enable discriminator support in 
AutoFDO.

This patch switches to tracking discriminators per statement/instruction 
instead of per basic block. Tracking per basic block was problematic since not 
all statements in a basic block needed a discriminator and, also, later 
optimizations could move statements between basic blocks making correlation 
during AutoFDO compilation unreliable. Tracking per statement also allows us to 
assign different discriminators to multiple function calls in the same basic 
block. A subsequent patch will add that support.

The idea of this patch is based on commit 
4c311d95cf6d9519c3c20f641cc77af7df491fdf
by Dehao Chen in vendors/google/heads/gcc-4_8 but uses a slightly different 
approach. In Dehao's work special (normally unused) location ids and side 
tables were used to keep track of locations with discriminators. Things have 
changed since then and I don't think we have unused location ids anymore. 
Instead, I made discriminators a part of ad-hoc locations.

The difference from Dehao's work also includes support for discriminator 
reading/writing in lto streaming and in modules.

Tested on x86_64-pc-linux-gnu.


0001-Add-instruction-level-discriminator-support.patch
Description: 0001-Add-instruction-level-discriminator-support.patch


[PATCH] x86: Require AVX for F16C and VAES

2022-06-10 Thread H.J. Lu via Gcc-patches
Since F16C and VAES are only usable with AVX, require AVX for F16C and
VAES.

OK for master and release branches?

Thanks.

H.J.
---
PR libgcc/105920
* common/config/i386/cpuinfo.h (get_available_features): Require
AVX for F16C and VAES.
---
 gcc/common/config/i386/cpuinfo.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index adc02bc3d98..bbced8a23b9 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -651,8 +651,6 @@ get_available_features (struct __processor_model *cpu_model,
 set_feature (FEATURE_MOVBE);
   if (ecx & bit_AES)
 set_feature (FEATURE_AES);
-  if (ecx & bit_F16C)
-set_feature (FEATURE_F16C);
   if (ecx & bit_RDRND)
 set_feature (FEATURE_RDRND);
   if (ecx & bit_XSAVE)
@@ -663,6 +661,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX);
   if (ecx & bit_FMA)
set_feature (FEATURE_FMA);
+  if (ecx & bit_F16C)
+   set_feature (FEATURE_F16C);
 }
 
   /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
@@ -683,6 +683,8 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX2);
  if (ecx & bit_VPCLMULQDQ)
set_feature (FEATURE_VPCLMULQDQ);
+ if (ecx & bit_VAES)
+   set_feature (FEATURE_VAES);
}
   if (ebx & bit_BMI2)
set_feature (FEATURE_BMI2);
@@ -705,8 +707,6 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_PKU);
   if (ecx & bit_RDPID)
set_feature (FEATURE_RDPID);
-  if (ecx & bit_VAES)
-   set_feature (FEATURE_VAES);
   if (ecx & bit_GFNI)
set_feature (FEATURE_GFNI);
   if (ecx & bit_MOVDIRI)
-- 
2.36.1



Re: [PATCH] Add optional __Bfloat16 support

2022-06-10 Thread H.J. Lu via Gcc-patches
On Fri, Jun 10, 2022 at 7:44 AM H.J. Lu  wrote:
>
> On Fri, Jun 10, 2022 at 2:38 AM Florian Weimer  wrote:
> >
> > * liuhongt via Libc-alpha:
> >
> > > +\subsubsection{Special Types}
> > > +
> > > +The \code{__Bfloat16} type uses an 8-bit exponent and a 7-bit mantissa.
> > > +It is used for \code{BF16}-related intrinsics; it cannot be
>
> Please mention that this is an alternate encoding format for 16-bit floating
> point.  It has the same size and alignment as _Float16.

It also follows the same rules as _Float16 for parameter passing and function
return.

> > > +used with standard C operators.
> >
> > I think it's not necessary to specify whether the type supports certain
> > C operators (surely assignment will work?).  If they are added later,
> > the ABI won't need changing.
> >
>
> If _Bfloat16 becomes a fundamental type, the ABI should be changed to
> move it together with other scalar types.
>
> --
> H.J.



-- 
H.J.


Re: [PATCH V2] Disable generating load/store vector pairs for block copies.

2022-06-10 Thread Segher Boessenkool
Hi!

On Fri, Jun 10, 2022 at 11:27:40AM -0400, Michael Meissner wrote:
> Testing has found that using store vector pair for block copies can result
> in a slow down on power10.  This patch disables using the vector pair
> instructions for block copies if we are tuning for power10.

Load paired should be disabled as well, for the same reason.  The patch
seems to do that fine?  Please fix the commit message.

Thanks,


Segher


> 2022-06-09   Michael Meissner  
> 
> gcc/
>   * config/rs6000/rs6000.cc (rs6000_option_override_internal): Do
>   not generate block copies with vector pair instructions if we are
>   tuning for power10.
> ---
>  gcc/config/rs6000/rs6000.cc | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
> index 0af2085adc0..59481d9ac70 100644
> --- a/gcc/config/rs6000/rs6000.cc
> +++ b/gcc/config/rs6000/rs6000.cc
> @@ -4141,7 +4141,10 @@ rs6000_option_override_internal (bool global_init_p)
>  
>if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
>  {
> -  if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
> +  /* Do not generate lxvp and stxvp on power10 since there are some
> +  performance issues.  */
> +  if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX
> +   && rs6000_tune != PROCESSOR_POWER10)
>   rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
>else
>   rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;


Re: [PATCH] c++: Add support for __real__/__imag__ modifications in constant expressions [PR88174]

2022-06-10 Thread Jason Merrill via Gcc-patches

On 6/9/22 04:37, Jakub Jelinek wrote:

Hi!

We claim we support P0415R1 (constexpr complex), but e.g.
#include 

constexpr bool
foo ()
{
   std::complex a (1.0, 2.0);
   a += 3.0;
   a.real (6.0);
   return a.real () == 6.0 && a.imag () == 2.0;
}

static_assert (foo ());

fails with
test.C:12:20: error: non-constant condition for static assertion
12 | static_assert (foo ());
   |^~
test.C:12:20:   in ‘constexpr’ expansion of ‘foo()’
test.C:8:10:   in ‘constexpr’ expansion of 
‘a.std::complex::real(6.0e+0)’
test.C:12:20: error: modification of ‘__real__ 
a.std::complex::_M_value’ is not a constant expression

The problem is we don't handle REALPART_EXPR and IMAGPART_EXPR
in cxx_eval_store_expression.
The following patch attempts to support it (with a requirement
that those are the outermost expressions, ARRAY_REF/COMPONENT_REF
etc. are just not possible on the result of these, BIT_FIELD_REF
would be theoretically possible if trying to extract some bits
from one part of a complex int, but I don't see how it could appear
in the FE trees.)

For these references, the code handles value being COMPLEX_CST,
COMPLEX_EXPR or CONSTRUCTOR_NO_CLEARING empty CONSTRUCTOR (what we use
to represent uninitialized values for C++20 and later) and the
code starts by rewriting it to COMPLEX_EXPR, so that we can freely
adjust the individual parts and later on possibly optimize it back
to COMPLEX_CST if both halves are constant.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-06-09  Jakub Jelinek  

PR c++/88174
* constexpr.cc (cxx_eval_store_expression): Handle REALPART_EXPR
and IMAGPART_EXPR.

* g++.dg/cpp1y/constexpr-complex1.C: New test.

--- gcc/cp/constexpr.cc.jj  2022-06-08 08:21:02.973448193 +0200
+++ gcc/cp/constexpr.cc 2022-06-08 17:13:04.986040449 +0200
@@ -5707,6 +5707,20 @@ cxx_eval_store_expression (const constex
  }
  break;
  
+	case REALPART_EXPR:

+ gcc_assert (probe == target);


Doesn't this assert mean that complex_expr will always be == valp?


+ vec_safe_push (refs, integer_zero_node);
+ vec_safe_push (refs, TREE_TYPE (probe));
+ probe = TREE_OPERAND (probe, 0);
+ break;
+
+   case IMAGPART_EXPR:
+ gcc_assert (probe == target);
+ vec_safe_push (refs, integer_one_node);
+ vec_safe_push (refs, TREE_TYPE (probe));
+ probe = TREE_OPERAND (probe, 0);
+ break;
+
default:
  if (evaluated)
object = probe;
@@ -5749,6 +5763,8 @@ cxx_eval_store_expression (const constex
auto_vec index_pos_hints;
bool activated_union_member_p = false;
bool empty_base = false;
+  int complex_part = -1;
+  tree *complex_expr = NULL;
while (!refs->is_empty ())
  {
if (*valp == NULL_TREE)
@@ -5785,14 +5801,36 @@ cxx_eval_store_expression (const constex
  *valp = ary_ctor;
}
  
-  /* If the value of object is already zero-initialized, any new ctors for

-subobjects will also be zero-initialized.  */
-  no_zero_init = CONSTRUCTOR_NO_CLEARING (*valp);
-
enum tree_code code = TREE_CODE (type);
tree reftype = refs->pop();
tree index = refs->pop();
  
+  if (code == COMPLEX_TYPE)

+   {
+ if (TREE_CODE (*valp) == COMPLEX_CST)
+   *valp = build2 (COMPLEX_EXPR, type, TREE_REALPART (*valp),
+   TREE_IMAGPART (*valp));
+ else if (TREE_CODE (*valp) == CONSTRUCTOR
+  && CONSTRUCTOR_NELTS (*valp) == 0
+  && CONSTRUCTOR_NO_CLEARING (*valp))
+   {
+ tree r = build_constructor (reftype, NULL);
+ CONSTRUCTOR_NO_CLEARING (r) = 1;
+ *valp = build2 (COMPLEX_EXPR, type, r, r);
+   }
+ gcc_assert (TREE_CODE (*valp) == COMPLEX_EXPR);
+ complex_expr = valp;
+ valp = _OPERAND (*valp, index != integer_zero_node);
+ gcc_checking_assert (refs->is_empty ());
+ type = reftype;
+ complex_part = index != integer_zero_node;
+ break;
+   }
+
+  /* If the value of object is already zero-initialized, any new ctors for
+subobjects will also be zero-initialized.  */
+  no_zero_init = CONSTRUCTOR_NO_CLEARING (*valp);
+
if (code == RECORD_TYPE && is_empty_field (index))
/* Don't build a sub-CONSTRUCTOR for an empty base or field, as they
   have no data and might have an offset lower than previously declared
@@ -5946,6 +5984,24 @@ cxx_eval_store_expression (const constex
= get_or_insert_ctor_field (*valp, indexes[i], index_pos_hints[i]);
  valp = >value;
}
+  if (complex_part != -1)
+   {
+ if (TREE_CODE (*valp) == COMPLEX_CST)
+   *valp = build2 (COMPLEX_EXPR, TREE_TYPE (*valp),
+   TREE_REALPART (*valp),
+   TREE_IMAGPART (*valp));
+ 

Re: [PATCH] c++: optimize specialization of nested class templates

2022-06-10 Thread Jason Merrill via Gcc-patches

On 6/10/22 12:00, Patrick Palka wrote:

On Fri, 10 Jun 2022, Patrick Palka wrote:


On Thu, 9 Jun 2022, Patrick Palka wrote:


On Thu, 9 Jun 2022, Jason Merrill wrote:


On 6/8/22 14:21, Patrick Palka wrote:

When substituting a class template specialization, tsubst_aggr_type
substitutes the TYPE_CONTEXT before passing it to lookup_template_class.
This appears to be unnecessary, however, because the initial value
of lookup_template_class's context parameter is unused outside of the
IDENTIFIER_NODE case, and l_t_c performs its own substitution of the
context, anyway.  So this patch removes the redundant substitution in
tsubst_aggr_type.  Doing so causes us to ICE on template/nested5.C
because during lookup_template_class for A::C::D with T=E and S=S,
we substitute and complete the context A::C with T=E, which in turn
registers the desired dependent specialization of D for us and we end up
trying to register it again.  This patch fixes this by checking the
specializations table again after completion of the context.

This patch also implements a couple of other optimizations:

* In lookup_template_class, if the context of the partially
  instantiated template is already non-dependent, then we could
  reuse that instead of substituting the context of the most
  general template.
* When substituting the TYPE_DECL for an injected-class-name
  in tsubst_decl, we can avoid substituting its TREE_TYPE and
  DECL_TI_ARGS.

Together these optimizations improve memory usage for the range-v3
testcase test/view/split.cc by about 5%.  The improvement is probably
more significant when dealing with deeply nested class templates.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk?

gcc/cp/ChangeLog:

* pt.cc (lookup_template_class): Remove dead stores to
context parameter.  Don't substitute the context of the
most general template if that of the partially instantiated
template is non-dependent.  Check the specializations table
again after completing the context of a nested dependent
specialization.
(tsubst_aggr_type) : Don't substitute
TYPE_CONTEXT or pass it to lookup_template_class.
(tsubst_decl) : Avoid substituting the
TREE_TYPE and DECL_TI_ARGS when DECL_SELF_REFERENCE_P.
---
   gcc/cp/pt.cc | 69 +++-
   1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 59b94317e88..28023d60684 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -9840,8 +9840,6 @@ lookup_template_class (tree d1, tree arglist, tree
in_decl, tree context,
  if (context)
pop_decl_namespace ();
}
-  if (templ)
-   context = DECL_CONTEXT (templ);
   }
 else if (TREE_CODE (d1) == TYPE_DECL && MAYBE_CLASS_TYPE_P (TREE_TYPE
(d1)))
   {
@@ -9868,7 +9866,6 @@ lookup_template_class (tree d1, tree arglist, tree
in_decl, tree context,
   {
 templ = d1;
 d1 = DECL_NAME (templ);
-  context = DECL_CONTEXT (templ);
   }
 else if (DECL_TEMPLATE_TEMPLATE_PARM_P (d1))
   {
@@ -10059,8 +10056,25 @@ lookup_template_class (tree d1, tree arglist, tree
in_decl, tree context,
 context = DECL_CONTEXT (gen_tmpl);
 if (context && TYPE_P (context))
{
- context = tsubst_aggr_type (context, arglist, complain, in_decl,
true);
- context = complete_type (context);
+ if (!uses_template_parms (DECL_CONTEXT (templ)))
+   /* If the context of the partially instantiated template is
+  already non-dependent, then we might as well use it.  */
+   context = DECL_CONTEXT (templ);
+ else
+   {
+ context = tsubst_aggr_type (context, arglist, complain, in_decl,
true);
+ context = complete_type (context);
+ if (is_dependent_type && arg_depth > 1)
+   {
+ /* If this is a dependent nested specialization such as
+A::B, then completion of A might have
+registered this specialization of B for us, so check
+the table again (33959).  */
+ entry = type_specializations->find_with_hash (, hash);
+ if (entry)
+   return entry->spec;
+   }
+   }
}
 else
context = tsubst (context, arglist, complain, in_decl);
@@ -13711,25 +13725,12 @@ tsubst_aggr_type (tree t,
 if (TYPE_TEMPLATE_INFO (t) && uses_template_parms (t))
{
  tree argvec;
- tree context;
  tree r;
  /* In "sizeof(X)" we need to evaluate "I".  */
  cp_evaluated ev;
   -  /* First, determine the context for the type we are looking
-up.  */
- context = TYPE_CONTEXT (t);
- if (context && TYPE_P (context))
-   {
- context = 

c++: Adjust module initializer calling emission

2022-06-10 Thread Nathan Sidwell via Gcc-patches


We special-case emitting the calls of module initializer functions. 
It's simpler to just emit a static fn to do that, and add it onto the 
front of the global init fn chain.  We can also move the calculation of 
the set of initializers to call to the point of use.


nathan

--
Nathan SidwellFrom 8834d2d35fcc229c00e2e06e8be8b052c803d8cd Mon Sep 17 00:00:00 2001
From: Nathan Sidwell 
Date: Fri, 10 Jun 2022 05:22:21 -0700
Subject: [PATCH] c++: Adjust module initializer calling emission

We special-case emitting the calls of module initializer functions.  It's
simpler to just emit a static fn to do that, and add it onto the front of
the global init fn chain.  We can also move the calculation of the set of
initializers to call to the point of use.

	gcc/cp/
	* cp-tree.h (module_has_import_inits): Rename to ...
	(module_determine_import_inits): ... here.
	* decl2.cc (start_objects): Do not handle module initializers
	here.
	(c_parse_final_cleanups): Generate a separate module
	initializer calling function and add it to the list.  Shrink
	the c-lang region.
	* module.cc (num_init_calls_needed): Delete.
	 (module_has_import_inits): Rename to ...
	(module_determine_import_inits): ... here. Do the
	calculation here ...
	(finish_module_processing): ... rather than here.
	(module_add_import_initializers): Reformat.

	gcc/testsuite/
	* g++.dg/modules/init-3_a.C: New.
	* g++.dg/modules/init-3_b.C: New.
	* g++.dg/modules/init-3_c.C: New.
---
 gcc/cp/cp-tree.h|   2 +-
 gcc/cp/decl2.cc |  47 +-
 gcc/cp/module.cc| 110 +++-
 gcc/testsuite/g++.dg/modules/init-3_a.C |  17 
 gcc/testsuite/g++.dg/modules/init-3_b.C |   6 ++
 gcc/testsuite/g++.dg/modules/init-3_c.C |  17 
 6 files changed, 117 insertions(+), 82 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/modules/init-3_a.C
 create mode 100644 gcc/testsuite/g++.dg/modules/init-3_b.C
 create mode 100644 gcc/testsuite/g++.dg/modules/init-3_c.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index a5d93282167..f1294dac7d5 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -7180,7 +7180,7 @@ extern module_state *get_module (tree name, module_state *parent = NULL,
 extern bool module_may_redeclare (tree decl);
 
 extern bool module_global_init_needed ();
-extern bool module_has_import_inits ();
+extern bool module_determine_import_inits ();
 extern void module_add_import_initializers ();
 
 /* Where the namespace-scope decl was originally declared.  */
diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index 9de9a7a4f8a..ff1c36745cf 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -3903,8 +3903,7 @@ start_objects (bool initp, unsigned priority, bool has_body)
 
   tree body = begin_compound_stmt (BCS_FN_BODY);
 
-  bool has_import_inits = default_init && module_has_import_inits ();
-  if (is_module_init && (has_import_inits || has_body))
+  if (is_module_init && has_body)
 {
   // If the function is going to be empty, don't emit idempotency.
   // 'static bool __in_chrg = false;
@@ -3930,9 +3929,6 @@ start_objects (bool initp, unsigned priority, bool has_body)
   finish_expr_stmt (assign);
 }
 
-  if (has_import_inits)
-module_add_import_initializers ();
-
   return body;
 }
 
@@ -5195,6 +5191,12 @@ c_parse_final_cleanups (void)
 
   maybe_warn_sized_delete ();
 
+  // Place the init fns in the right order.  We need to do this now,
+  // so that any module init will go at the start.
+  if (static_init_fini_fns[true])
+for (auto iter : *static_init_fini_fns[true])
+  iter.second = nreverse (iter.second);
+  
   /* Then, do the Objective-C stuff.  This is where all the
  Objective-C module stuff gets generated (symtab,
  class/protocol/selector lists etc).  This must be done after C++
@@ -5203,11 +5205,18 @@ c_parse_final_cleanups (void)
   if (c_dialect_objc ())
 objc_write_global_declarations ();
 
-  /* We give C linkage to static constructors and destructors.  */
-  push_lang_context (lang_name_c);
+  if (module_determine_import_inits ())
+{
+  input_location = locus_at_end_of_parsing;
+  tree body = start_partial_init_fini_fn (true, DEFAULT_INIT_PRIORITY,
+	  ssdf_count++);
+  module_add_import_initializers ();
+  input_location = locus_at_end_of_parsing;
+  finish_partial_init_fini_fn (body);
+}
 
   if ((c_dialect_objc () && objc_static_init_needed_p ())
-  || module_global_init_needed () || module_has_import_inits ())
+  || module_global_init_needed ())
 {
   // Make sure there's a default priority entry.
   if (!static_init_fini_fns[true])
@@ -5216,32 +5225,24 @@ c_parse_final_cleanups (void)
 } 
 
   /* Generate initialization and destruction functions for all
- priorities for which they are required.  */
+ priorities for which they are required.  They have C-language
+ linkage.  */
+  push_lang_context (lang_name_c);
   for (unsigned 

Re: [PATCH 2/1] c++: optimize specialization of templated member functions

2022-06-10 Thread Jason Merrill via Gcc-patches

On 6/9/22 15:37, Patrick Palka wrote:

On Thu, 9 Jun 2022, Jason Merrill wrote:


On 6/9/22 09:00, Patrick Palka wrote:

This performs one of the optimizations added by the previous
patch to lookup_template_class, to instantiate_template as well.
(For the libstdc++ ranges tests this optimization appears to be
effective around 30% of the time, i.e. 30% of the time context of 'tmpl'
is non-dependent while the context of 'gen_tmpl' is dependent.)


If this is a significant optimization, how about doing it in tsubst_aggr_type
rather than its callers?


I'm not sure how we'd do this optimization in tsubst_aggr_type?


Oops, I was overlooking the gen_tmpl vs. tmpl difference.


I haven't observed any significant time/memory improvements based on my
limited benchmarking, but I can imagine for deeply nested templates it
could be significant.  And avoiding redundant work should hopefully help
streamline debugging I suppose.


OK.




gcc/cp/ChangeLog:

* pt.cc (instantiate_template): Don't substitute the context
of the most general template if that of the partially
instantiated template is non-dependent.
---
   gcc/cp/pt.cc | 10 --
   1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index e021c254872..208daad298a 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -21661,8 +21661,14 @@ instantiate_template (tree tmpl, tree orig_args,
tsubst_flags_t complain)
   ++processing_template_decl;
 if (DECL_CLASS_SCOPE_P (gen_tmpl))
   {
-  tree ctx = tsubst_aggr_type (DECL_CONTEXT (gen_tmpl), targ_ptr,
-  complain, gen_tmpl, true);
+  tree ctx;
+  if (!uses_template_parms (DECL_CONTEXT (tmpl)))
+   /* If the context of the partially instantiated template is already
+  non-dependent, then we might as well use it.  */
+   ctx = DECL_CONTEXT (tmpl);
+  else
+   ctx = tsubst_aggr_type (DECL_CONTEXT (gen_tmpl), targ_ptr,
+   complain, gen_tmpl, true);
 push_nested_class (ctx);
   }
   









Re: [PATCH] c++: improve TYPENAME_TYPE hashing [PR65328]

2022-06-10 Thread Jason Merrill via Gcc-patches

On 6/10/22 09:40, Patrick Palka wrote:

The reason compiling the testcase in this PR is so slow is ultimately
due to our poor hashing of TYPENAME_TYPE causing a huge amount of hash
table collisions in the spec_hasher and typename_hasher tables.

In spec_hasher, we don't hash the components of a TYPENAME_TYPE at all,
presumably because TYPENAME_TYPE equivalence as determined by
structural_comptypes depends on whether the comparing_specializations
flag is set.  This patch fixes this by setting comparing_specializations
from spec_hasher::hash, and making iterative_hash_template_arg hash the
relevant components of a TYPENAME_TYPE consistently when this flag is
set.

And in typename_hasher, the hash function doesn't consider the
TYPENAME_TYPE_FULLNAME, which this patch fixes accordingly.

After this patch, compile time for the testcase in the PR is around
34 seconds (10% faster than Clang).

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK
for trunk?

PR c++/65328

gcc/cp/ChangeLog:

* decl.cc (typename_hasher::hash): Add extra overloads.
Use iterative_hash_object instead of htab_hash_pointer.
Hash the TYPENAME_TYPE_FULLNAME too.
(build_typename_type): Use typename_hasher::hash.
* pt.cc (spec_hasher::hash): Add two-parameter overload.
Set comparing_specializations around the call to
hash_tmpl_and_args.
(iterative_hash_template_arg) :
When comparing_specializations, hash the TYPE_CONTEXT
and TYPENAME_TYPE_FULLNAME.
(tsubst_function_decl): Use spec_hasher::hash instead of
hash_tmpl_and_args.
(tsubst_template_decl): Likewise.
(tsubst_decl): Likewise.
---
  gcc/cp/decl.cc | 26 +++---
  gcc/cp/pt.cc   | 28 
  2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 7f3b3c3c588..b7f624ca50b 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -4007,14 +4007,27 @@ struct typename_hasher : ggc_ptr_hash
/* Hash a TYPENAME_TYPE.  */
  
static hashval_t

-  hash (tree t)
+  hash (tree context, tree name, tree fullname)
{
-hashval_t hash;
+hashval_t hash = 0;
+hash = iterative_hash_object (context, hash);
+hash = iterative_hash_object (name, hash);


I'd think we could omit considering 'name', since fullname is either the 
same as name or a wrapper for it?



+hash = iterative_hash_object (fullname, hash);
+return hash;
+  }
  
-hash = (htab_hash_pointer (TYPE_CONTEXT (t))

-   ^ htab_hash_pointer (TYPE_IDENTIFIER (t)));
+  static hashval_t
+  hash (const typename_info *ti)
+  {
+return typename_hasher::hash (ti->scope, ti->name, ti->template_id);
+  }
  
-return hash;

+  static hashval_t
+  hash (tree t)
+  {
+return typename_hasher::hash (TYPE_CONTEXT (t),
+ TYPE_IDENTIFIER (t),
+ TYPENAME_TYPE_FULLNAME (t));
}
  
/* Compare two TYPENAME_TYPEs.  */

@@ -4053,8 +4066,7 @@ build_typename_type (tree context, tree name, tree 
fullname,
ti.class_p = (tag_type == class_type
|| tag_type == record_type
|| tag_type == union_type);
-  hashval_t hash =  (htab_hash_pointer (ti.scope)
-^ htab_hash_pointer (ti.name));
+  hashval_t hash = typename_hasher::hash ();
  
/* See if we already have this type.  */

tree *e = typename_htab->find_slot_with_hash (, hash, INSERT);
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 55129cf6f2c..381fc337cb0 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -107,6 +107,7 @@ static bool excessive_deduction_depth;
  struct spec_hasher : ggc_ptr_hash
  {
static hashval_t hash (spec_entry *);
+  static hashval_t hash (tree, tree);
static bool equal (spec_entry *, spec_entry *);
  };
  
@@ -1768,13 +1769,22 @@ hash_tmpl_and_args (tree tmpl, tree args)

return iterative_hash_template_arg (args, val);
  }
  
+hashval_t

+spec_hasher::hash (tree tmpl, tree args)
+{
+  ++comparing_specializations;
+  hashval_t val = hash_tmpl_and_args (tmpl, args);
+  --comparing_specializations;
+  return val;
+}
+
  /* Returns a hash for a spec_entry node based on the TMPL and ARGS members,
 ignoring SPEC.  */
  
  hashval_t

  spec_hasher::hash (spec_entry *e)
  {
-  return hash_tmpl_and_args (e->tmpl, e->args);
+  return spec_hasher::hash (e->tmpl, e->args);
  }
  
  /* Recursively calculate a hash value for a template argument ARG, for use

@@ -1960,6 +1970,16 @@ iterative_hash_template_arg (tree arg, hashval_t val)
  val = iterative_hash_template_arg (DECLTYPE_TYPE_EXPR (arg), val);
  break;
  
+	case TYPENAME_TYPE:

+ if (comparing_specializations)


Please add a comment that this is to match structural_comptypes.

OK with these changes.


+   {
+ tree context = TYPE_MAIN_VARIANT (TYPE_CONTEXT (arg));
+ tree fullname = 

Re: [PATCH] c++: optimize specialization of nested class templates

2022-06-10 Thread Patrick Palka via Gcc-patches
On Fri, 10 Jun 2022, Patrick Palka wrote:

> On Thu, 9 Jun 2022, Patrick Palka wrote:
> 
> > On Thu, 9 Jun 2022, Jason Merrill wrote:
> > 
> > > On 6/8/22 14:21, Patrick Palka wrote:
> > > > When substituting a class template specialization, tsubst_aggr_type
> > > > substitutes the TYPE_CONTEXT before passing it to lookup_template_class.
> > > > This appears to be unnecessary, however, because the initial value
> > > > of lookup_template_class's context parameter is unused outside of the
> > > > IDENTIFIER_NODE case, and l_t_c performs its own substitution of the
> > > > context, anyway.  So this patch removes the redundant substitution in
> > > > tsubst_aggr_type.  Doing so causes us to ICE on template/nested5.C
> > > > because during lookup_template_class for A::C::D with T=E and S=S,
> > > > we substitute and complete the context A::C with T=E, which in turn
> > > > registers the desired dependent specialization of D for us and we end up
> > > > trying to register it again.  This patch fixes this by checking the
> > > > specializations table again after completion of the context.
> > > > 
> > > > This patch also implements a couple of other optimizations:
> > > > 
> > > >* In lookup_template_class, if the context of the partially
> > > >  instantiated template is already non-dependent, then we could
> > > >  reuse that instead of substituting the context of the most
> > > >  general template.
> > > >* When substituting the TYPE_DECL for an injected-class-name
> > > >  in tsubst_decl, we can avoid substituting its TREE_TYPE and
> > > >  DECL_TI_ARGS.
> > > > 
> > > > Together these optimizations improve memory usage for the range-v3
> > > > testcase test/view/split.cc by about 5%.  The improvement is probably
> > > > more significant when dealing with deeply nested class templates.
> > > > 
> > > > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> > > > trunk?
> > > > 
> > > > gcc/cp/ChangeLog:
> > > > 
> > > > * pt.cc (lookup_template_class): Remove dead stores to
> > > > context parameter.  Don't substitute the context of the
> > > > most general template if that of the partially instantiated
> > > > template is non-dependent.  Check the specializations table
> > > > again after completing the context of a nested dependent
> > > > specialization.
> > > > (tsubst_aggr_type) : Don't substitute
> > > > TYPE_CONTEXT or pass it to lookup_template_class.
> > > > (tsubst_decl) : Avoid substituting the
> > > > TREE_TYPE and DECL_TI_ARGS when DECL_SELF_REFERENCE_P.
> > > > ---
> > > >   gcc/cp/pt.cc | 69 +++-
> > > >   1 file changed, 41 insertions(+), 28 deletions(-)
> > > > 
> > > > diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
> > > > index 59b94317e88..28023d60684 100644
> > > > --- a/gcc/cp/pt.cc
> > > > +++ b/gcc/cp/pt.cc
> > > > @@ -9840,8 +9840,6 @@ lookup_template_class (tree d1, tree arglist, tree
> > > > in_decl, tree context,
> > > >   if (context)
> > > > pop_decl_namespace ();
> > > > }
> > > > -  if (templ)
> > > > -   context = DECL_CONTEXT (templ);
> > > >   }
> > > > else if (TREE_CODE (d1) == TYPE_DECL && MAYBE_CLASS_TYPE_P 
> > > > (TREE_TYPE
> > > > (d1)))
> > > >   {
> > > > @@ -9868,7 +9866,6 @@ lookup_template_class (tree d1, tree arglist, tree
> > > > in_decl, tree context,
> > > >   {
> > > > templ = d1;
> > > > d1 = DECL_NAME (templ);
> > > > -  context = DECL_CONTEXT (templ);
> > > >   }
> > > > else if (DECL_TEMPLATE_TEMPLATE_PARM_P (d1))
> > > >   {
> > > > @@ -10059,8 +10056,25 @@ lookup_template_class (tree d1, tree arglist, 
> > > > tree
> > > > in_decl, tree context,
> > > > context = DECL_CONTEXT (gen_tmpl);
> > > > if (context && TYPE_P (context))
> > > > {
> > > > - context = tsubst_aggr_type (context, arglist, complain, 
> > > > in_decl,
> > > > true);
> > > > - context = complete_type (context);
> > > > + if (!uses_template_parms (DECL_CONTEXT (templ)))
> > > > +   /* If the context of the partially instantiated template is
> > > > +  already non-dependent, then we might as well use it.  */
> > > > +   context = DECL_CONTEXT (templ);
> > > > + else
> > > > +   {
> > > > + context = tsubst_aggr_type (context, arglist, complain, 
> > > > in_decl,
> > > > true);
> > > > + context = complete_type (context);
> > > > + if (is_dependent_type && arg_depth > 1)
> > > > +   {
> > > > + /* If this is a dependent nested specialization such 
> > > > as
> > > > +A::B, then completion of A might have
> > > > +registered this specialization of B for us, so 
> > > > check
> > > > +the table again (33959).  */
> > > > 

[PATCH] libgompd: Fix sizes in OMPD support and add local ICVs functions.

2022-06-10 Thread Mohamed Atef via Gcc-patches
libgomp/ChangeLog

2022-06-10  Mohamed Atef  

* ompd-helper.h (DEREFERENCE, ACCESS_VALUE): New macros.
* ompd-helper.c (gompd_get_nthread, gompd_get_thread_limit,
gomp_get_run_shed, gompd_get_run_sched_chunk_size,
gompd_get_default_device, gompd_get_dynamic,
gompd_get_max_active_levels, gompd_get_proc_bind,
gompd_is_final, gompd_is_implicit, gompd_get_team_size): defined.
* ompd-icv.c (ompd_get_icv_from_scope): call the previous functions,
thread_handle, task_handle and parallel handle: New variable.
Fix format in ashandle definition.
* ompd-init.c: call GET_VALUE with sizeof_short for gompd_state.
* ompd-support.h (gompd_state): size of short instead of long.
(GOMPD_FOREACH_ACCESS): Add
gompd_access (gomp_task, kind)
gompd_access (gomp_task, final_task)
gompd_access (gomp_team, nthreads)
* ompd-support.c: Define
gompd_get_offset
gompd_get_sizeof_member
gompd_get_size.
(gompd_load): Remove gompd_init_access,
gompd_init_sizeof_members, gompd_init_sizes
define gompd_access_gomp_thread_handle with __UINT16_TYPE__.
diff --git a/libgomp/ompd-helper.c b/libgomp/ompd-helper.c
index a488ba7df2e..5a79ef9581d 100644
--- a/libgomp/ompd-helper.c
+++ b/libgomp/ompd-helper.c
@@ -256,6 +256,350 @@ gompd_stringize_gompd_enabled 
(ompd_address_space_handle_t *ah,
 
 /* End of global ICVs functions.  */
 
+/* Get per thread ICVs.  */
+
+ompd_rc_t
+gompd_get_nthread (ompd_thread_handle_t *thread_handle,
+   ompd_word_t *nthreads_var)
+{
+  /* gomp_thread->task->gomp_task_icv.nthreads_var.  */
+  if (thread_handle == NULL)
+return ompd_rc_stale_handle;
+  if (nthreads_var == NULL)
+return ompd_rc_bad_input;
+  CHECK (thread_handle->ah);
+
+  ompd_word_t res = 0;
+  ompd_address_t symbol_addr = thread_handle->th;
+  ompd_word_t temp_offset;
+  ompd_address_t temp_sym_addr;
+  ompd_addr_t temp_addr;
+  ompd_address_space_context_t *context = thread_handle->ah->context;
+  ompd_thread_context_t *t_context = thread_handle->thread_context; 
+  ompd_rc_t ret;
+  /* gomp_thread->task.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task",
+temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr);
+  /* gomp_thread->task->task_icv.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv", temp_offset,
+1, ret, symbol_addr, temp_sym_addr, temp_addr);
+  /* gomp_thread->task->task_icv.nthreads_var.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv_nthreads_var",
+temp_offset, 0, ret, symbol_addr, temp_sym_addr, temp_addr);
+  DEREFERENCE (context, t_context, symbol_addr, target_sizes.sizeof_long_long,
+   1, res, ret, 0);
+  *nthreads_var = res;
+  return ompd_rc_ok;
+}
+
+ompd_rc_t
+gompd_get_default_device (ompd_thread_handle_t *thread_handle,
+  ompd_word_t *defalut_device_var)
+{
+  /* gomp_thread->task->gomp_task_icv.default_device_var.  */
+  if (thread_handle == NULL)
+return ompd_rc_stale_handle;
+  if (defalut_device_var == NULL)
+return ompd_rc_bad_input;
+  CHECK (thread_handle->ah);
+
+  ompd_word_t res = 0;
+  ompd_address_t symbol_addr = thread_handle->th;
+  ompd_word_t temp_offset;
+  ompd_address_t temp_sym_addr;
+  ompd_addr_t temp_addr;
+  ompd_address_space_context_t *context = thread_handle->ah->context;
+  ompd_thread_context_t *t_context = thread_handle->thread_context; 
+  ompd_rc_t ret;
+  /* gomp_thread->task.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task",
+temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr);
+  /* gomp_thread->task->task_icv.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv", temp_offset,
+1, ret, symbol_addr, temp_sym_addr, temp_addr);
+  /* gomp_thread->task->task_icv.default_device_var.  */
+  ACCESS_VALUE (context, t_context,
+"gompd_access_gomp_task_icv_default_device_var", temp_offset, 
0,
+ret, symbol_addr, temp_sym_addr, temp_addr);
+  DEREFERENCE (context, t_context, symbol_addr, target_sizes.sizeof_int, 1,
+   res, ret, 0);
+  *defalut_device_var = res;
+  return ompd_rc_ok;
+}
+
+ompd_rc_t
+gompd_get_dynamic (ompd_thread_handle_t *thread_handle, ompd_word_t *dyn_var)
+{
+  /* gomp_thread->task->gomp_task_icv.dyn_var.  */
+  if (thread_handle == NULL)
+return ompd_rc_stale_handle;
+  if (dyn_var == NULL)
+return ompd_rc_bad_input;
+  CHECK (thread_handle->ah);
+
+  ompd_word_t res = 0;
+  ompd_address_t symbol_addr = thread_handle->th;
+  ompd_word_t temp_offset;
+  ompd_address_t temp_sym_addr;
+  ompd_addr_t temp_addr;
+  ompd_address_space_context_t *context = thread_handle->ah->context;
+  ompd_thread_context_t *t_context = thread_handle->thread_context; 
+  ompd_rc_t ret;
+  /* gomp_thread->task.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task",
+temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr);
+  /* 

Re: [PATCH] regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041]

2022-06-10 Thread Segher Boessenkool
Hi!

On Fri, Jun 10, 2022 at 07:52:57PM +0530, Surya Kumari Jangala wrote:
> In check_new_reg_p, the nregs of a du chain is computed by obtaining the MODE
> of the first element in the chain, and then calling hard_regno_nregs() with 
> the
> MODE. But the first element of the chain can be a DEBUG_INSN whose mode need
> not be the same as the rest of the elements in the du chain. This
> was resulting in fcompare-debug failure as check_new_reg_p was returning a
> different result with -g for the same candidate register. We can instead 
> obtain
> nregs from the du chain itself.

Great, thanks for finding and fixing this!  I cannot approve it, you'll
have to wait for someone who can.  It looks fine to me, but that does
not mean so much in regrename.c :-)

> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr105041.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */

Please delete this line, it is the default.

> +/* { dg-require-effective-target be } */

Is there a reason to not test this on LE?  If not, please remove this
line as well.

> +/* { dg-options "-m32 -mdejagnu-cpu=power4 -O2 -fcompare-debug 
> -fharden-compares -frename-registers" } */

Aha.  You check for LE because you use -m32 in the test?  Don't, then!
Instead, test with -m32 in your RUNTESTFLAGS, like
  make check-gcc-c RUNTESTFLAGS="--target_board=unix'{-m64,-m32}' 
powerpc.exp=pr105041.c"
or similar.

It's a good idea to add a comment a la
/* PR rtl-optimization/105041: This test failed with -m32.  */

Thanks again for the patch!


Segher


[PATCH v2 4/4] xtensa: Improve constant synthesis for both integer and floating-point

2022-06-10 Thread Takayuki 'January June' Suwa via Gcc-patches

This patch revises the previous implementation of constant synthesis.

First, changed to use define_split machine description pattern and to run
after reload pass, in order not to interfere some optimizations such as
the loop invariant motion.

Second, not only integer but floating-point is subject to processing.

Third, several new synthesis patterns - when the constant cannot fit into
a "MOVI Ax, simm12" instruction, but:

I.   can be represented as a power of two minus one (eg. 32767, 65535 or
 0x7fffUL)
   => "MOVI(.N) Ax, -1" + "SRLI Ax, Ax, 1 ... 31" (or "EXTUI")
II.  is between -34816 and 34559
   => "MOVI(.N) Ax, -2048 ... 2047" + "ADDMI Ax, Ax, -32768 ... 32512"
III. (existing case) can fit into a signed 12-bit if the trailing zero bits
 are stripped
   => "MOVI(.N) Ax, -2048 ... 2047" + "SLLI Ax, Ax, 1 ... 31"

The above sequences consist of 5 or 6 bytes and have latency of 2 clock 
cycles,
in contrast with "L32R Ax, " (3 bytes and one clock latency, 
but may

suffer additional one clock pipeline stall and implementation-specific
InstRAM/ROM access penalty) plus 4 bytes of constant value.

In addition, 3-instructions synthesis patterns (8 or 9 bytes, 3 clock 
latency)

are also provided when optimizing for speed and L32R instruction has
considerable access penalty:

IV.  2-instructions synthesis (any of I ... III) followed by
 "SLLI Ax, Ax, 1 ... 31"
V.   2-instructions synthesis followed by either "ADDX[248] Ax, Ax, Ax"
 or "SUBX8 Ax, Ax, Ax" (multiplying by 3, 5, 7 or 9)

gcc/ChangeLog:

* config/xtensa/xtensa-protos.h (xtensa_constantsynth):
New prototype.
* config/xtensa/xtensa.cc (xtensa_emit_constantsynth,
xtensa_constantsynth_2insn, xtensa_constantsynth_rtx_SLLI,
xtensa_constantsynth_rtx_ADDSUBX, xtensa_constantsynth):
New backend functions that process the abovementioned logic.
(xtensa_emit_move_sequence): Revert the previous changes.
* config/xtensa/xtensa.md: New split patterns for integer
and floating-point, as the frontend part.

gcc/testsuite/ChangeLog:

* gcc.target/xtensa/constsynth_2insns.c: New.
* gcc.target/xtensa/constsynth_3insns.c: Ditto.
* gcc.target/xtensa/constsynth_double.c: Ditto.
---
 gcc/config/xtensa/xtensa-protos.h |   1 +
 gcc/config/xtensa/xtensa.cc   | 133 +++---
 gcc/config/xtensa/xtensa.md   |  50 +++
 .../gcc.target/xtensa/constsynth_2insns.c |  44 ++
 .../gcc.target/xtensa/constsynth_3insns.c |  24 
 .../gcc.target/xtensa/constsynth_double.c |  11 ++
 6 files changed, 247 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/xtensa/constsynth_2insns.c
 create mode 100644 gcc/testsuite/gcc.target/xtensa/constsynth_3insns.c
 create mode 100644 gcc/testsuite/gcc.target/xtensa/constsynth_double.c

diff --git a/gcc/config/xtensa/xtensa-protos.h 
b/gcc/config/xtensa/xtensa-protos.h

index 30e4b54394a..c2fd750cd3a 100644
--- a/gcc/config/xtensa/xtensa-protos.h
+++ b/gcc/config/xtensa/xtensa-protos.h
@@ -44,6 +44,7 @@ extern int xtensa_expand_block_move (rtx *);
 extern int xtensa_expand_block_set_unrolled_loop (rtx *);
 extern int xtensa_expand_block_set_small_loop (rtx *);
 extern void xtensa_split_operand_pair (rtx *, machine_mode);
+extern int xtensa_constantsynth (rtx, HOST_WIDE_INT);
 extern int xtensa_emit_move_sequence (rtx *, machine_mode);
 extern rtx xtensa_copy_incoming_a7 (rtx);
 extern void xtensa_expand_nonlocal_goto (rtx *);
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 1769e43c7b5..b48ac5063c0 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -1037,6 +1037,123 @@ xtensa_split_operand_pair (rtx operands[4], 
machine_mode mode)

 }


+/* Try to emit insns to load srcval (that cannot fit into signed 12-bit)
+   into dst with synthesizing a such constant value from a sequence of
+   load-immediate / arithmetic ones, instead of a L32R instruction
+   (plus a constant in litpool).  */
+
+static void
+xtensa_emit_constantsynth (rtx dst, enum rtx_code code,
+  HOST_WIDE_INT imm0, HOST_WIDE_INT imm1,
+  rtx (*gen_op)(rtx, HOST_WIDE_INT),
+  HOST_WIDE_INT imm2)
+{
+  gcc_assert (REG_P (dst));
+  emit_move_insn (dst, GEN_INT (imm0));
+  emit_move_insn (dst, gen_rtx_fmt_ee (code, SImode,
+  dst, GEN_INT (imm1)));
+  if (gen_op)
+emit_move_insn (dst, gen_op (dst, imm2));
+}
+
+static int
+xtensa_constantsynth_2insn (rtx dst, HOST_WIDE_INT srcval,
+   rtx (*gen_op)(rtx, HOST_WIDE_INT),
+   HOST_WIDE_INT op_imm)
+{
+  int shift = exact_log2 (srcval + 1);
+
+  if (IN_RANGE (shift, 1, 31))
+{
+  xtensa_emit_constantsynth (dst, LSHIFTRT, -1, 32 - shift,
+gen_op, op_imm);
+  return 1;

[PATCH V2] Disable generating load/store vector pairs for block copies.

2022-06-10 Thread Michael Meissner via Gcc-patches
[PATCH, V2] Disable generating load/store vector pairs for block copies.

Testing has found that using store vector pair for block copies can result
in a slow down on power10.  This patch disables using the vector pair
instructions for block copies if we are tuning for power10.

This is version 2 of the patch.

| Date: Mon, 6 Jun 2022 20:55:55 -0400
| Subject: [PATCH 2/3] Disable generating load/store vector pairs for block 
copies.
| Message-ID: 
| https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596275.html

Compared to version 1, this patch is a stand-alone patch, and it doesn't depend
on a new switch (-mno-store-vector-pair).  Instead, this patch just sets the
default for -mblock-ops-vector-pair to be off if the current cpu being tuned
for is power10.  It would be anticipated that it would automatically be enabled
when tuning for a future cpu.

I have tested this patch on:

little endian power10 using --with-cpu=power10
little endian power9 using --with-cpu=power9
big endian power8 using --with-cpu=power8, both 32/64-bit tested

there were no regressions.  Can I apply this to the master branch, and then
apply it to the GCC 12 branch after a burn-in period?


2022-06-09   Michael Meissner  

gcc/
* config/rs6000/rs6000.cc (rs6000_option_override_internal): Do
not generate block copies with vector pair instructions if we are
tuning for power10.
---
 gcc/config/rs6000/rs6000.cc | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 0af2085adc0..59481d9ac70 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -4141,7 +4141,10 @@ rs6000_option_override_internal (bool global_init_p)
 
   if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR))
 {
-  if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX)
+  /* Do not generate lxvp and stxvp on power10 since there are some
+performance issues.  */
+  if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX
+ && rs6000_tune != PROCESSOR_POWER10)
rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
   else
rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
-- 
2.35.3


-- 
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: meiss...@linux.ibm.com


Re: [PATCH] Darwin: Future-proof -mmacosx-version-min

2022-06-10 Thread Iain Sandoe
Hi Mark,

> On 10 Jun 2022, at 15:56, Mark Mentovai  wrote:
> 
> f18cbc1ee1f4 (2021-12-18) updated various parts of gcc to not impose a
> Darwin or macOS version maximum of the current known release. Different
> parts of gcc accept, variously, Darwin version numbers matching
> darwin2*, and macOS major version numbers up to 99. The current released
> version is Darwin 21 and macOS 12, with Darwin 22 and macOS 13 expected
> for public release later this year. With one major OS release per year,
> this strategy is expected to provide another 8 years of headroom.
> 
> However, f18cbc1ee1f4 missed config/darwin-c.c (now .cc), which
> continued to impose a maximum of macOS 12 on the -mmacosx-version-min
> compiler driver argument. This was last updated from 11 to 12 in
> 11b967577483 (2021-10-27), but kicking the can down the road one year at
> a time is not a viable strategy, and is not in line with the more recent
> technique from f18cbc1ee1f4.
> 
> Prior to 556ab5125912 (2020-11-06), config/darwin-c.c did not impose a
> maximum that needed annual maintenance, as at that point, all macOS
> releases had used a major version of 10. The stricter approach imposed
> since then was valuable for a time until the particulars of the new
> versioning scheme were established and understood, but now that they
> are, it's prudent to restore a more permissive approach.

OK for master and open branches, thanks 
Iain

> 
> gcc/ChangeLog:
> 
>   * config/darwin-c.cc: Make -mmacosx-version-min more future-proof.
> 
> Signed-off-by: Mark Mentovai 
> ---
> gcc/config/darwin-c.cc | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/darwin-c.cc b/gcc/config/darwin-c.cc
> index 9203c84d2c26..00fc1253e265 100644
> --- a/gcc/config/darwin-c.cc
> +++ b/gcc/config/darwin-c.cc
> @@ -691,7 +691,8 @@ macosx_version_as_macro (void)
>   if (!version_array)
> goto fail;
> 
> -  if (version_array[MAJOR] < 10 || version_array[MAJOR] > 12)
> +  /* clang accepts up to 99. */
> +  if (version_array[MAJOR] < 10 || version_array[MINOR] > 99)
> goto fail;
> 
>   if (version_array[MAJOR] == 10 && version_array[MINOR] < 10)
> -- 
> 2.36.1
> 



[PATCH] Darwin: Future-proof -mmacosx-version-min

2022-06-10 Thread Mark Mentovai
f18cbc1ee1f4 (2021-12-18) updated various parts of gcc to not impose a
Darwin or macOS version maximum of the current known release. Different
parts of gcc accept, variously, Darwin version numbers matching
darwin2*, and macOS major version numbers up to 99. The current released
version is Darwin 21 and macOS 12, with Darwin 22 and macOS 13 expected
for public release later this year. With one major OS release per year,
this strategy is expected to provide another 8 years of headroom.

However, f18cbc1ee1f4 missed config/darwin-c.c (now .cc), which
continued to impose a maximum of macOS 12 on the -mmacosx-version-min
compiler driver argument. This was last updated from 11 to 12 in
11b967577483 (2021-10-27), but kicking the can down the road one year at
a time is not a viable strategy, and is not in line with the more recent
technique from f18cbc1ee1f4.

Prior to 556ab5125912 (2020-11-06), config/darwin-c.c did not impose a
maximum that needed annual maintenance, as at that point, all macOS
releases had used a major version of 10. The stricter approach imposed
since then was valuable for a time until the particulars of the new
versioning scheme were established and understood, but now that they
are, it's prudent to restore a more permissive approach.

gcc/ChangeLog:

* config/darwin-c.cc: Make -mmacosx-version-min more future-proof.

Signed-off-by: Mark Mentovai 
---
 gcc/config/darwin-c.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/darwin-c.cc b/gcc/config/darwin-c.cc
index 9203c84d2c26..00fc1253e265 100644
--- a/gcc/config/darwin-c.cc
+++ b/gcc/config/darwin-c.cc
@@ -691,7 +691,8 @@ macosx_version_as_macro (void)
   if (!version_array)
 goto fail;
 
-  if (version_array[MAJOR] < 10 || version_array[MAJOR] > 12)
+  /* clang accepts up to 99. */
+  if (version_array[MAJOR] < 10 || version_array[MINOR] > 99)
 goto fail;
 
   if (version_array[MAJOR] == 10 && version_array[MINOR] < 10)
-- 
2.36.1



Re: [PATCH] Add optional __Bfloat16 support

2022-06-10 Thread H.J. Lu via Gcc-patches
On Fri, Jun 10, 2022 at 2:38 AM Florian Weimer  wrote:
>
> * liuhongt via Libc-alpha:
>
> > +\subsubsection{Special Types}
> > +
> > +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa.
> > +It is used for \code{BF16} related intrinsics, it cannot be

Please mention that this is an alternate encoding format for 16-bit floating
point.  It has the same size and alignment as _Float16.

> > +used with standard C operators.
>
> I think it's not necessary to specify whether the type supports certain
> C operators (surely assignment will work?).  If they are added later,
> the ABI won't need changing.
>

If _Bfloat16 becomes a fundamental type, the ABI should be changed to
move it together with other scalar types.

-- 
H.J.


[committed] libstdc++: Make std::lcm and std::gcd detect overflow [PR105844]

2022-06-10 Thread Jonathan Wakely via Gcc-patches
Tested powerpc64le-linux, pushed to trunk.

-- >8 --

When I fixed PR libstdc++/92978 I introduced a regression whereby
std::lcm(INT_MIN, 1) and std::lcm(5, 4) would no longer produce
errors during constant evaluation. Those calls are undefined, because
they violate the preconditions that |m| and the result can be
represented in the return type (which is int in both those cases). The
regression occurred because __absu(INT_MIN) is well-formed,
due to the explicit casts to unsigned in that new helper function, and
the out-of-range multiplication is well-formed, because unsigned
arithmetic wraps instead of overflowing.

To fix 92978 I made std::gcd and std::lcm calculate |m| and |n|
immediately, yielding a common unsigned type that was used to calculate
the result. That was partly correct, but there's no need to use an
unsigned type. Doing so only suppresses the overflow errors so the
compiler can't detect them. This change replaces __absu with __abs_r
that returns the common type (not its corresponding unsigned type). This
way we can detect overflow in __abs_r when required, while still
supporting the most-negative value when it can be represented in the
result type. To detect LCM results that are out of range of the result
type we still need explicit checks, because neither constant evaluation
nor UBsan will complain about unsigned wrapping for cases such as
std::lcm(50u, 49u). We can detect those overflows efficiently by
using __builtin_mul_overflow and asserting.

libstdc++-v3/ChangeLog:

PR libstdc++/105844
* include/experimental/numeric (experimental::gcd): Simplify
assertions. Use __abs_r instead of __absu.
(experimental::lcm): Likewise. Remove use of __detail::__lcm so
overflow can be detected.
* include/std/numeric (__detail::__absu): Rename to __abs_r and
change to allow signed result type, so overflow can be detected.
(__detail::__lcm): Remove.
(gcd): Simplify assertions. Use __abs_r instead of __absu.
(lcm): Likewise. Remove use of __detail::__lcm so overflow can
be detected.
* testsuite/26_numerics/gcd/gcd_neg.cc: Adjust dg-error lines.
* testsuite/26_numerics/lcm/lcm_neg.cc: Likewise.
* testsuite/26_numerics/gcd/105844.cc: New test.
* testsuite/26_numerics/lcm/105844.cc: New test.
---
 libstdc++-v3/include/experimental/numeric | 46 +++-
 libstdc++-v3/include/std/numeric  | 75 +++
 .../testsuite/26_numerics/gcd/105844.cc   | 21 ++
 .../testsuite/26_numerics/gcd/gcd_neg.cc  | 10 ++-
 .../testsuite/26_numerics/lcm/105844.cc   | 22 ++
 .../testsuite/26_numerics/lcm/lcm_neg.cc  | 10 ++-
 6 files changed, 123 insertions(+), 61 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/26_numerics/gcd/105844.cc
 create mode 100644 libstdc++-v3/testsuite/26_numerics/lcm/105844.cc

diff --git a/libstdc++-v3/include/experimental/numeric 
b/libstdc++-v3/include/experimental/numeric
index 4c6a662fdd6..426d9430dd6 100644
--- a/libstdc++-v3/include/experimental/numeric
+++ b/libstdc++-v3/include/experimental/numeric
@@ -56,17 +56,15 @@ inline namespace fundamentals_v2
 constexpr common_type_t<_Mn, _Nn>
 gcd(_Mn __m, _Nn __n) noexcept
 {
-  static_assert(is_integral_v<_Mn>,
- "std::experimental::gcd arguments must be integers");
-  static_assert(is_integral_v<_Nn>,
- "std::experimental::gcd arguments must be integers");
-  static_assert(_Mn(2) != _Mn(1),
- "std::experimental::gcd arguments must not be bool");
-  static_assert(_Nn(2) != _Nn(1),
- "std::experimental::gcd arguments must not be bool");
-  using _Up = make_unsigned_t>;
-  return std::__detail::__gcd(std::__detail::__absu<_Up>(__m),
- std::__detail::__absu<_Up>(__n));
+  static_assert(is_integral_v<_Mn> && is_integral_v<_Nn>,
+   "std::experimental::gcd arguments must be integers");
+  static_assert(_Mn(2) == 2 && _Nn(2) == 2,
+   "std::experimental::gcd arguments must not be bool");
+  namespace __detail = std::__detail;
+  using _Ct = common_type_t<_Mn, _Nn>;
+  const _Ct __m2 = __detail::__abs_r<_Ct>(__m);
+  const _Ct __n2 = __detail::__abs_r<_Ct>(__n);
+  return __detail::__gcd>(__m2, __n2);
 }
 
   /// Least common multiple
@@ -74,17 +72,25 @@ inline namespace fundamentals_v2
 constexpr common_type_t<_Mn, _Nn>
 lcm(_Mn __m, _Nn __n)
 {
-  static_assert(is_integral_v<_Mn>,
+  static_assert(is_integral_v<_Mn> && is_integral_v<_Nn>,
  "std::experimental::lcm arguments must be integers");
-  static_assert(is_integral_v<_Nn>,
- "std::experimental::lcm arguments must be integers");
-  static_assert(_Mn(2) != _Mn(1),
+  static_assert(_Mn(2) == 2 && _Nn(2) == 2,
  "std::experimental::lcm arguments must not be bool");
-  

[committed] libstdc++: Fix lifetime bugs for non-TLS eh_globals [PR105880]

2022-06-10 Thread Jonathan Wakely via Gcc-patches
Tested powerpc64le-linux, pushed to trunk.

-- >8 --

This ensures that the single-threaded fallback buffer eh_globals is not
destroyed during program termination, using the same immortalization
technique used for error category objects.

Also ensure that init._M_init can still be read after init has been
destroyed, by making it a static data member.

libstdc++-v3/ChangeLog:

PR libstdc++/105880
* libsupc++/eh_globals.cc (eh_globals): Ensure constant init and
prevent destruction during termination.
(__eh_globals_init::_M_init): Replace with static member _S_init.
(__cxxabiv1::__cxa_get_globals_fast): Update.
(__cxxabiv1::__cxa_get_globals): Likewise.
---
 libstdc++-v3/libsupc++/eh_globals.cc | 51 
 1 file changed, 37 insertions(+), 14 deletions(-)

diff --git a/libstdc++-v3/libsupc++/eh_globals.cc 
b/libstdc++-v3/libsupc++/eh_globals.cc
index 3a003b89edf..768425c0f40 100644
--- a/libstdc++-v3/libsupc++/eh_globals.cc
+++ b/libstdc++-v3/libsupc++/eh_globals.cc
@@ -64,8 +64,26 @@ __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW
 
 #else
 
-// Single-threaded fallback buffer.
-static __cxa_eh_globals eh_globals;
+#if __has_cpp_attribute(clang::require_constant_initialization)
+#  define __constinit [[clang::require_constant_initialization]]
+#endif
+
+namespace
+{
+  struct constant_init
+  {
+union {
+  unsigned char unused;
+  __cxa_eh_globals obj;
+};
+constexpr constant_init() : obj() { }
+
+~constant_init() { /* do nothing, union member is not destroyed */ }
+  };
+
+  // Single-threaded fallback buffer.
+  __constinit constant_init eh_globals;
+}
 
 #if __GTHREADS
 
@@ -90,32 +108,37 @@ eh_globals_dtor(void* ptr)
 struct __eh_globals_init
 {
   __gthread_key_t  _M_key;
-  bool _M_init;
+  static bool  _S_init;
 
-  __eh_globals_init() : _M_init(false)
-  { 
+  __eh_globals_init()
+  {
 if (__gthread_active_p())
-  _M_init = __gthread_key_create(&_M_key, eh_globals_dtor) == 0; 
+  _S_init = __gthread_key_create(&_M_key, eh_globals_dtor) == 0;
   }
 
   ~__eh_globals_init()
   {
-if (_M_init)
+if (_S_init)
   __gthread_key_delete(_M_key);
-_M_init = false;
+_S_init = false;
   }
+
+  __eh_globals_init(const __eh_globals_init&) = delete;
+  __eh_globals_init& operator=(const __eh_globals_init&) = delete;
 };
 
+bool __eh_globals_init::_S_init = false;
+
 static __eh_globals_init init;
 
 extern "C" __cxa_eh_globals*
 __cxxabiv1::__cxa_get_globals_fast() _GLIBCXX_NOTHROW
 {
   __cxa_eh_globals* g;
-  if (init._M_init)
+  if (init._S_init)
 g = static_cast<__cxa_eh_globals*>(__gthread_getspecific(init._M_key));
   else
-g = _globals;
+g = _globals.obj;
   return g;
 }
 
@@ -123,7 +146,7 @@ extern "C" __cxa_eh_globals*
 __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW
 {
   __cxa_eh_globals* g;
-  if (init._M_init)
+  if (init._S_init)
 {
   g = static_cast<__cxa_eh_globals*>(__gthread_getspecific(init._M_key));
   if (!g)
@@ -140,7 +163,7 @@ __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW
}
 }
   else
-g = _globals;
+g = _globals.obj;
   return g;
 }
 
@@ -148,11 +171,11 @@ __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW
 
 extern "C" __cxa_eh_globals*
 __cxxabiv1::__cxa_get_globals_fast() _GLIBCXX_NOTHROW
-{ return _globals; }
+{ return _globals.obj; }
 
 extern "C" __cxa_eh_globals*
 __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW
-{ return _globals; }
+{ return _globals.obj; }
 
 #endif
 
-- 
2.34.3



[PATCH] regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041]

2022-06-10 Thread Surya Kumari Jangala via Gcc-patches
regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041]

In check_new_reg_p, the nregs of a du chain is computed by obtaining the MODE
of the first element in the chain, and then calling hard_regno_nregs() with the
MODE. But the first element of the chain can be a DEBUG_INSN whose mode need
not be the same as the rest of the elements in the du chain. This
was resulting in a -fcompare-debug failure as check_new_reg_p was returning a
different result with -g for the same candidate register. We can instead obtain
nregs from the du chain itself.

2022-06-10  Surya Kumari Jangala  

gcc/
PR rtl-optimization/105041
* regrename.cc (check_new_reg_p): Use nregs value from du chain.

gcc/testsuite/
PR rtl-optimization/105041
* gcc.target/powerpc/pr105041.c: New test.


diff --git a/gcc/regrename.cc b/gcc/regrename.cc
index 10271e1..f651351 100644
--- a/gcc/regrename.cc
+++ b/gcc/regrename.cc
@@ -324,8 +324,7 @@ static bool
 check_new_reg_p (int reg ATTRIBUTE_UNUSED, int new_reg,
 class du_head *this_head, HARD_REG_SET this_unavailable)
 {
-  machine_mode mode = GET_MODE (*this_head->first->loc);
-  int nregs = hard_regno_nregs (new_reg, mode);
+  int nregs = this_head->nregs;
   int i;
   struct du_chain *tmp;
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr105041.c 
b/gcc/testsuite/gcc.target/powerpc/pr105041.c
new file mode 100644
index 000..89eed1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr105041.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target be } */
+/* { dg-options "-m32 -mdejagnu-cpu=power4 -O2 -fcompare-debug 
-fharden-compares -frename-registers" } */
+
+double m;
+int n;
+
+unsigned int
+foo (unsigned int x, int y)
+{
+  long long int a = y, b = !a;
+  int c = 0;
+
+  if (b != x)
+while ((int) m == a)
+  {
+c = a;
+a = 0;
+  }
+
+  n = b = y;
+
+  return x + c;
+}


[committed] libstdc++: Make std::hash> allocator-agnostic (LWG 3705)

2022-06-10 Thread Jonathan Wakely via Gcc-patches
Tested powerpc64le-linux, pushed to trunk.

-- >8 --

This new library issue was recently moved to Tentatively Ready by an LWG
poll, so I'm making the change on trunk.

As noted in PR libstdc++/105907 the std::hash specializations for PMR
strings were not treated as slow hashes by the unordered containers, so
this change preserves that. The new specializations for custom
allocators are also not treated as slow, for the same reason. For the
versioned namespace (i.e. unstable ABI) we don't have to worry about
that, so can enable hash code caching for all basic_string
specializations.

libstdc++-v3/ChangeLog:

* include/bits/basic_string.h (__hash_str_base): New class
template.
(hash, A>>): Define partial
specialization for each of the standard character types.
(hash, hash, hash, hash)
(hash): Remove explicit specializations.
* include/std/string (__hash_string_base): Remove class
template.
(hash, hash, hash)
(hash, hash): Remove explicit
specializations.
* testsuite/21_strings/basic_string/hash/hash.cc: Test with
custom allocators.
* testsuite/21_strings/basic_string/hash/hash_char8_t.cc:
Likewise.
---
 libstdc++-v3/include/bits/basic_string.h  | 108 --
 libstdc++-v3/include/std/string   |  33 --
 .../21_strings/basic_string/hash/hash.cc  |  16 +++
 .../basic_string/hash/hash_char8_t.cc |  12 ++
 4 files changed, 77 insertions(+), 92 deletions(-)

diff --git a/libstdc++-v3/include/bits/basic_string.h 
b/libstdc++-v3/include/bits/basic_string.h
index 6041d05815b..f76ddf970c6 100644
--- a/libstdc++-v3/include/bits/basic_string.h
+++ b/libstdc++-v3/include/bits/basic_string.h
@@ -4226,86 +4226,76 @@ namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  // DR 1182.
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3705. Hashability shouldn't depend on basic_string's allocator
+
+  template, _Alloc>>
+struct __str_hash_base
+: public __hash_base
+{
+  size_t
+  operator()(const _StrT& __s) const noexcept
+  { return _Hash_impl::hash(__s.data(), __s.length() * sizeof(_CharT)); }
+};
 
 #ifndef _GLIBCXX_COMPATIBILITY_CXX0X
   /// std::hash specialization for string.
-  template<>
-struct hash
-: public __hash_base
-{
-  size_t
-  operator()(const string& __s) const noexcept
-  { return std::_Hash_impl::hash(__s.data(), __s.length()); }
-};
-
-  template<>
-struct __is_fast_hash> : std::false_type
+  template
+struct hash, _Alloc>>
+: public __str_hash_base
 { };
 
   /// std::hash specialization for wstring.
-  template<>
-struct hash
-: public __hash_base
-{
-  size_t
-  operator()(const wstring& __s) const noexcept
-  { return std::_Hash_impl::hash(__s.data(),
- __s.length() * sizeof(wchar_t)); }
-};
+  template
+struct hash, _Alloc>>
+: public __str_hash_base
+{ };
 
-  template<>
-struct __is_fast_hash> : std::false_type
+  template
+struct __is_fast_hash,
+   _Alloc>>>
+: std::false_type
 { };
 #endif /* _GLIBCXX_COMPATIBILITY_CXX0X */
 
 #ifdef _GLIBCXX_USE_CHAR8_T
   /// std::hash specialization for u8string.
-  template<>
-struct hash
-: public __hash_base
-{
-  size_t
-  operator()(const u8string& __s) const noexcept
-  { return std::_Hash_impl::hash(__s.data(),
- __s.length() * sizeof(char8_t)); }
-};
-
-  template<>
-struct __is_fast_hash> : std::false_type
+  template
+struct hash, _Alloc>>
+: public __str_hash_base
 { };
 #endif
 
   /// std::hash specialization for u16string.
-  template<>
-struct hash
-: public __hash_base
-{
-  size_t
-  operator()(const u16string& __s) const noexcept
-  { return std::_Hash_impl::hash(__s.data(),
- __s.length() * sizeof(char16_t)); }
-};
-
-  template<>
-struct __is_fast_hash> : std::false_type
+  template
+struct hash, _Alloc>>
+: public __str_hash_base
 { };
 
   /// std::hash specialization for u32string.
-  template<>
-struct hash
-: public __hash_base
-{
-  size_t
-  operator()(const u32string& __s) const noexcept
-  { return std::_Hash_impl::hash(__s.data(),
- __s.length() * sizeof(char32_t)); }
-};
-
-  template<>
-struct __is_fast_hash> : std::false_type
+  template
+struct hash, _Alloc>>
+: public __str_hash_base
 { };
 
+#if ! _GLIBCXX_INLINE_VERSION
+  // PR libstdc++/105907 - __is_fast_hash affects unordered container ABI.
+  template<> struct __is_fast_hash> : std::false_type { };
+  template<> struct __is_fast_hash> : std::false_type { };
+  template<> struct __is_fast_hash> : std::false_type { };
+  template<> struct 

[PATCH] c++: improve TYPENAME_TYPE hashing [PR65328]

2022-06-10 Thread Patrick Palka via Gcc-patches
The reason compiling the testcase in this PR is so slow is ultimately
due to our poor hashing of TYPENAME_TYPE causing a huge amount of hash
table collisions in the spec_hasher and typename_hasher tables.

In spec_hasher, we don't hash the components of a TYPENAME_TYPE at all,
presumably because TYPENAME_TYPE equivalence as determined by
structural_comptypes depends on whether the comparing_specializations
flag is set.  This patch fixes this by setting comparing_specializations
from spec_hasher::hash, and making iterative_hash_template_arg hash the
relevant components of a TYPENAME_TYPE consistently when this flag is set.

And in typename_hasher, the hash function doesn't consider the
TYPENAME_TYPE_FULLNAME, which this patch fixes accordingly.

After this patch, compile time for the testcase in the PR is around
34 seconds (10% faster than Clang).

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK
for trunk?

PR c++/65328

gcc/cp/ChangeLog:

* decl.cc (typename_hasher::hash): Add extra overloads.
Use iterative_hash_object instead of htab_hash_pointer.
Hash the TYPENAME_TYPE_FULLNAME too.
(build_typename_type): Use typename_hasher::hash.
* pt.cc (spec_hasher::hash): Add two-parameter overload.
Set comparing_specializations around the call to
hash_tmpl_and_args.
(iterative_hash_template_arg) :
When comparing_specializations, hash the TYPE_CONTEXT
and TYPENAME_TYPE_FULLNAME.
(tsubst_function_decl): Use spec_hasher::hash instead of
hash_tmpl_and_args.
(tsubst_template_decl): Likewise.
(tsubst_decl): Likewise.
---
 gcc/cp/decl.cc | 26 +++---
 gcc/cp/pt.cc   | 28 
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 7f3b3c3c588..b7f624ca50b 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -4007,14 +4007,27 @@ struct typename_hasher : ggc_ptr_hash
   /* Hash a TYPENAME_TYPE.  */
 
   static hashval_t
-  hash (tree t)
+  hash (tree context, tree name, tree fullname)
   {
-hashval_t hash;
+hashval_t hash = 0;
+hash = iterative_hash_object (context, hash);
+hash = iterative_hash_object (name, hash);
+hash = iterative_hash_object (fullname, hash);
+return hash;
+  }
 
-hash = (htab_hash_pointer (TYPE_CONTEXT (t))
-   ^ htab_hash_pointer (TYPE_IDENTIFIER (t)));
+  static hashval_t
+  hash (const typename_info *ti)
+  {
+return typename_hasher::hash (ti->scope, ti->name, ti->template_id);
+  }
 
-return hash;
+  static hashval_t
+  hash (tree t)
+  {
+return typename_hasher::hash (TYPE_CONTEXT (t),
+ TYPE_IDENTIFIER (t),
+ TYPENAME_TYPE_FULLNAME (t));
   }
 
   /* Compare two TYPENAME_TYPEs.  */
@@ -4053,8 +4066,7 @@ build_typename_type (tree context, tree name, tree 
fullname,
   ti.class_p = (tag_type == class_type
|| tag_type == record_type
|| tag_type == union_type);
-  hashval_t hash =  (htab_hash_pointer (ti.scope)
-^ htab_hash_pointer (ti.name));
+  hashval_t hash = typename_hasher::hash ();
 
   /* See if we already have this type.  */
   tree *e = typename_htab->find_slot_with_hash (, hash, INSERT);
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 55129cf6f2c..381fc337cb0 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -107,6 +107,7 @@ static bool excessive_deduction_depth;
 struct spec_hasher : ggc_ptr_hash
 {
   static hashval_t hash (spec_entry *);
+  static hashval_t hash (tree, tree);
   static bool equal (spec_entry *, spec_entry *);
 };
 
@@ -1768,13 +1769,22 @@ hash_tmpl_and_args (tree tmpl, tree args)
   return iterative_hash_template_arg (args, val);
 }
 
+hashval_t
+spec_hasher::hash (tree tmpl, tree args)
+{
+  ++comparing_specializations;
+  hashval_t val = hash_tmpl_and_args (tmpl, args);
+  --comparing_specializations;
+  return val;
+}
+
 /* Returns a hash for a spec_entry node based on the TMPL and ARGS members,
ignoring SPEC.  */
 
 hashval_t
 spec_hasher::hash (spec_entry *e)
 {
-  return hash_tmpl_and_args (e->tmpl, e->args);
+  return spec_hasher::hash (e->tmpl, e->args);
 }
 
 /* Recursively calculate a hash value for a template argument ARG, for use
@@ -1960,6 +1970,16 @@ iterative_hash_template_arg (tree arg, hashval_t val)
  val = iterative_hash_template_arg (DECLTYPE_TYPE_EXPR (arg), val);
  break;
 
+   case TYPENAME_TYPE:
+ if (comparing_specializations)
+   {
+ tree context = TYPE_MAIN_VARIANT (TYPE_CONTEXT (arg));
+ tree fullname = TYPENAME_TYPE_FULLNAME (arg);
+ val = iterative_hash_template_arg (context, val);
+ val = iterative_hash_template_arg (fullname, val);
+   }
+ break;
+
default:
  if (tree canonical = TYPE_CANONICAL (arg))
val = 

[PATCH][AArch64] Implement ACLE Data Intrinsics

2022-06-10 Thread Andre Vieira (lists) via Gcc-patches

Hi,

This patch adds support for the ACLE Data Intrinsics to the AArch64 port.

Bootstrapped and regression tested on aarch64-none-linux.

OK for trunk?

gcc/ChangeLog:

2022-06-10  Andre Vieira  

    * config/aarch64/aarch64.md (rbit2): Rename this ...
    (@aarch64_rbit): ... this and change it in...
    (ffs2,ctz2): ... here.
    (@aarch64_rev16): New.
    * config/aarch64/aarch64-builtins.cc: (aarch64_builtins):
    Define the following enum AARCH64_REV16, AARCH64_REV16L, 
AARCH64_REV16LL,

    AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL.
    (aarch64_init_data_intrinsics): New.
    (handle_arm_acle_h): Add call to aarch64_init_data_intrinsics.
    (aarch64_expand_builtin_data_intrinsic): New.
    (aarch64_general_expand_builtin): Add call to 
aarch64_expand_builtin_data_intrinsic.
    * config/aarch64/arm_acle.h (__clz, __clzl, __clzll, __cls, 
__clsl, __clsll, __rbit,
    __rbitl, __rbitll, __rev, __revl, __revll, __rev16, __rev16l, 
__rev16ll, __ror, __rorl,

    __rorll, __revsh): New.

gcc/testsuite/ChangeLog:

2022-06-10  Andre Vieira  

    * gcc.target/aarch64/acle/data-intrinsics.c: New test.
diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
b/gcc/config/aarch64/aarch64-builtins.cc
index 
e0a741ac663188713e21f457affa57217d074783..91a687dee13a27c21f0c50de9ba777aa900d6096
 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -613,6 +613,12 @@ enum aarch64_builtins
   AARCH64_LS64_BUILTIN_ST64B,
   AARCH64_LS64_BUILTIN_ST64BV,
   AARCH64_LS64_BUILTIN_ST64BV0,
+  AARCH64_REV16,
+  AARCH64_REV16L,
+  AARCH64_REV16LL,
+  AARCH64_RBIT,
+  AARCH64_RBITL,
+  AARCH64_RBITLL,
   AARCH64_BUILTIN_MAX
 };
 
@@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void)
   = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code);
 }
 
+static void
+aarch64_init_data_intrinsics (void)
+{
+  tree uint32_fntype = build_function_type_list (uint32_type_node,
+uint32_type_node, NULL_TREE);
+  tree long_fntype = build_function_type_list (long_unsigned_type_node,
+  long_unsigned_type_node,
+  NULL_TREE);
+  tree uint64_fntype = build_function_type_list (uint64_type_node,
+uint64_type_node, NULL_TREE);
+  aarch64_builtin_decls[AARCH64_REV16]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype,
+  AARCH64_REV16);
+  aarch64_builtin_decls[AARCH64_REV16L]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16l", long_fntype,
+  AARCH64_REV16L);
+  aarch64_builtin_decls[AARCH64_REV16LL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype,
+  AARCH64_REV16LL);
+  aarch64_builtin_decls[AARCH64_RBIT]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype,
+  AARCH64_RBIT);
+  aarch64_builtin_decls[AARCH64_RBITL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbitl", long_fntype,
+  AARCH64_RBITL);
+  aarch64_builtin_decls[AARCH64_RBITLL]
+= aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype,
+  AARCH64_RBITLL);
+}
+
 /* Implement #pragma GCC aarch64 "arm_acle.h".  */
 void
 handle_arm_acle_h (void)
 {
+  aarch64_init_data_intrinsics ();
   if (TARGET_LS64)
 aarch64_init_ls64_builtins ();
 }
@@ -2393,6 +2430,32 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx 
target)
   emit_insn (pat);
   return target;
 }
+/* Function to expand an expression EXP which calls one of the ACLE Data
+   Intrinsic builtins FCODE with the result going to TARGET.  */
+static rtx
+aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx 
target)
+{
+  rtx op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
+  machine_mode mode = GET_MODE (op0);
+  rtx pat;
+  switch (fcode)
+{
+case AARCH64_REV16:
+case AARCH64_REV16L:
+case AARCH64_REV16LL:
+  pat = gen_aarch64_rev16 (mode, target, op0);
+  break;
+case AARCH64_RBIT:
+case AARCH64_RBITL:
+case AARCH64_RBITLL:
+  pat = gen_aarch64_rbit (mode, target, op0);
+  break;
+default:
+  gcc_unreachable ();
+}
+  emit_insn (pat);
+  return target;
+}
 
 /* Expand an expression EXP as fpsr or fpcr setter (depending on
UNSPEC) using MODE.  */
@@ -2551,6 +2614,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree 
exp, rtx target,
   if (fcode >= AARCH64_MEMTAG_BUILTIN_START
   && fcode <= AARCH64_MEMTAG_BUILTIN_END)
 return aarch64_expand_builtin_memtag (fcode, exp, target);
+  if (fcode >= AARCH64_REV16
+  && fcode <= AARCH64_RBITLL)
+return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
 

[committed] libstdc++: Partially revert r11-9772-g6f8133689f4397 [PR105915]

2022-06-10 Thread Jonathan Wakely via Gcc-patches
I have done a partial revert on the gcc-11 branch to fix PR105915.

I'll also backport it to gcc-10 after testing finishes.

-- >8 --

The r11-9772-g6f8133689f4397 backport made two changes, but only one was
needed on the gcc-11 branch. The other should not have been backported,
and causes errors with clang. This removes the unwanted part.

libstdc++-v3/ChangeLog:

PR libstdc++/105915
* include/experimental/bits/fs_path.h (path::begin, path::end):
Remove noexcept from declarations.
---
 libstdc++-v3/include/experimental/bits/fs_path.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/experimental/bits/fs_path.h 
b/libstdc++-v3/include/experimental/bits/fs_path.h
index 1cc1b3bf686..a2bc931c696 100644
--- a/libstdc++-v3/include/experimental/bits/fs_path.h
+++ b/libstdc++-v3/include/experimental/bits/fs_path.h
@@ -425,8 +425,8 @@ namespace __detail
 class iterator;
 typedef iterator const_iterator;
 
-iterator begin() const noexcept;
-iterator end() const noexcept;
+iterator begin() const;
+iterator end() const;
 
 /// @cond undocumented
 // Create a basic_string by reading until a null character.
-- 
2.34.3



Re: [PATCH] c++: optimize specialization of nested class templates

2022-06-10 Thread Patrick Palka via Gcc-patches
On Thu, 9 Jun 2022, Patrick Palka wrote:

> On Thu, 9 Jun 2022, Jason Merrill wrote:
> 
> > On 6/8/22 14:21, Patrick Palka wrote:
> > > When substituting a class template specialization, tsubst_aggr_type
> > > substitutes the TYPE_CONTEXT before passing it to lookup_template_class.
> > > This appears to be unnecessary, however, because the initial value
> > > of lookup_template_class's context parameter is unused outside of the
> > > IDENTIFIER_NODE case, and l_t_c performs its own substitution of the
> > > context, anyway.  So this patch removes the redundant substitution in
> > > tsubst_aggr_type.  Doing so causes us to ICE on template/nested5.C
> > > because during lookup_template_class for A::C::D with T=E and S=S,
> > > we substitute and complete the context A::C with T=E, which in turn
> > > registers the desired dependent specialization of D for us and we end up
> > > trying to register it again.  This patch fixes this by checking the
> > > specializations table again after completion of the context.
> > > 
> > > This patch also implements a couple of other optimizations:
> > > 
> > >* In lookup_template_class, if the context of the partially
> > >  instantiated template is already non-dependent, then we could
> > >  reuse that instead of substituting the context of the most
> > >  general template.
> > >* When substituting the TYPE_DECL for an injected-class-name
> > >  in tsubst_decl, we can avoid substituting its TREE_TYPE and
> > >  DECL_TI_ARGS.
> > > 
> > > Together these optimizations improve memory usage for the range-v3
> > > testcase test/view/split.cc by about 5%.  The improvement is probably
> > > more significant when dealing with deeply nested class templates.
> > > 
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
> > > trunk?
> > > 
> > > gcc/cp/ChangeLog:
> > > 
> > >   * pt.cc (lookup_template_class): Remove dead stores to
> > >   context parameter.  Don't substitute the context of the
> > >   most general template if that of the partially instantiated
> > >   template is non-dependent.  Check the specializations table
> > >   again after completing the context of a nested dependent
> > >   specialization.
> > >   (tsubst_aggr_type) : Don't substitute
> > >   TYPE_CONTEXT or pass it to lookup_template_class.
> > >   (tsubst_decl) : Avoid substituting the
> > >   TREE_TYPE and DECL_TI_ARGS when DECL_SELF_REFERENCE_P.
> > > ---
> > >   gcc/cp/pt.cc | 69 +++-
> > >   1 file changed, 41 insertions(+), 28 deletions(-)
> > > 
> > > diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
> > > index 59b94317e88..28023d60684 100644
> > > --- a/gcc/cp/pt.cc
> > > +++ b/gcc/cp/pt.cc
> > > @@ -9840,8 +9840,6 @@ lookup_template_class (tree d1, tree arglist, tree
> > > in_decl, tree context,
> > > if (context)
> > >   pop_decl_namespace ();
> > >   }
> > > -  if (templ)
> > > - context = DECL_CONTEXT (templ);
> > >   }
> > > else if (TREE_CODE (d1) == TYPE_DECL && MAYBE_CLASS_TYPE_P (TREE_TYPE
> > > (d1)))
> > >   {
> > > @@ -9868,7 +9866,6 @@ lookup_template_class (tree d1, tree arglist, tree
> > > in_decl, tree context,
> > >   {
> > > templ = d1;
> > > d1 = DECL_NAME (templ);
> > > -  context = DECL_CONTEXT (templ);
> > >   }
> > > else if (DECL_TEMPLATE_TEMPLATE_PARM_P (d1))
> > >   {
> > > @@ -10059,8 +10056,25 @@ lookup_template_class (tree d1, tree arglist, 
> > > tree
> > > in_decl, tree context,
> > > context = DECL_CONTEXT (gen_tmpl);
> > > if (context && TYPE_P (context))
> > >   {
> > > -   context = tsubst_aggr_type (context, arglist, complain, in_decl,
> > > true);
> > > -   context = complete_type (context);
> > > +   if (!uses_template_parms (DECL_CONTEXT (templ)))
> > > + /* If the context of the partially instantiated template is
> > > +already non-dependent, then we might as well use it.  */
> > > + context = DECL_CONTEXT (templ);
> > > +   else
> > > + {
> > > +   context = tsubst_aggr_type (context, arglist, complain, in_decl,
> > > true);
> > > +   context = complete_type (context);
> > > +   if (is_dependent_type && arg_depth > 1)
> > > + {
> > > +   /* If this is a dependent nested specialization such as
> > > +  A::B, then completion of A might have
> > > +  registered this specialization of B for us, so check
> > > +  the table again (33959).  */
> > > +   entry = type_specializations->find_with_hash (, hash);
> > > +   if (entry)
> > > + return entry->spec;
> > > + }
> > > + }
> > >   }
> > > else
> > >   context = tsubst (context, arglist, complain, in_decl);
> > > @@ -13711,25 +13725,12 @@ tsubst_aggr_type (tree t,
> > > if (TYPE_TEMPLATE_INFO (t) && uses_template_parms (t))
> > >   {
> > > 

Fix ipa-prop wrt volatile memory accesses

2022-06-10 Thread Jan Hubicka via Gcc-patches
Hi,
this patch prevents ipa-prop from propagating aggregates when load is
volatile.  Martin, does this look OK?  It seem to me that ipa-prop may
need some additional volatile flag checks.

Bootstrapped/regtested x86_64-linux, OK?

Honza

gcc/ChangeLog:

2022-06-10  Jan Hubicka  

PR ipa/105739
* ipa-prop.cc (ipa_load_from_parm_agg): Disqualify volatile memory
accesses.

gcc/testsuite/ChangeLog:

2022-06-10  Jan Hubicka  

* gcc.dg/ipa/pr105739.c: New test.

diff --git a/gcc/ipa-prop.cc b/gcc/ipa-prop.cc
index afd9222b5a2..c037668e7d8 100644
--- a/gcc/ipa-prop.cc
+++ b/gcc/ipa-prop.cc
@@ -1112,6 +1112,10 @@ ipa_load_from_parm_agg (struct ipa_func_body_info *fbi,
   if (!base)
 return false;
 
+  /* We can not propagate across volatile loads.  */
+  if (TREE_THIS_VOLATILE (op))
+return false;
+
   if (DECL_P (base))
 {
   int index = ipa_get_param_decl_index_1 (descriptors, base);
diff --git a/gcc/testsuite/gcc.dg/ipa/pr105739.c 
b/gcc/testsuite/gcc.dg/ipa/pr105739.c
new file mode 100644
index 000..8dbe8fc2494
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/pr105739.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+
+__attribute__((noinline))
+static int
+test2(int a)
+{
+if (__builtin_constant_p (a))
+__builtin_abort ();
+return a;
+}
+static int
+test(int *a)
+{
+int val = *(volatile int *)a;
+if (__builtin_constant_p (val))
+__builtin_abort ();
+if (val)
+  return test2(val);
+return 0;
+}
+int a;
+int
+main()
+{
+a = 0;
+return test ();
+}
+/* { dg-final { scan-tree-dump "test2" "optimized" } } */


Re: [PATCH 2/2] Add a general mapping from internal fns to target insns

2022-06-10 Thread David Malcolm via Gcc-patches
On Fri, 2022-06-10 at 10:14 +0100, Richard Sandiford via Gcc-patches
wrote:
Several existing internal functions map directly to an instruction
defined in target-insns.def.  This patch makes it easier to define
more such functions in future.

This should help to reduce cut-&-paste, but more importantly, it allows
the difference between optab functions and target-insns.def functions
to be abstracted away; both are now treated as “directly-mapped”.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


gcc/
* internal-fn.def (DEF_INTERNAL_INSN_FN): New macro.
(GOMP_SIMT_ENTER_ALLOC, GOMP_SIMT_EXIT, GOMP_SIMT_LANE)
(GOMP_SIMT_LAST_LANE, GOMP_SIMT_ORDERED_PRED,
GOMP_SIMT_VOTE_ANY)
(GOMP_SIMT_XCHG_BFLY, GOMP_SIMT_XCHG_IDX): Use it.
* internal-fn.h (direct_internal_fn_info::directly_mapped): New
member variable.
(direct_internal_fn_info::vectorizable): Reduce to 1 bit.
(direct_internal_fn_p): Also return true for internal functions
that map directly to instructions defined target-insns.def.
(direct_internal_fn): Adjust comment accordingly.
* internal-fn.c (direct_insn, optab1, optab2,
vectorizable_optab1)

[...snip...]

---
 gcc/internal-fn.cc  | 152 +++-
 gcc/internal-fn.def |  34 +++---
 gcc/internal-fn.h   |  20 +++---
 3 files changed, 87 insertions(+), 119 deletions(-)


[...snip...]

I can't comment on the correctness of the patch, but I happened to spot
that the filename in the changelog entry needs renaming for the .c to
.cc transition, or the git hooks will complain when you try to push
this.


Dave



Re: [PING][PATCH][WIP] have configure probe prefix for gmp/mpfr/mpc [PR44425]

2022-06-10 Thread Xi Ruoyao via Gcc-patches
On Thu, 2022-06-09 at 16:04 -0400, Eric Gallager via Gcc-patches wrote:
> Hi, I'd like to ping this patch:
> https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596126.html
> (cc-ing the build machinery maintainers listed in MAINTAINERS this
> time)
> 
> On Thu, Jun 2, 2022 at 11:53 AM Eric Gallager 
> wrote:
> > 
> > So, I'm working on fixing PR bootstrap/44425, and have this patch to
> > have the top-level configure script check in the value passed to
> > `--prefix=` when looking for gmp/mpfr/mpc. It "works" (in that
> > configuring with just `--prefix=` and none of
> > `--with-gmp=`/`--with-mpfr=`/`--with-mpc=` now works where it failed
> > before), but unfortunately it results in a bunch of duplicated
> > `-I`/`-L` flags stuck in ${gmplibs} and ${gmpinc}... is that
> > acceptable or should I try another approach?
> > Eric

A patch should not edit configure directly.  configure.ac should be
edited and configure should be regenerated from it.

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[PATCH] Do not erase warning data in gimple_set_location

2022-06-10 Thread Eric Botcazou via Gcc-patches
Hi,

gimple_set_location is mostly invoked on newly built GIMPLE statements, so 
their location is UNKNOWN_LOCATION and setting it will clobber the warning 
data of the passed location, if any.

Tested on x86-64/Linux, OK for mainline and 12 branch?


2022-06-10  Eric Botcazou  

* gimple.h (gimple_set_location): Do not copy warning data from
the previous location when it is UNKNOWN_LOCATION.


2022-06-10  Eric Botcazou  

testsuite/
* c-c++-common/nonnull-1.c: Remove XFAIL for C++.

-- 
Eric Botcazoudiff --git a/gcc/gimple.h b/gcc/gimple.h
index 6b1e89ad74e..870629cd562 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -1913,7 +1913,8 @@ static inline void
 gimple_set_location (gimple *g, location_t location)
 {
   /* Copy the no-warning data to the statement location.  */
-  copy_warning (location, g->location);
+  if (g->location != UNKNOWN_LOCATION)
+copy_warning (location, g->location);
   g->location = location;
 }
 
diff --git a/gcc/testsuite/c-c++-common/nonnull-1.c b/gcc/testsuite/c-c++-common/nonnull-1.c
index ea987365302..7be4e3479dd 100644
--- a/gcc/testsuite/c-c++-common/nonnull-1.c
+++ b/gcc/testsuite/c-c++-common/nonnull-1.c
@@ -30,5 +30,5 @@ func (char *cp1, char *cp2, char *cp3, char *cp4)
 __attribute__((nonnull (1))) int
 func2 (char *cp)
 {
-  return (cp != NULL) ? 1 : 0; /* { dg-warning "'nonnull' argument" "cp compared to NULL" { xfail c++ } } */
+  return (cp != NULL) ? 1 : 0; /* { dg-warning "'nonnull' argument" "cp compared to NULL" } */
 }


Re: [PATCH] Add optional __Bfloat16 support

2022-06-10 Thread Florian Weimer via Gcc-patches
* liuhongt via Libc-alpha:

> +\subsubsection{Special Types}
> +
> +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa.
> +It is used for \code{BF16} related intrinsics, it cannot be
> +used with standard C operators.

I think it's not necessary to specify whether the type supports certain
C operators (surely assignment will work?).  If they are added later,
the ABI won't need changing.

Thanks,
Florian



Re: [PATCH] testsuite: Add -mtune=generic to dg-options for two testcases.

2022-06-10 Thread Hongtao Liu via Gcc-patches
On Fri, Jun 10, 2022 at 4:45 PM Cui,Lili via Gcc-patches
 wrote:
>
> This patch is to change dg-options for two testcases.
>
> Use -mtune=generic to limit these two testcases. Because configuring them with
> -mtune=cascadelake or znver3 will vectorize them.
>
> regtested on x86_64-linux-gnu{-m32,}. Ok for trunk?
Ok.
>
> Thanks,
> Lili.
>
> Use -mtune=generic to limit these two test cases. Because configuring them 
> with
> -mtune=cascadelake or znver3 will vectorize them.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Add
> -mtune=generic to dg-options.
> * gcc.target/i386/pr84101.c: Likewise.
> ---
>  .../gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c | 2 +-
>  gcc/testsuite/gcc.target/i386/pr84101.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git 
> a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c 
> b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c
> index 7637cdb4a97..d060135d877 100644
> --- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */
> +/* { dg-additional-options "-msse -mtune=generic -fdump-tree-slp2-details" } 
> */
>
>  struct S { unsigned long a, b; } s;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr84101.c 
> b/gcc/testsuite/gcc.target/i386/pr84101.c
> index cf144894f9b..2c5a97308ca 100644
> --- a/gcc/testsuite/gcc.target/i386/pr84101.c
> +++ b/gcc/testsuite/gcc.target/i386/pr84101.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O3 -fdump-tree-slp2-details" } */
> +/* { dg-options "-O3 -mtune=generic -fdump-tree-slp2-details" } */
>
>  typedef struct uint64_pair uint64_pair_t ;
>  struct uint64_pair
> --
> 2.17.1
>


-- 
BR,
Hongtao


[PATCH 2/2] Add a general mapping from internal fns to target insns

2022-06-10 Thread Richard Sandiford via Gcc-patches
Several existing internal functions map directly to an instruction
defined in target-insns.def.  This patch makes it easier to define
more such functions in future.

This should help to reduce cut-&-paste, but more importantly, it allows
the difference between optab functions and target-insns.def functions
to be abstracted away; both are now treated as “directly-mapped”.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


gcc/
* internal-fn.def (DEF_INTERNAL_INSN_FN): New macro.
(GOMP_SIMT_ENTER_ALLOC, GOMP_SIMT_EXIT, GOMP_SIMT_LANE)
(GOMP_SIMT_LAST_LANE, GOMP_SIMT_ORDERED_PRED, GOMP_SIMT_VOTE_ANY)
(GOMP_SIMT_XCHG_BFLY, GOMP_SIMT_XCHG_IDX): Use it.
* internal-fn.h (direct_internal_fn_info::directly_mapped): New
member variable.
(direct_internal_fn_info::vectorizable): Reduce to 1 bit.
(direct_internal_fn_p): Also return true for internal functions
that map directly to instructions defined target-insns.def.
(direct_internal_fn): Adjust comment accordingly.
* internal-fn.c (direct_insn, optab1, optab2, vectorizable_optab1)
(vectorizable_optab2): New local macros.
(not_direct): Initialize directly_mapped.
(mask_load_direct, load_lanes_direct, mask_load_lanes_direct)
(gather_load_direct, len_load_direct, mask_store_direct)
(store_lanes_direct, mask_store_lanes_direct, vec_cond_mask_direct)
(vec_cond_direct, scatter_store_direct, len_store_direct)
(vec_set_direct, unary_direct, binary_direct, ternary_direct)
(cond_unary_direct, cond_binary_direct, cond_ternary_direct)
(while_direct, fold_extract_direct, fold_left_direct)
(mask_fold_left_direct, check_ptrs_direct): Use the macros above.
(expand_GOMP_SIMT_ENTER_ALLOC, expand_GOMP_SIMT_EXIT): Delete
(expand_GOMP_SIMT_LANE, expand_GOMP_SIMT_LAST_LANE): Likewise;
(expand_GOMP_SIMT_ORDERED_PRED, expand_GOMP_SIMT_VOTE_ANY): Likewise.
(expand_GOMP_SIMT_XCHG_BFLY, expand_GOMP_SIMT_XCHG_IDX): Likewise.
(direct_internal_fn_types): Handle functions that map to instructions
defined in target-insns.def.
(direct_internal_fn_types): Likewise.
(direct_internal_fn_supported_p): Likewise.
(internal_fn_expanders): Likewise.
---
 gcc/internal-fn.cc  | 152 +++-
 gcc/internal-fn.def |  34 +++---
 gcc/internal-fn.h   |  20 +++---
 3 files changed, 87 insertions(+), 119 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index ab2b1baa893..a809953ce6f 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -105,37 +105,44 @@ init_internal_fns ()
 
 /* Create static initializers for the information returned by
direct_internal_fn.  */
-#define not_direct { -2, -2, false }
-#define mask_load_direct { -1, 2, false }
-#define load_lanes_direct { -1, -1, false }
-#define mask_load_lanes_direct { -1, -1, false }
-#define gather_load_direct { 3, 1, false }
-#define len_load_direct { -1, -1, false }
-#define mask_store_direct { 3, 2, false }
-#define store_lanes_direct { 0, 0, false }
-#define mask_store_lanes_direct { 0, 0, false }
-#define vec_cond_mask_direct { 1, 0, false }
-#define vec_cond_direct { 2, 0, false }
-#define scatter_store_direct { 3, 1, false }
-#define len_store_direct { 3, 3, false }
-#define vec_set_direct { 3, 3, false }
-#define unary_direct { 0, 0, true }
-#define binary_direct { 0, 0, true }
-#define ternary_direct { 0, 0, true }
-#define cond_unary_direct { 1, 1, true }
-#define cond_binary_direct { 1, 1, true }
-#define cond_ternary_direct { 1, 1, true }
-#define while_direct { 0, 2, false }
-#define fold_extract_direct { 2, 2, false }
-#define fold_left_direct { 1, 1, false }
-#define mask_fold_left_direct { 1, 1, false }
-#define check_ptrs_direct { 0, 0, false }
+#define not_direct { -2, -2, false, false }
+#define direct_insn{ -2, -2, true, false }
+#define optab1(TYPE0)  { TYPE0, TYPE0, true, false }
+#define optab2(TYPE0, TYPE1)   { TYPE0, TYPE1, true, false }
+#define vectorizable_optab1(TYPE0) { TYPE0, TYPE0, true, true }
+
+#define mask_load_direct   optab2 (-1, 2)
+#define load_lanes_direct  optab1 (-1)
+#define mask_load_lanes_direct optab1 (-1)
+#define gather_load_direct optab2 (3, 1)
+#define len_load_directoptab1 (-1)
+#define mask_store_direct  optab2 (3, 2)
+#define store_lanes_direct optab1 (0)
+#define mask_store_lanes_directoptab1 (0)
+#define vec_cond_mask_direct   optab2 (1, 0)
+#define vec_cond_directoptab2 (2, 0)
+#define scatter_store_direct   optab2 (3, 1)
+#define len_store_direct   optab1 (3)
+#define vec_set_direct optab1 (3)
+#define unary_direct   

[PATCH 1/2] Factor out common internal-fn idiom

2022-06-10 Thread Richard Sandiford via Gcc-patches
internal-fn.c has quite a few functions that simply map the result
of the call to an instruction's output operand (if any) and map
each argument to an instruction's input operand, in order.
This patch adds a single function for doing that.  It's really
just a generalisation of expand_direct_optab_fn, but with the
output operand being optional.

Unfortunately, it isn't possible to do this for vcond_mask
because the internal function has a different argument order
from the optab.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


gcc/
* internal-fn.cc (expand_fn_using_insn): New function,
split out and adapted from...
(expand_direct_optab_fn): ...here.
(expand_GOMP_SIMT_ENTER_ALLOC): Use it.
(expand_GOMP_SIMT_EXIT): Likewise.
(expand_GOMP_SIMT_LANE): Likewise.
(expand_GOMP_SIMT_LAST_LANE): Likewise.
(expand_GOMP_SIMT_ORDERED_PRED): Likewise.
(expand_GOMP_SIMT_VOTE_ANY): Likewise.
(expand_GOMP_SIMT_XCHG_BFLY): Likewise.
(expand_GOMP_SIMT_XCHG_IDX): Likewise.
---
 gcc/internal-fn.cc | 243 +
 1 file changed, 89 insertions(+), 154 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index 8b1733e20c4..ab2b1baa893 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -140,6 +140,86 @@ const direct_internal_fn_info 
direct_internal_fn_array[IFN_LAST + 1] = {
   not_direct
 };
 
+/* Expand STMT using instruction ICODE.  The instruction has NOUTPUTS
+   output operands and NINPUTS input operands, where NOUTPUTS is either
+   0 or 1.  The output operand (if any) comes first, followed by the
+   NINPUTS input operands.  */
+
+static void
+expand_fn_using_insn (gcall *stmt, insn_code icode, unsigned int noutputs,
+ unsigned int ninputs)
+{
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  expand_operand *ops = XALLOCAVEC (expand_operand, noutputs + ninputs);
+  unsigned int opno = 0;
+  rtx lhs_rtx = NULL_RTX;
+  tree lhs = gimple_call_lhs (stmt);
+
+  if (noutputs)
+{
+  gcc_assert (noutputs == 1);
+  if (lhs)
+   lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+
+  /* Do not assign directly to a promoted subreg, since there is no
+guarantee that the instruction will leave the upper bits of the
+register in the state required by SUBREG_PROMOTED_SIGN.  */
+  rtx dest = lhs_rtx;
+  if (dest && GET_CODE (dest) == SUBREG && SUBREG_PROMOTED_VAR_P (dest))
+   dest = NULL_RTX;
+  create_output_operand ([opno], dest,
+insn_data[icode].operand[opno].mode);
+  opno += 1;
+}
+  else
+gcc_assert (!lhs);
+
+  for (unsigned int i = 0; i < ninputs; ++i)
+{
+  tree rhs = gimple_call_arg (stmt, i);
+  tree rhs_type = TREE_TYPE (rhs);
+  rtx rhs_rtx = expand_normal (rhs);
+  if (INTEGRAL_TYPE_P (rhs_type))
+   create_convert_operand_from ([opno], rhs_rtx,
+TYPE_MODE (rhs_type),
+TYPE_UNSIGNED (rhs_type));
+  else
+   create_input_operand ([opno], rhs_rtx, TYPE_MODE (rhs_type));
+  opno += 1;
+}
+
+  gcc_assert (opno == noutputs + ninputs);
+  expand_insn (icode, opno, ops);
+  if (lhs_rtx && !rtx_equal_p (lhs_rtx, ops[0].value))
+{
+  /* If the return value has an integral type, convert the instruction
+result to that type.  This is useful for things that return an
+int regardless of the size of the input.  If the instruction result
+is smaller than required, assume that it is signed.
+
+If the return value has a nonintegral type, its mode must match
+the instruction result.  */
+  if (GET_CODE (lhs_rtx) == SUBREG && SUBREG_PROMOTED_VAR_P (lhs_rtx))
+   {
+ /* If this is a scalar in a register that is stored in a wider
+mode than the declared mode, compute the result into its
+declared mode and then convert to the wider mode.  */
+ gcc_checking_assert (INTEGRAL_TYPE_P (TREE_TYPE (lhs)));
+ rtx tmp = convert_to_mode (GET_MODE (lhs_rtx), ops[0].value, 0);
+ convert_move (SUBREG_REG (lhs_rtx), tmp,
+   SUBREG_PROMOTED_SIGN (lhs_rtx));
+   }
+  else if (GET_MODE (lhs_rtx) == GET_MODE (ops[0].value))
+   emit_move_insn (lhs_rtx, ops[0].value);
+  else
+   {
+ gcc_checking_assert (INTEGRAL_TYPE_P (TREE_TYPE (lhs)));
+ convert_move (lhs_rtx, ops[0].value, 0);
+   }
+}
+}
+
 /* ARRAY_TYPE is an array of vector modes.  Return the associated insn
for load-lanes-style optab OPTAB, or CODE_FOR_nothing if none.  */
 
@@ -233,22 +313,8 @@ expand_GOMP_SIMT_ENTER (internal_fn, gcall *)
 static void
 expand_GOMP_SIMT_ENTER_ALLOC (internal_fn, gcall *stmt)
 {
-  rtx target;
-  tree lhs = gimple_call_lhs (stmt);
-  if (lhs)
-target = expand_expr (lhs, NULL_RTX, 

[PATCH] testsuite: Add -mtune=generic to dg-options for two testcases.

2022-06-10 Thread Cui,Lili via Gcc-patches
This patch is to change dg-options for two testcases.

Use -mtune=generic to limit these two testcases. Because configuring them with
-mtune=cascadelake or znver3 will vectorize them.

regtested on x86_64-linux-gnu{-m32,}. Ok for trunk?

Thanks,
Lili.

Use -mtune=generic to limit these two test cases. Because configuring them with
-mtune=cascadelake or znver3 will vectorize them.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Add
-mtune=generic to dg-options.
* gcc.target/i386/pr84101.c: Likewise.
---
 .../gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c | 2 +-
 gcc/testsuite/gcc.target/i386/pr84101.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c 
b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c
index 7637cdb4a97..d060135d877 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */
+/* { dg-additional-options "-msse -mtune=generic -fdump-tree-slp2-details" } */
 
 struct S { unsigned long a, b; } s;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr84101.c 
b/gcc/testsuite/gcc.target/i386/pr84101.c
index cf144894f9b..2c5a97308ca 100644
--- a/gcc/testsuite/gcc.target/i386/pr84101.c
+++ b/gcc/testsuite/gcc.target/i386/pr84101.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O3 -fdump-tree-slp2-details" } */
+/* { dg-options "-O3 -mtune=generic -fdump-tree-slp2-details" } */
 
 typedef struct uint64_pair uint64_pair_t ;
 struct uint64_pair
-- 
2.17.1



Re: [PATCH] Add optional __Bfloat16 support

2022-06-10 Thread Hongtao Liu via Gcc-patches
On Fri, Jun 10, 2022 at 3:47 PM liuhongt via Libc-alpha
 wrote:
>
> Pass and return __Bfloat16 values in XMM registers.
>
> Background:
> __Bfloat16 (BF16) is a new floating-point format that can accelerate machine 
> learning (deep learning training, in particular) algorithms.
> It was first introduced by the Intel AVX-512 extension called AVX-512_BF16. 
> __Bfloat16 has 8 bits of exponent and 7 bits of mantissa and it's different 
> from _Float16.
>
> Motivation:
> Currently __bfloat16 is a typedef of short, which creates a problem where the 
> compiler does not raise any alarms if it is used to add, subtract, multiply 
> or divide, but the result of the calculation is actually meaningless.
> To solve this problem, a real scalar type __Bfloat16 needs to be introduced. 
> It is mainly used for intrinsics, not available for C standard operators. 
> __Bfloat16 will also be used for movement like passing parameter, load and 
> store, vector initialization, vector shuffle, etc. It creates a need for 
> a corresponding psABI.
>
> ---
>  x86-64-ABI/low-level-sys-info.tex | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/x86-64-ABI/low-level-sys-info.tex 
> b/x86-64-ABI/low-level-sys-info.tex
> index a8b69db..ba8db0d 100644
> --- a/x86-64-ABI/low-level-sys-info.tex
> +++ b/x86-64-ABI/low-level-sys-info.tex
> @@ -302,6 +302,12 @@ be used to represent the type, is a family of integer 
> types.
>  This permits the use of these types in allocated arrays using the common
>  sizeof(Array)/sizeof(ElementType) pattern.
>
> +\subsubsection{Special Types}
> +
> +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa.
> +It is used for \code{BF16} related intrinsics, it cannot be
> +used with standard C operators.
> +
>  \subsubsection{Aggregates and Unions}
>
>  Structures and unions assume the alignment of their most strictly
> @@ -563,8 +569,8 @@ The basic types are assigned their natural classes:
>  \item Arguments of types (signed and unsigned) \code{_Bool}, \code{char},
>\code{short}, \code{int}, \code{long}, \code{long long}, and
>pointers are in the INTEGER class.
> -\item Arguments of types \code{_Float16}, \code{float}, \code{double},
> -  \code{_Decimal32},
> +\item Arguments of types \code{_Float16}, \code{__Bfloat16}, \code{float},
> +  \code{double}, \code{_Decimal32},
>\code{_Decimal64} and \code{__m64} are in class SSE.
>  \item Arguments of types \code{__float128}, \code{_Decimal128}
>and \code{__m128} are split into two halves.  The least significant
> --
> 2.18.1
>


-- 
BR,
Hongtao


[PATCH] Add optional __Bfloat16 support

2022-06-10 Thread liuhongt via Gcc-patches
Pass and return __Bfloat16 values in XMM registers.

Background:
__Bfloat16 (BF16) is a new floating-point format that can accelerate machine 
learning (deep learning training, in particular) algorithms.
It was first introduced by the Intel AVX-512 extension called AVX-512_BF16. 
__Bfloat16 has 8 bits of exponent and 7 bits of mantissa and it's different 
from _Float16.

Motivation:
Currently __bfloat16 is a typedef of short, which creates a problem where the 
compiler does not raise any alarms if it is used to add, subtract, multiply or 
divide, but the result of the calculation is actually meaningless.
To solve this problem, a real scalar type __Bfloat16 needs to be introduced. It 
is mainly used for intrinsics, not available for C standard operators. 
__Bfloat16 will also be used for movement like passing parameter, load and 
store, vector initialization, vector shuffle, etc. It creates a need for a 
corresponding psABI.

---
 x86-64-ABI/low-level-sys-info.tex | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/x86-64-ABI/low-level-sys-info.tex 
b/x86-64-ABI/low-level-sys-info.tex
index a8b69db..ba8db0d 100644
--- a/x86-64-ABI/low-level-sys-info.tex
+++ b/x86-64-ABI/low-level-sys-info.tex
@@ -302,6 +302,12 @@ be used to represent the type, is a family of integer 
types.
 This permits the use of these types in allocated arrays using the common
 sizeof(Array)/sizeof(ElementType) pattern.
 
+\subsubsection{Special Types}
+
+The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa.
+It is used for \code{BF16} related intrinsics, it cannot be
+used with standard C operators.
+
 \subsubsection{Aggregates and Unions}
 
 Structures and unions assume the alignment of their most strictly
@@ -563,8 +569,8 @@ The basic types are assigned their natural classes:
 \item Arguments of types (signed and unsigned) \code{_Bool}, \code{char},
   \code{short}, \code{int}, \code{long}, \code{long long}, and
   pointers are in the INTEGER class.
-\item Arguments of types \code{_Float16}, \code{float}, \code{double},
-  \code{_Decimal32},
+\item Arguments of types \code{_Float16}, \code{__Bfloat16}, \code{float},
+  \code{double}, \code{_Decimal32},
   \code{_Decimal64} and \code{__m64} are in class SSE.
 \item Arguments of types \code{__float128}, \code{_Decimal128}
   and \code{__m128} are split into two halves.  The least significant
-- 
2.18.1



Re: [PATCH] aarch64: Lower vcombine to GIMPLE

2022-06-10 Thread Richard Sandiford via Gcc-patches
Andrew Carlotti via Gcc-patches  writes:
> Hi all,
>
> This lowers vcombine intrinsics to a GIMPLE vector constructor, which enables 
> better optimisation during GIMPLE passes.
>
> Bootstrapped and tested on aarch64-none-linux-gnu, and tested for 
> aarch64_be-none-linux-gnu via cross-compilation.
>
>
> gcc/
>
>   * config/aarch64/aarch64-builtins.c
>   (aarch64_general_gimple_fold_builtin): Add combine.
>
> gcc/testsuite/
>
>   * gcc.target/aarch64/advsimd-intrinsics/combine.c:
>   New test.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> b/gcc/config/aarch64/aarch64-builtins.cc
> index 
> 5217dbdb2ac78bba0a669d22af6d769d1fe91a3d..9d52fb8c5a48c9b743defb340a85fb20a1c8f014
>  100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -2827,6 +2827,18 @@ aarch64_general_gimple_fold_builtin (unsigned int 
> fcode, gcall *stmt,
> gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt));
> break;
>
> + BUILTIN_VDC (BINOP, combine, 0, AUTO_FP)
> + BUILTIN_VD_I (BINOPU, combine, 0, NONE)
> + BUILTIN_VDC_P (BINOPP, combine, 0, NONE)
> +   {
> + if (BYTES_BIG_ENDIAN)
> +   std::swap(args[0], args[1]);

We probably shouldn't do this swap in-place, since args refers directly
to the gimple statement.

> + tree ret_type = TREE_TYPE (gimple_call_lhs (stmt));
> + tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, args[0], 
> NULL_TREE, args[1]);

Minor formatting nit: lines should be under 80 chars.

Looks good otherwise, thanks, and sorry for the slow review.

Richard

> + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor);
> +   }
> +   break;
> +
>   /*lower store and load neon builtins to gimple.  */
>   BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD)
>   BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD)
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c 
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c
> new file mode 100644
> index 
> ..d08faf7a4a160a1e83428ed9b270731bbf7b8c8a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */
> +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
> +
> +#include 
> +
> +/*
> +** foo:
> +** umovw0, v1\.s\[1\]
> +** ret
> +*/
> +
> +int32_t foo (int32x2_t a, int32x2_t b)
> +{
> +  int32x4_t c = vcombine_s32(a, b);
> +  return vgetq_lane_s32(c, 3);
> +}
> +