Re: [PATCH 2/4] xtensa: Consider the Loop Option when setmemsi is expanded to small loop
On 2022/06/11 9:12, Max Filippov wrote: Hi Suwa-san, hi! This change results in a bunch of ICEs in tests that look like this: gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c: In function 'main': gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: error: unrecognizable insn: (insn 7 6 8 2 (set (reg:SI 45) (plus:SI (reg:SI 44) (const_int 262144 [0x4]))) oh, what a my mistake... it's so RISCy! int array[65535]; void test(void) { __builtin_memset(array, 0, sizeof(array)); } .literal_position .literal .LC0, array .literal .LC2, 65535 test: l32ra3, .LC0 l32ra2, .LC2 movi.n a4, 0 loopa2, .L2_LEND .L2: s32i.n a4, a3, 0 addi.n a3, a3, 4 .L2_LEND: ret.n --- gcc/config/xtensa/xtensa.cc | 71 ++--- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index c7b54babc37..bc3330f836f 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -1483,7 +1483,7 @@ xtensa_expand_block_set_unrolled_loop (rtx *operands) int xtensa_expand_block_set_small_loop (rtx *operands) { - HOST_WIDE_INT bytes, value, align; + HOST_WIDE_INT bytes, value, align, count; int expand_len, funccall_len; rtx x, dst, end, reg; machine_mode unit_mode; @@ -1503,17 +1503,25 @@ xtensa_expand_block_set_small_loop (rtx *operands) /* Totally-aligned block only. */ if (bytes % align != 0) return 0; + count = bytes / align; - /* If 4-byte aligned, small loop substitution is almost optimal, thus - limited to only offset to the end address for ADDI/ADDMI instruction. */ - if (align == 4 - && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0))) -return 0; + /* If the Loop Option (zero-overhead looping) is configured and active, + almost no restrictions about the length of the block. */ + if (! (TARGET_LOOPS && optimize)) +{ + /* If 4-byte aligned, small loop substitution is almost optimal, +thus limited to only offset to the end address for ADDI/ADDMI +instruction. */ + if (align == 4 + && ! 
(bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0))) + return 0; - /* If no 4-byte aligned, loop count should be treated as the constraint. */ - if (align != 4 - && bytes / align > ((optimize > 1 && !optimize_size) ? 8 : 15)) -return 0; + /* If no 4-byte aligned, loop count should be treated as the +constraint. */ + if (align != 4 + && count > ((optimize > 1 && !optimize_size) ? 8 : 15)) + return 0; +} /* Insn expansion: holding the init value. Either MOV(.N) or L32R w/litpool. */ @@ -1523,16 +1531,33 @@ xtensa_expand_block_set_small_loop (rtx *operands) expand_len = TARGET_DENSITY ? 2 : 3; else expand_len = 3 + 4; - /* Insn expansion: Either ADDI(.N) or ADDMI for the end address. */ - expand_len += bytes > 127 ? 3 - : (TARGET_DENSITY && bytes <= 15) ? 2 : 3; - - /* Insn expansion: the loop body and branch instruction. - For store, one of S8I, S16I or S32I(.N). - For advance, ADDI(.N). - For branch, BNE. */ - expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3) - + (TARGET_DENSITY ? 2 : 3) + 3; + if (TARGET_LOOPS && optimize) /* zero-overhead looping */ +{ + /* Insn translation: Either MOV(.N) or L32R w/litpool for the +loop count. */ + expand_len += xtensa_simm12b (count) ? xtensa_sizeof_MOVI (count) + : 3 + 4; + /* Insn translation: LOOP, the zero-overhead looping setup +instruction. */ + expand_len += 3; + /* Insn expansion: the loop body instructions. + For store, one of S8I, S16I or S32I(.N). + For advance, ADDI(.N). */ + expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3) + + (TARGET_DENSITY ? 2 : 3); +} + else /* NO zero-overhead looping */ +{ + /* Insn expansion: Either ADDI(.N) or ADDMI for the end address. */ + expand_len += bytes > 127 ? 3 + : (TARGET_DENSITY && bytes <= 15) ? 2 : 3; + /* Insn expansion: the loop body and branch instruction. + For store, one of S8I, S16I or S32I(.N). + For advance, ADDI(.N). + For branch, BNE. */ + expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3) + + (TARGET_DENSITY ? 
2 : 3) + 3; +} /* Function call: preparing two arguments. */ funccall_len = xtensa_sizeof_MOVI (value); @@ -1555,7 +1580,11 @@ xtensa_expand_block_set_small_loop (rtx *operands) dst = gen_reg_rtx (SImode); emit_move_insn (dst, x); end = gen_reg_rtx (SImode); - emit_insn (gen_addsi3 (end, dst, operands[1] /* the length */)); + if (TARGET_LOOPS && optimize) +x = force_reg (SImode,
Re: [PATCH 2/4] xtensa: Consider the Loop Option when setmemsi is expanded to small loop
Hi Suwa-san, On Thu, Jun 9, 2022 at 9:26 PM Takayuki 'January June' Suwa wrote: > > Now apply to almost any size of aligned block under such circumstances. > > gcc/ChangeLog: > > * config/xtensa/xtensa.cc (xtensa_expand_block_set_small_loop): > Pass through the block length / loop count conditions if > zero-overhead looping is configured and active, > --- > gcc/config/xtensa/xtensa.cc | 65 + > 1 file changed, 45 insertions(+), 20 deletions(-) This change results in a bunch of ICEs in tests that look like this: gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c: In function 'main': gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: error: unrecognizable insn: (insn 7 6 8 2 (set (reg:SI 45) (plus:SI (reg:SI 44) (const_int 262144 [0x4]))) "gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c":23:3 -1 (nil)) during RTL pass: vregs gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: internal compiler error: in extract_insn, at recog.cc:2791 0x6a21cf _fatal_insn(char const*, rtx_def const*, char const*, int, char const*) gcc/gcc/rtl-error.cc:108 0x6a2252 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*) gcc/gcc/rtl-error.cc:116 0x693824 extract_insn(rtx_insn*) gcc/gcc/recog.cc:2791 0xb27647 instantiate_virtual_regs_in_insn gcc/gcc/function.cc:1611 0xb27647 instantiate_virtual_regs gcc/gcc/function.cc:1985 0xb27647 execute gcc/gcc/function.cc:2034 -- Thanks. -- Max
Re: [PATCH] libstdc++: Rename __null_terminated to avoid collision with Apple SDK
Thanks, Jonathan. I am, in fact, so certifying. I do believe that bringing up support for new OS versions is in scope for open branches, and it makes sense to merge, particularly for a trivial and uncontentious patch like this one. Jonathan Wakely wrote: > On Fri, 10 Jun 2022 at 21:12, Mark Mentovai wrote: > > > > The macOS 13 SDK (and equivalent-version iOS and other Apple OS SDKs) > > contain this definition in : > > > > 863 #define __null_terminated > > > > This collides with the use of __null_terminated in libstdc++'s > > experimental fs_path.h. > > > > As libstdc++'s use of this token is entirely internal to fs_path.h, the > > simplest workaround, renaming it, is most appropriate. Here, it's > > renamed to __nul_terminated, referencing the NUL ('\0') value that is > > used to terminate the strings in the context in which this tag structure > > is used. > > > > libstdc++-v3/ChangeLog: > > > > * include/experimental/bits/fs_path.h: Rename __null_terminated > > to __nul_terminated avoid colliding with a macro in Apple's SDK. > > > > Signed-off-by: Mark Mentovai > > Thanks for the patch. The change makes sense so I'll get it committed. > Is this change needed on the release branches too? > > Just to be sure, could you please confirm that your Signed-off-by: tag > is to certify you agree with the DCO at https://gcc.gnu.org/dco.html > (and not just something you're doing because you've seen others doing > it :-) > > Thanks again. > >
Re: [PATCH] libstdc++: Rename __null_terminated to avoid collision with Apple SDK
On Fri, 10 Jun 2022 at 21:12, Mark Mentovai wrote: > > The macOS 13 SDK (and equivalent-version iOS and other Apple OS SDKs) > contain this definition in : > > 863 #define __null_terminated > > This collides with the use of __null_terminated in libstdc++'s > experimental fs_path.h. > > As libstdc++'s use of this token is entirely internal to fs_path.h, the > simplest workaround, renaming it, is most appropriate. Here, it's > renamed to __nul_terminated, referencing the NUL ('\0') value that is > used to terminate the strings in the context in which this tag structure > is used. > > libstdc++-v3/ChangeLog: > > * include/experimental/bits/fs_path.h: Rename __null_terminated > to __nul_terminated avoid colliding with a macro in Apple's SDK. > > Signed-off-by: Mark Mentovai Thanks for the patch. The change makes sense so I'll get it committed. Is this change needed on the release branches too? Just to be sure, could you please confirm that your Signed-off-by: tag is to certify you agree with the DCO at https://gcc.gnu.org/dco.html (and not just something you're doing because you've seen others doing it :-) Thanks again.
[PATCH] libstdc++: Rename __null_terminated to avoid collision with Apple SDK
The macOS 13 SDK (and equivalent-version iOS and other Apple OS SDKs) contain this definition in : 863 #define __null_terminated This collides with the use of __null_terminated in libstdc++'s experimental fs_path.h. As libstdc++'s use of this token is entirely internal to fs_path.h, the simplest workaround, renaming it, is most appropriate. Here, it's renamed to __nul_terminated, referencing the NUL ('\0') value that is used to terminate the strings in the context in which this tag structure is used. libstdc++-v3/ChangeLog: * include/experimental/bits/fs_path.h: Rename __null_terminated to __nul_terminated avoid colliding with a macro in Apple's SDK. Signed-off-by: Mark Mentovai --- libstdc++-v3/include/experimental/bits/fs_path.h | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libstdc++-v3/include/experimental/bits/fs_path.h b/libstdc++-v3/include/experimental/bits/fs_path.h index b0825ba76e80..19d246100cb5 100644 --- a/libstdc++-v3/include/experimental/bits/fs_path.h +++ b/libstdc++-v3/include/experimental/bits/fs_path.h @@ -140,10 +140,10 @@ namespace __detail inline _Source _S_range_begin(_Source __begin) { return __begin; } - struct __null_terminated { }; + struct __nul_terminated { }; template -inline __null_terminated +inline __nul_terminated _S_range_end(_Source) { return {}; } template @@ -459,11 +459,11 @@ namespace __detail struct _Cvt; static string_type -_S_convert(value_type* __src, __detail::__null_terminated) +_S_convert(value_type* __src, __detail::__nul_terminated) { return string_type(__src); } static string_type -_S_convert(const value_type* __src, __detail::__null_terminated) +_S_convert(const value_type* __src, __detail::__nul_terminated) { return string_type(__src); } template @@ -477,7 +477,7 @@ namespace __detail template static string_type - _S_convert(_InputIterator __src, __detail::__null_terminated) + _S_convert(_InputIterator __src, __detail::__nul_terminated) { auto __s = _S_string_from_iter(__src); return 
_S_convert(__s.c_str(), __s.c_str() + __s.size()); @@ -504,7 +504,7 @@ namespace __detail template static string_type - _S_convert_loc(_InputIterator __src, __detail::__null_terminated, + _S_convert_loc(_InputIterator __src, __detail::__nul_terminated, const std::locale& __loc) { const std::string __s = _S_string_from_iter(__src); -- 2.36.1
Re: [PATCH] regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041]
On 6/10/2022 9:40 AM, Segher Boessenkool wrote: Hi! On Fri, Jun 10, 2022 at 07:52:57PM +0530, Surya Kumari Jangala wrote: In check_new_reg_p, the nregs of a du chain is computed by obtaining the MODE of the first element in the chain, and then calling hard_regno_nregs() with the MODE. But the first element of the chain can be a DEBUG_INSN whose mode need not be the same as the rest of the elements in the du chain. This was resulting in fcompare-debug failure as check_new_reg_p was returning a different result with -g for the same candidate register. We can instead obtain nregs from the du chain itself. Great, thanks for finding and fixing this! I cannot approve it, you'll have to wait for someone who can. It looks fine to me, but that does not mean so much in regrename.c :-) I'll go ahead and ACK the regrename bits. So as soon as you're happy with the testsuite bits, this is good to go. jeff
Re: [PATCH] c++: Add support for __real__/__imag__ modifications in constant expressions [PR88174]
On Fri, Jun 10, 2022 at 01:27:28PM -0400, Jason Merrill wrote: > > --- gcc/cp/constexpr.cc.jj 2022-06-08 08:21:02.973448193 +0200 > > +++ gcc/cp/constexpr.cc 2022-06-08 17:13:04.986040449 +0200 > > @@ -5707,6 +5707,20 @@ cxx_eval_store_expression (const constex > > } > > break; > > + case REALPART_EXPR: > > + gcc_assert (probe == target); > > Doesn't this assert mean that complex_expr will always be == valp? No, even when handling the pushed *PART_EXPR, it will set valp = _OPERAND (*valp, index != integer_zero_node); So, valp will be either _OPERAND (*complex_expr, 0) or _OPERAND (*complex_expr, 1). As *valp = init; is what is usually then stored and we want to store there the scalar. > > @@ -5946,6 +5984,24 @@ cxx_eval_store_expression (const constex > > = get_or_insert_ctor_field (*valp, indexes[i], index_pos_hints[i]); > > valp = >value; > > } > > + if (complex_part != -1) > > + { > > + if (TREE_CODE (*valp) == COMPLEX_CST) > > + *valp = build2 (COMPLEX_EXPR, TREE_TYPE (*valp), > > + TREE_REALPART (*valp), > > + TREE_IMAGPART (*valp)); > > + else if (TREE_CODE (*valp) == CONSTRUCTOR > > + && CONSTRUCTOR_NELTS (*valp) == 0 > > + && CONSTRUCTOR_NO_CLEARING (*valp)) > > + { > > + tree r = build_constructor (TREE_TYPE (TREE_TYPE (*valp)), NULL); > > + CONSTRUCTOR_NO_CLEARING (r) = 1; > > + *valp = build2 (COMPLEX_EXPR, TREE_TYPE (*valp), r, r); > > + } > > + gcc_assert (TREE_CODE (*valp) == COMPLEX_EXPR); > > + complex_expr = valp; > > + valp = _OPERAND (*valp, complex_part); > > I don't understand this block; shouldn't valp point to the real or imag part > of the complex number at this point? How could complex_part be set without > us handling the complex case in the loop already? Because for most references, the code will do: vec_safe_push (ctors, *valp); vec_safe_push (indexes, index); I chose not to do this for *PART_EXPR, because the COMPLEX_EXPR isn't a CONSTRUCTOR and code later on e.g. walks all the ctors and accesses CONSTRUCTOR_NO_CLEARING on them etc. 
As the *PART_EXPR is asserted to be outermost only, complex_expr is a variant of that ctors push and complex_part of the indexes. The reason for the above if is just in case the evaluation of the rhs of the store would store to the complex and could e.g. make it a COMPLEX_CST again. > > + } > > } > > if (*non_constant_p) > > @@ -6016,6 +6072,22 @@ cxx_eval_store_expression (const constex > > if (TREE_CODE (TREE_TYPE (elt)) == UNION_TYPE) > > CONSTRUCTOR_NO_CLEARING (elt) = false; > > } > > + if (complex_expr) > > I might have added the COMPLEX_EXPR to ctors instead of a separate variable, > but this is fine too. See above. The COMPLEX_EXPR needs special handling (conversion into COMPLEX_CST if it is constant) anyway. Jakub
c++: Add a late-writing step for modules
To add a module initializer optimization, we need to defer finishing writing out the module file until the end of determining the dynamic initializers. This is achieved by passing some saved-state from the main module writing to a new function that completes it. This patch merely adds the skeleton of that state and move things around, allowing the finalization of the ELF file to be postponed. None of the contents writing is moved, or the init optimization added. nathan -- Nathan SidwellFrom e6d369bbdb4eb5f03eec233ef9905013a735fd71 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Thu, 9 Jun 2022 08:14:31 -0700 Subject: [PATCH] c++: Add a late-writing step for modules To add a module initializer optimization, we need to defer finishing writing out the module file until the end of determining the dynamic initializers. This is achieved by passing some saved-state from the main module writing to a new function that completes it. This patch merely adds the skeleton of that state and move things around, allowing the finalization of the ELF file to be postponed. None of the contents writing is moved, or the init optimization added. gcc/cp/ * cp-tree.h (fini_modules): Add some parameters. (finish_module_processing): Return an opaque pointer. * decl2.cc (c_parse_final_cleanups): Propagate a cookie from finish_module_processing to fini_modules. * module.cc (struct module_processing_cookie): New. (finish_module_processing): Return a heap-allocated cookie. (late_finish_module): New. Finish out the module writing. (fini_modules): Adjust. 
--- gcc/cp/cp-tree.h | 4 +- gcc/cp/decl2.cc | 4 +- gcc/cp/module.cc | 145 ++- 3 files changed, 98 insertions(+), 55 deletions(-) diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h index f1294dac7d5..60d7b201595 100644 --- a/gcc/cp/cp-tree.h +++ b/gcc/cp/cp-tree.h @@ -7209,9 +7209,9 @@ extern void import_module (module_state *, location_t, bool export_p, extern void declare_module (module_state *, location_t, bool export_p, tree attr, cpp_reader *); extern void init_modules (cpp_reader *); -extern void fini_modules (); +extern void fini_modules (cpp_reader *, void *cookie); extern void maybe_check_all_macros (cpp_reader *); -extern void finish_module_processing (cpp_reader *); +extern void *finish_module_processing (cpp_reader *); extern char const *module_name (unsigned, bool header_ok); extern bitmap get_import_bitmap (); extern bitmap visible_instantiation_path (bitmap *); diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc index ff1c36745cf..cc0b41324b3 100644 --- a/gcc/cp/decl2.cc +++ b/gcc/cp/decl2.cc @@ -5154,7 +5154,7 @@ c_parse_final_cleanups (void) reconsider = true; } - finish_module_processing (parse_in); + void *module_cookie = finish_module_processing (parse_in); lower_var_init (); @@ -5238,7 +5238,7 @@ c_parse_final_cleanups (void) } pop_lang_context (); - fini_modules (); + fini_modules (parse_in, module_cookie); /* Generate any missing aliases. 
*/ maybe_apply_pending_pragma_weaks (); diff --git a/gcc/cp/module.cc b/gcc/cp/module.cc index 2b1877ea82e..51d774ae608 100644 --- a/gcc/cp/module.cc +++ b/gcc/cp/module.cc @@ -19854,11 +19854,32 @@ maybe_check_all_macros (cpp_reader *reader) dump.pop (n); } +// State propagated from finish_module_processing to fini_modules +struct module_processing_cookie +{ + elf_out out; + char *cmi_name; + char *tmp_name; + bool began; + + module_processing_cookie (char *cmi, char *tmp, int fd, int e) +: out (fd, e), cmi_name (cmi), tmp_name (tmp), began (false) + { + } + ~module_processing_cookie () + { +XDELETEVEC (tmp_name); +XDELETEVEC (cmi_name); + } +}; + /* Write the CMI, if we're a module interface. */ -void +void * finish_module_processing (cpp_reader *reader) { + module_processing_cookie *cookie = nullptr; + if (header_module_p ()) module_kind &= ~MK_EXPORTING; @@ -19870,7 +19891,7 @@ finish_module_processing (cpp_reader *reader) else if (!flag_syntax_only) { int fd = -1; - int e = ENOENT; + int e = -1; timevar_start (TV_MODULE_EXPORT); @@ -19879,7 +19900,7 @@ finish_module_processing (cpp_reader *reader) linemap_add (line_table, LC_ENTER, false, "", 0); /* We write to a tmpname, and then atomically rename. */ - const char *path = NULL; + char *cmi_name = NULL; char *tmp_name = NULL; module_state *state = (*modules)[0]; @@ -19888,9 +19909,9 @@ finish_module_processing (cpp_reader *reader) if (state->filename) { size_t len = 0; - path = maybe_add_cmi_prefix (state->filename, ); + cmi_name = xstrdup (maybe_add_cmi_prefix (state->filename, )); tmp_name = XNEWVEC (char, len + 3); - memcpy (tmp_name, path, len); + memcpy (tmp_name, cmi_name, len); strcpy (_name[len], "~"); if (!errorcount) @@ -19905,57 +19926,23 @@ finish_module_processing (cpp_reader
[PATCH] i386: Fix up *3_doubleword_mask [PR105911]
Hi! Another regression caused by my recent patch. This time because define_insn_and_split only requires that the constant mask is const_int_operand. When it was only SImode, that wasn't a problem, HImode neither, but for DImode if we need to and the shift count we might run into a problem that it isn't a representable signed 32-bit immediate. But, we don't really care about the upper bits of the mask, so we can just mask the CONST_INT with the mode mask. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2022-06-10 Jakub Jelinek PR target/105911 * config/i386/i386.md ((*ashl3_doubleword_mask, *3_doubleword_mask): Use operands[3] masked with ( * BITS_PER_UNIT) - 1 as AND operand instead of operands[3] unmodified. * gcc.dg/pr105911.c: New test. --- gcc/config/i386/i386.md.jj 2022-06-08 08:21:26.0 +0200 +++ gcc/config/i386/i386.md 2022-06-10 11:37:21.931171567 +0200 @@ -11937,7 +11937,8 @@ (define_insn_and_split "*ashl3_doub rtx xops[3]; xops[0] = gen_reg_rtx (GET_MODE (operands[2])); xops[1] = operands[2]; - xops[2] = operands[3]; + xops[2] = GEN_INT (INTVAL (operands[3]) +& (( * BITS_PER_UNIT) - 1)); ix86_expand_binary_operator (AND, GET_MODE (operands[2]), xops); operands[2] = xops[0]; } @@ -12905,7 +12906,8 @@ (define_insn_and_split "*3_do rtx xops[3]; xops[0] = gen_reg_rtx (GET_MODE (operands[2])); xops[1] = operands[2]; - xops[2] = operands[3]; + xops[2] = GEN_INT (INTVAL (operands[3]) +& (( * BITS_PER_UNIT) - 1)); ix86_expand_binary_operator (AND, GET_MODE (operands[2]), xops); operands[2] = xops[0]; } --- gcc/testsuite/gcc.dg/pr105911.c.jj 2022-06-10 11:45:38.314044503 +0200 +++ gcc/testsuite/gcc.dg/pr105911.c 2022-06-10 11:45:18.068253633 +0200 @@ -0,0 +1,16 @@ +/* PR target/105911 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2" } */ + +__int128 v, x; +unsigned __int128 w; + +void bar (__int128, __int128); + +void +foo (void) +{ + bar (v /= v, v >> (v &= 0x10001)); + bar (w /= w, w >> (w &= 0x30003)); + bar (x /= x, x << 
(x &= 0x70007)); +} Jakub
Re: [committed] openmp: Add support for HBW or large capacity or interleaved memory through the libmemkind.so library
On Thu, Jun 09, 2022 at 01:57:52PM +0200, Jakub Jelinek via Gcc-patches wrote: > On Thu, Jun 09, 2022 at 12:11:28PM +0200, Thomas Schwinge wrote: > > On 2022-06-09T10:19:03+0200, Jakub Jelinek via Gcc-patches > > wrote: > > > This patch adds support for dlopening libmemkind.so > > > > Instead of 'dlopen'ing literally 'libmemkind.so': > > > > > --- libgomp/allocator.c.jj2022-06-08 08:21:03.099446883 +0200 > > > +++ libgomp/allocator.c 2022-06-08 13:41:45.647133610 +0200 > > > > > + void *handle = dlopen ("libmemkind.so", RTLD_LAZY); > > > > ..., shouldn't this instead 'dlopen' 'libmemkind.so.0'? At least for > > Debian/Ubuntu, the latter ('libmemkind.so.0') is shipped in the "library" > > package: > > I agree and I've actually noticed it too right before committing, but I > thought > I'll investigate and tweak incrementally because "libmemkind.so" > is what I've actually tested (it is what llvm libomp uses). And here is what I've committed after bootstrapping/regtesting it on x86_64-linux and i686-linux. 2022-06-10 Jakub Jelinek * allocator.c (gomp_init_memkind): Call dlopen with "libmemkind.so.0" rather than "libmemkind.so". --- libgomp/allocator.c.jj 2022-06-09 10:14:33.470973961 +0200 +++ libgomp/allocator.c 2022-06-09 14:05:33.665803457 +0200 @@ -99,7 +99,7 @@ static pthread_once_t memkind_data_once static void gomp_init_memkind (void) { - void *handle = dlopen ("libmemkind.so", RTLD_LAZY); + void *handle = dlopen ("libmemkind.so.0", RTLD_LAZY); struct gomp_memkind_data *data; int i; static const char *kinds[] = { Jakub
[PING][PATCH] Add instruction level discriminator support.
Hello, I'd like to ping this patch: https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596065.html Thanks, Eugene -Original Message- From: Gcc-patches On Behalf Of Eugene Rozenfeld via Gcc-patches Sent: Thursday, June 02, 2022 12:22 AM To: gcc-patches@gcc.gnu.org; Andi Kleen ; Jan Hubicka Subject: [EXTERNAL] [PATCH] Add instruction level discriminator support. This is the first in a series of patches to enable discriminator support in AutoFDO. This patch switches to tracking discriminators per statement/instruction instead of per basic block. Tracking per basic block was problematic since not all statements in a basic block needed a discriminator and, also, later optimizations could move statements between basic blocks making correlation during AutoFDO compilation unreliable. Tracking per statement also allows us to assign different discriminators to multiple function calls in the same basic block. A subsequent patch will add that support. The idea of this patch is based on commit 4c311d95cf6d9519c3c20f641cc77af7df491fdf by Dehao Chen in vendors/google/heads/gcc-4_8 but uses a slightly different approach. In Dehao's work special (normally unused) location ids and side tables were used to keep track of locations with discriminators. Things have changed since then and I don't think we have unused location ids anymore. Instead, I made discriminators a part of ad-hoc locations. The difference from Dehao's work also includes support for discriminator reading/writing in lto streaming and in modules. Tested on x86_64-pc-linux-gnu. 0001-Add-instruction-level-discriminator-support.patch Description: 0001-Add-instruction-level-discriminator-support.patch
[PATCH] x86: Require AVX for F16C and VAES
Since F16C and VAES are only usable with AVX, require AVX for F16C and VAES. OK for master and release branches? Thanks. H.J. --- libgcc/105920 * common/config/i386/cpuinfo.h (get_available_features): Require AVX for F16C and VAES. --- gcc/common/config/i386/cpuinfo.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h index adc02bc3d98..bbced8a23b9 100644 --- a/gcc/common/config/i386/cpuinfo.h +++ b/gcc/common/config/i386/cpuinfo.h @@ -651,8 +651,6 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_MOVBE); if (ecx & bit_AES) set_feature (FEATURE_AES); - if (ecx & bit_F16C) -set_feature (FEATURE_F16C); if (ecx & bit_RDRND) set_feature (FEATURE_RDRND); if (ecx & bit_XSAVE) @@ -663,6 +661,8 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_AVX); if (ecx & bit_FMA) set_feature (FEATURE_FMA); + if (ecx & bit_F16C) + set_feature (FEATURE_F16C); } /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */ @@ -683,6 +683,8 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_AVX2); if (ecx & bit_VPCLMULQDQ) set_feature (FEATURE_VPCLMULQDQ); + if (ecx & bit_VAES) + set_feature (FEATURE_VAES); } if (ebx & bit_BMI2) set_feature (FEATURE_BMI2); @@ -705,8 +707,6 @@ get_available_features (struct __processor_model *cpu_model, set_feature (FEATURE_PKU); if (ecx & bit_RDPID) set_feature (FEATURE_RDPID); - if (ecx & bit_VAES) - set_feature (FEATURE_VAES); if (ecx & bit_GFNI) set_feature (FEATURE_GFNI); if (ecx & bit_MOVDIRI) -- 2.36.1
Re: [PATCH] Add optional __Bfloat16 support
On Fri, Jun 10, 2022 at 7:44 AM H.J. Lu wrote: > > On Fri, Jun 10, 2022 at 2:38 AM Florian Weimer wrote: > > > > * liuhongt via Libc-alpha: > > > > > +\subsubsection{Special Types} > > > + > > > +The \code{__Bfloat16} type uses an 8-bit exponent and 7-bit mantissa. > > > +It is used for \code{BF16} related intrinsics, it cannot be > > Please mention that this is an alternate encoding format for 16-bit floating > point. It has the same size and alignment as _Float16. It also follows the same rules as _Float16 for parameter passing and function return. > > > +used with standard C operators. > > > > I think it's not necessary to specify whether the type supports certain > > C operators (surely assignment will work?). If they are added later, > > the ABI won't need changing. > > > > If _Bfloat16 becomes a fundamental type, the ABI should be changed to > move it together with other scalar types. > > -- > H.J. -- H.J.
Re: [PATCH V2] Disable generating load/store vector pairs for block copies.
Hi! On Fri, Jun 10, 2022 at 11:27:40AM -0400, Michael Meissner wrote: > Testing has found that using store vector pair for block copies can result > in a slow down on power10. This patch disables using the vector pair > instructions for block copies if we are tuning for power10. Load paired should be disabled as well, for the same reason. The patch seems to do that fine? Please fix the commit message. Thanks, Segher > 2022-06-09 Michael Meissner > > gcc/ > * config/rs6000/rs6000.cc (rs6000_option_override_internal): Do > not generate block copies with vector pair instructions if we are > tuning for power10. > --- > gcc/config/rs6000/rs6000.cc | 5 - > 1 file changed, 4 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc > index 0af2085adc0..59481d9ac70 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -4141,7 +4141,10 @@ rs6000_option_override_internal (bool global_init_p) > >if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR)) > { > - if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX) > + /* Do not generate lxvp and stxvp on power10 since there are some > + performance issues. */ > + if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX > + && rs6000_tune != PROCESSOR_POWER10) > rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR; >else > rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR;
Re: [PATCH] c++: Add support for __real__/__imag__ modifications in constant expressions [PR88174]
On 6/9/22 04:37, Jakub Jelinek wrote: Hi! We claim we support P0415R1 (constexpr complex), but e.g. #include constexpr bool foo () { std::complex a (1.0, 2.0); a += 3.0; a.real (6.0); return a.real () == 6.0 && a.imag () == 2.0; } static_assert (foo ()); fails with test.C:12:20: error: non-constant condition for static assertion 12 | static_assert (foo ()); |^~ test.C:12:20: in ‘constexpr’ expansion of ‘foo()’ test.C:8:10: in ‘constexpr’ expansion of ‘a.std::complex::real(6.0e+0)’ test.C:12:20: error: modification of ‘__real__ a.std::complex::_M_value’ is not a constant expression The problem is we don't handle REALPART_EXPR and IMAGPART_EXPR in cxx_eval_store_expression. The following patch attempts to support it (with a requirement that those are the outermost expressions, ARRAY_REF/COMPONENT_REF etc. are just not possible on the result of these, BIT_FIELD_REF would be theoretically possible if trying to extract some bits from one part of a complex int, but I don't see how it could appear in the FE trees. For these references, the code handles value being COMPLEX_CST, COMPLEX_EXPR or CONSTRUCTOR_NO_CLEARING empty CONSTRUCTOR (what we use to represent uninitialized values for C++20 and later) and the code starts by rewriting it to COMPLEX_EXPR, so that we can freely adjust the individual parts and later on possibly optimize it back to COMPLEX_CST if both halves are constant. Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2022-06-09 Jakub Jelinek PR c++/88174 * constexpr.cc (cxx_eval_store_expression): Handle REALPART_EXPR and IMAGPART_EXPR. * g++.dg/cpp1y/constexpr-complex1.C: New test. --- gcc/cp/constexpr.cc.jj 2022-06-08 08:21:02.973448193 +0200 +++ gcc/cp/constexpr.cc 2022-06-08 17:13:04.986040449 +0200 @@ -5707,6 +5707,20 @@ cxx_eval_store_expression (const constex } break; + case REALPART_EXPR: + gcc_assert (probe == target); Doesn't this assert mean that complex_expr will always be == valp? 
+ vec_safe_push (refs, integer_zero_node); + vec_safe_push (refs, TREE_TYPE (probe)); + probe = TREE_OPERAND (probe, 0); + break; + + case IMAGPART_EXPR: + gcc_assert (probe == target); + vec_safe_push (refs, integer_one_node); + vec_safe_push (refs, TREE_TYPE (probe)); + probe = TREE_OPERAND (probe, 0); + break; + default: if (evaluated) object = probe; @@ -5749,6 +5763,8 @@ cxx_eval_store_expression (const constex auto_vec index_pos_hints; bool activated_union_member_p = false; bool empty_base = false; + int complex_part = -1; + tree *complex_expr = NULL; while (!refs->is_empty ()) { if (*valp == NULL_TREE) @@ -5785,14 +5801,36 @@ cxx_eval_store_expression (const constex *valp = ary_ctor; } - /* If the value of object is already zero-initialized, any new ctors for -subobjects will also be zero-initialized. */ - no_zero_init = CONSTRUCTOR_NO_CLEARING (*valp); - enum tree_code code = TREE_CODE (type); tree reftype = refs->pop(); tree index = refs->pop(); + if (code == COMPLEX_TYPE) + { + if (TREE_CODE (*valp) == COMPLEX_CST) + *valp = build2 (COMPLEX_EXPR, type, TREE_REALPART (*valp), + TREE_IMAGPART (*valp)); + else if (TREE_CODE (*valp) == CONSTRUCTOR + && CONSTRUCTOR_NELTS (*valp) == 0 + && CONSTRUCTOR_NO_CLEARING (*valp)) + { + tree r = build_constructor (reftype, NULL); + CONSTRUCTOR_NO_CLEARING (r) = 1; + *valp = build2 (COMPLEX_EXPR, type, r, r); + } + gcc_assert (TREE_CODE (*valp) == COMPLEX_EXPR); + complex_expr = valp; + valp = _OPERAND (*valp, index != integer_zero_node); + gcc_checking_assert (refs->is_empty ()); + type = reftype; + complex_part = index != integer_zero_node; + break; + } + + /* If the value of object is already zero-initialized, any new ctors for +subobjects will also be zero-initialized. 
*/ + no_zero_init = CONSTRUCTOR_NO_CLEARING (*valp); + if (code == RECORD_TYPE && is_empty_field (index)) /* Don't build a sub-CONSTRUCTOR for an empty base or field, as they have no data and might have an offset lower than previously declared @@ -5946,6 +5984,24 @@ cxx_eval_store_expression (const constex = get_or_insert_ctor_field (*valp, indexes[i], index_pos_hints[i]); valp = >value; } + if (complex_part != -1) + { + if (TREE_CODE (*valp) == COMPLEX_CST) + *valp = build2 (COMPLEX_EXPR, TREE_TYPE (*valp), + TREE_REALPART (*valp), + TREE_IMAGPART (*valp)); +
Re: [PATCH] c++: optimize specialization of nested class templates
On 6/10/22 12:00, Patrick Palka wrote: On Fri, 10 Jun 2022, Patrick Palka wrote: On Thu, 9 Jun 2022, Patrick Palka wrote: On Thu, 9 Jun 2022, Jason Merrill wrote: On 6/8/22 14:21, Patrick Palka wrote: When substituting a class template specialization, tsubst_aggr_type substitutes the TYPE_CONTEXT before passing it to lookup_template_class. This appears to be unnecessary, however, because the the initial value of lookup_template_class's context parameter is unused outside of the IDENTIFIER_NODE case, and l_t_c performs its own substitution of the context, anyway. So this patch removes the redundant substitution in tsubst_aggr_type. Doing so causes us to ICE on template/nested5.C because during lookup_template_class for A::C::D with T=E and S=S, we substitute and complete the context A::C with T=E, which in turn registers the desired dependent specialization of D for us and we end up trying to register it again. This patch fixes this by checking the specializations table again after completion of the context. This patch also implements a couple of other optimizations: * In lookup_template_class, if the context of the partially instantiated template is already non-dependent, then we could reuse that instead of substituting the context of the most general template. * When substituting the TYPE_DECL for an injected-class-name in tsubst_decl, we can avoid substituting its TREE_TYPE and DECL_TI_ARGS. Together these optimizations improve memory usage for the range-v3 testcase test/view/split.cc by about 5%. The improvement is probably more significant when dealing with deeply nested class templates. Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for trunk? gcc/cp/ChangeLog: * pt.cc (lookup_template_class): Remove dead stores to context parameter. Don't substitute the context of the most general template if that of the partially instantiated template is non-dependent. 
Check the specializations table again after completing the context of a nested dependent specialization. (tsubst_aggr_type) : Don't substitute TYPE_CONTEXT or pass it to lookup_template_class. (tsubst_decl) : Avoid substituting the TREE_TYPE and DECL_TI_ARGS when DECL_SELF_REFERENCE_P. --- gcc/cp/pt.cc | 69 +++- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc index 59b94317e88..28023d60684 100644 --- a/gcc/cp/pt.cc +++ b/gcc/cp/pt.cc @@ -9840,8 +9840,6 @@ lookup_template_class (tree d1, tree arglist, tree in_decl, tree context, if (context) pop_decl_namespace (); } - if (templ) - context = DECL_CONTEXT (templ); } else if (TREE_CODE (d1) == TYPE_DECL && MAYBE_CLASS_TYPE_P (TREE_TYPE (d1))) { @@ -9868,7 +9866,6 @@ lookup_template_class (tree d1, tree arglist, tree in_decl, tree context, { templ = d1; d1 = DECL_NAME (templ); - context = DECL_CONTEXT (templ); } else if (DECL_TEMPLATE_TEMPLATE_PARM_P (d1)) { @@ -10059,8 +10056,25 @@ lookup_template_class (tree d1, tree arglist, tree in_decl, tree context, context = DECL_CONTEXT (gen_tmpl); if (context && TYPE_P (context)) { - context = tsubst_aggr_type (context, arglist, complain, in_decl, true); - context = complete_type (context); + if (!uses_template_parms (DECL_CONTEXT (templ))) + /* If the context of the partially instantiated template is + already non-dependent, then we might as well use it. */ + context = DECL_CONTEXT (templ); + else + { + context = tsubst_aggr_type (context, arglist, complain, in_decl, true); + context = complete_type (context); + if (is_dependent_type && arg_depth > 1) + { + /* If this is a dependent nested specialization such as +A::B, then completion of A might have +registered this specialization of B for us, so check +the table again (33959). 
*/ + entry = type_specializations->find_with_hash (, hash); + if (entry) + return entry->spec; + } + } } else context = tsubst (context, arglist, complain, in_decl); @@ -13711,25 +13725,12 @@ tsubst_aggr_type (tree t, if (TYPE_TEMPLATE_INFO (t) && uses_template_parms (t)) { tree argvec; - tree context; tree r; /* In "sizeof(X)" we need to evaluate "I". */ cp_evaluated ev; - /* First, determine the context for the type we are looking -up. */ - context = TYPE_CONTEXT (t); - if (context && TYPE_P (context)) - { - context =
c++: Adjust module initializer calling emission
We special-case emitting the calls of module initializer functions. It's simpler to just emit a static fn to do that, and add it onto the front of the global init fn chain. We can also move the calculation of the set of initializers to call to the point of use. nathan -- Nathan Sidwell From 8834d2d35fcc229c00e2e06e8be8b052c803d8cd Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Fri, 10 Jun 2022 05:22:21 -0700 Subject: [PATCH] c++: Adjust module initializer calling emission We special-case emitting the calls of module initializer functions. It's simpler to just emit a static fn to do that, and add it onto the front of the global init fn chain. We can also move the calculation of the set of initializers to call to the point of use. gcc/cp/ * cp-tree.h (module_has_import_init): Rename to ... (module_determined_import_inits): ... here. * decl2.cc (start_objects): Do not handle module initializers here. (c_parse_final_cleanups): Generate a separate module initializer calling function and add it to the list. Shrink the c-lang region. * module.cc (num_init_calls_needed): Delete. (module_has_import_init): Rename to ... (module_determined_import_inits): ... here. Do the calculation here ... (finish_module_processing): ... rather than here. (module_add_import_initializers): Reformat. gcc/testsuite/ * g++.dg/modules/init-3_a.C: New. * g++.dg/modules/init-3_b.C: New. * g++.dg/modules/init-3_c.C: New. 
--- gcc/cp/cp-tree.h| 2 +- gcc/cp/decl2.cc | 47 +- gcc/cp/module.cc| 110 +++- gcc/testsuite/g++.dg/modules/init-3_a.C | 17 gcc/testsuite/g++.dg/modules/init-3_b.C | 6 ++ gcc/testsuite/g++.dg/modules/init-3_c.C | 17 6 files changed, 117 insertions(+), 82 deletions(-) create mode 100644 gcc/testsuite/g++.dg/modules/init-3_a.C create mode 100644 gcc/testsuite/g++.dg/modules/init-3_b.C create mode 100644 gcc/testsuite/g++.dg/modules/init-3_c.C diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h index a5d93282167..f1294dac7d5 100644 --- a/gcc/cp/cp-tree.h +++ b/gcc/cp/cp-tree.h @@ -7180,7 +7180,7 @@ extern module_state *get_module (tree name, module_state *parent = NULL, extern bool module_may_redeclare (tree decl); extern bool module_global_init_needed (); -extern bool module_has_import_inits (); +extern bool module_determine_import_inits (); extern void module_add_import_initializers (); /* Where the namespace-scope decl was originally declared. */ diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc index 9de9a7a4f8a..ff1c36745cf 100644 --- a/gcc/cp/decl2.cc +++ b/gcc/cp/decl2.cc @@ -3903,8 +3903,7 @@ start_objects (bool initp, unsigned priority, bool has_body) tree body = begin_compound_stmt (BCS_FN_BODY); - bool has_import_inits = default_init && module_has_import_inits (); - if (is_module_init && (has_import_inits || has_body)) + if (is_module_init && has_body) { // If the function is going to be empty, don't emit idempotency. // 'static bool __in_chrg = false; @@ -3930,9 +3929,6 @@ start_objects (bool initp, unsigned priority, bool has_body) finish_expr_stmt (assign); } - if (has_import_inits) -module_add_import_initializers (); - return body; } @@ -5195,6 +5191,12 @@ c_parse_final_cleanups (void) maybe_warn_sized_delete (); + // Place the init fns in the right order. We need to do this now, + // so that any module init will go at the start. 
+ if (static_init_fini_fns[true]) +for (auto iter : *static_init_fini_fns[true]) + iter.second = nreverse (iter.second); + /* Then, do the Objective-C stuff. This is where all the Objective-C module stuff gets generated (symtab, class/protocol/selector lists etc). This must be done after C++ @@ -5203,11 +5205,18 @@ c_parse_final_cleanups (void) if (c_dialect_objc ()) objc_write_global_declarations (); - /* We give C linkage to static constructors and destructors. */ - push_lang_context (lang_name_c); + if (module_determine_import_inits ()) +{ + input_location = locus_at_end_of_parsing; + tree body = start_partial_init_fini_fn (true, DEFAULT_INIT_PRIORITY, + ssdf_count++); + module_add_import_initializers (); + input_location = locus_at_end_of_parsing; + finish_partial_init_fini_fn (body); +} if ((c_dialect_objc () && objc_static_init_needed_p ()) - || module_global_init_needed () || module_has_import_inits ()) + || module_global_init_needed ()) { // Make sure there's a default priority entry. if (!static_init_fini_fns[true]) @@ -5216,32 +5225,24 @@ c_parse_final_cleanups (void) } /* Generate initialization and destruction functions for all - priorities for which they are required. */ + priorities for which they are required. They have C-language + linkage. */ + push_lang_context (lang_name_c); for (unsigned
Re: [PATCH 2/1] c++: optimize specialization of templated member functions
On 6/9/22 15:37, Patrick Palka wrote: On Thu, 9 Jun 2022, Jason Merrill wrote: On 6/9/22 09:00, Patrick Palka wrote: This performs one of the optimizations added by the previous patch to lookup_template_class, to instantiate_template as well. (For the libstdc++ ranges tests this optimization appears to be effective around 30% of the time, i.e. 30% of the time context of 'tmpl' is non-dependent while the context of 'gen_tmpl' is dependent.) If this is a significant optimization, how about doing it in tsubst_aggr_type rather than its callers? I'm not sure how we'd do this optimization in tsubst_aggr_type? Oops, I was overlooking the gen_tmpl vs. tmpl difference. I haven't observed any significant time/memory improvements based on my limited benchmarking, but I can imagine for deeply nested templates it could be significant. And avoiding redundant work should hopefully help streamline debugging I suppose. OK. gcc/cp/ChangeLog: * pt.cc (instantiate_template): Don't substitute the context of the most general template if that of the partially instantiated template is non-dependent. --- gcc/cp/pt.cc | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc index e021c254872..208daad298a 100644 --- a/gcc/cp/pt.cc +++ b/gcc/cp/pt.cc @@ -21661,8 +21661,14 @@ instantiate_template (tree tmpl, tree orig_args, tsubst_flags_t complain) ++processing_template_decl; if (DECL_CLASS_SCOPE_P (gen_tmpl)) { - tree ctx = tsubst_aggr_type (DECL_CONTEXT (gen_tmpl), targ_ptr, - complain, gen_tmpl, true); + tree ctx; + if (!uses_template_parms (DECL_CONTEXT (tmpl))) + /* If the context of the partially instantiated template is already + non-dependent, then we might as well use it. */ + ctx = DECL_CONTEXT (tmpl); + else + ctx = tsubst_aggr_type (DECL_CONTEXT (gen_tmpl), targ_ptr, + complain, gen_tmpl, true); push_nested_class (ctx); }
Re: [PATCH] c++: improve TYPENAME_TYPE hashing [PR65328]
On 6/10/22 09:40, Patrick Palka wrote: The reason compiling the testcase in this PR is so slow is ultimately due to our poor hashing of TYPENAME_TYPE causing a huge amount of hash table collisions in the spec_hasher and typename_hasher tables. In spec_hasher, we don't hash the components of a TYPENAME_TYPE at all, presumably because TYPENAME_TYPE equivalence as determined by structural_comptypes depends on whether the comparing_specializations flag is set. This patch fixes this by setting comparing_specializations from spec_hasher::hash, and making iterative_hash_template_arg hash the relevant components of a TYPENAME_TYPE when this flag is set. consistently. And in typename_hasher, the hash function doesn't consider the TYPENAME_TYPE_FULLNAME, which this patch fixes accordingly. After this patch, compile time for the testcase in the PR is around 34 seconds (10% faster than Clang). Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for trunk? PR c++/65328 gcc/cp/ChangeLog: * decl.cc (typename_hasher::hash): Add extra overloads. Use iterative_hash_object instead of htab_hash_pointer. Hash the TYPENAME_TYPE_FULLNAME too. (build_typename_type): Use typename_hasher::hash. * pt.cc (spec_hasher::hash): Add two-parameter overload. Set comparing_specializations around the call to hash_tmpl_and_args. (iterative_hash_template_arg) : When comparing_specializations, hash the TYPE_CONTEXT and TYPENAME_TYPE_FULLNAME. (tsubst_function_decl): Use spec_hasher::hash instead of hash_tmpl_and_args. (tsubst_template_decl): Likewise. (tsubst_decl): Likewise. --- gcc/cp/decl.cc | 26 +++--- gcc/cp/pt.cc | 28 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc index 7f3b3c3c588..b7f624ca50b 100644 --- a/gcc/cp/decl.cc +++ b/gcc/cp/decl.cc @@ -4007,14 +4007,27 @@ struct typename_hasher : ggc_ptr_hash /* Hash a TYPENAME_TYPE. 
*/ static hashval_t - hash (tree t) + hash (tree context, tree name, tree fullname) { -hashval_t hash; +hashval_t hash = 0; +hash = iterative_hash_object (context, hash); +hash = iterative_hash_object (name, hash); I'd think we could omit considering 'name', since fullname is either the same as name or a wrapper for it? +hash = iterative_hash_object (fullname, hash); +return hash; + } -hash = (htab_hash_pointer (TYPE_CONTEXT (t)) - ^ htab_hash_pointer (TYPE_IDENTIFIER (t))); + static hashval_t + hash (const typename_info *ti) + { +return typename_hasher::hash (ti->scope, ti->name, ti->template_id); + } -return hash; + static hashval_t + hash (tree t) + { +return typename_hasher::hash (TYPE_CONTEXT (t), + TYPE_IDENTIFIER (t), + TYPENAME_TYPE_FULLNAME (t)); } /* Compare two TYPENAME_TYPEs. */ @@ -4053,8 +4066,7 @@ build_typename_type (tree context, tree name, tree fullname, ti.class_p = (tag_type == class_type || tag_type == record_type || tag_type == union_type); - hashval_t hash = (htab_hash_pointer (ti.scope) -^ htab_hash_pointer (ti.name)); + hashval_t hash = typename_hasher::hash (); /* See if we already have this type. */ tree *e = typename_htab->find_slot_with_hash (, hash, INSERT); diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc index 55129cf6f2c..381fc337cb0 100644 --- a/gcc/cp/pt.cc +++ b/gcc/cp/pt.cc @@ -107,6 +107,7 @@ static bool excessive_deduction_depth; struct spec_hasher : ggc_ptr_hash { static hashval_t hash (spec_entry *); + static hashval_t hash (tree, tree); static bool equal (spec_entry *, spec_entry *); }; @@ -1768,13 +1769,22 @@ hash_tmpl_and_args (tree tmpl, tree args) return iterative_hash_template_arg (args, val); } +hashval_t +spec_hasher::hash (tree tmpl, tree args) +{ + ++comparing_specializations; + hashval_t val = hash_tmpl_and_args (tmpl, args); + --comparing_specializations; + return val; +} + /* Returns a hash for a spec_entry node based on the TMPL and ARGS members, ignoring SPEC. 
*/ hashval_t spec_hasher::hash (spec_entry *e) { - return hash_tmpl_and_args (e->tmpl, e->args); + return spec_hasher::hash (e->tmpl, e->args); } /* Recursively calculate a hash value for a template argument ARG, for use @@ -1960,6 +1970,16 @@ iterative_hash_template_arg (tree arg, hashval_t val) val = iterative_hash_template_arg (DECLTYPE_TYPE_EXPR (arg), val); break; + case TYPENAME_TYPE: + if (comparing_specializations) Please add a comment that this is to match structural_comptypes. OK with these changes. + { + tree context = TYPE_MAIN_VARIANT (TYPE_CONTEXT (arg)); + tree fullname =
Re: [PATCH] c++: optimize specialization of nested class templates
On Fri, 10 Jun 2022, Patrick Palka wrote: > On Thu, 9 Jun 2022, Patrick Palka wrote: > > > On Thu, 9 Jun 2022, Jason Merrill wrote: > > > > > On 6/8/22 14:21, Patrick Palka wrote: > > > > When substituting a class template specialization, tsubst_aggr_type > > > > substitutes the TYPE_CONTEXT before passing it to lookup_template_class. > > > > This appears to be unnecessary, however, because the the initial value > > > > of lookup_template_class's context parameter is unused outside of the > > > > IDENTIFIER_NODE case, and l_t_c performs its own substitution of the > > > > context, anyway. So this patch removes the redundant substitution in > > > > tsubst_aggr_type. Doing so causes us to ICE on template/nested5.C > > > > because during lookup_template_class for A::C::D with T=E and S=S, > > > > we substitute and complete the context A::C with T=E, which in turn > > > > registers the desired dependent specialization of D for us and we end up > > > > trying to register it again. This patch fixes this by checking the > > > > specializations table again after completion of the context. > > > > > > > > This patch also implements a couple of other optimizations: > > > > > > > >* In lookup_template_class, if the context of the partially > > > > instantiated template is already non-dependent, then we could > > > > reuse that instead of substituting the context of the most > > > > general template. > > > >* When substituting the TYPE_DECL for an injected-class-name > > > > in tsubst_decl, we can avoid substituting its TREE_TYPE and > > > > DECL_TI_ARGS. > > > > > > > > Together these optimizations improve memory usage for the range-v3 > > > > testcase test/view/split.cc by about 5%. The improvement is probably > > > > more significant when dealing with deeply nested class templates. > > > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for > > > > trunk? 
> > > > > > > > gcc/cp/ChangeLog: > > > > > > > > * pt.cc (lookup_template_class): Remove dead stores to > > > > context parameter. Don't substitute the context of the > > > > most general template if that of the partially instantiated > > > > template is non-dependent. Check the specializations table > > > > again after completing the context of a nested dependent > > > > specialization. > > > > (tsubst_aggr_type) : Don't substitute > > > > TYPE_CONTEXT or pass it to lookup_template_class. > > > > (tsubst_decl) : Avoid substituting the > > > > TREE_TYPE and DECL_TI_ARGS when DECL_SELF_REFERENCE_P. > > > > --- > > > > gcc/cp/pt.cc | 69 +++- > > > > 1 file changed, 41 insertions(+), 28 deletions(-) > > > > > > > > diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc > > > > index 59b94317e88..28023d60684 100644 > > > > --- a/gcc/cp/pt.cc > > > > +++ b/gcc/cp/pt.cc > > > > @@ -9840,8 +9840,6 @@ lookup_template_class (tree d1, tree arglist, tree > > > > in_decl, tree context, > > > > if (context) > > > > pop_decl_namespace (); > > > > } > > > > - if (templ) > > > > - context = DECL_CONTEXT (templ); > > > > } > > > > else if (TREE_CODE (d1) == TYPE_DECL && MAYBE_CLASS_TYPE_P > > > > (TREE_TYPE > > > > (d1))) > > > > { > > > > @@ -9868,7 +9866,6 @@ lookup_template_class (tree d1, tree arglist, tree > > > > in_decl, tree context, > > > > { > > > > templ = d1; > > > > d1 = DECL_NAME (templ); > > > > - context = DECL_CONTEXT (templ); > > > > } > > > > else if (DECL_TEMPLATE_TEMPLATE_PARM_P (d1)) > > > > { > > > > @@ -10059,8 +10056,25 @@ lookup_template_class (tree d1, tree arglist, > > > > tree > > > > in_decl, tree context, > > > > context = DECL_CONTEXT (gen_tmpl); > > > > if (context && TYPE_P (context)) > > > > { > > > > - context = tsubst_aggr_type (context, arglist, complain, > > > > in_decl, > > > > true); > > > > - context = complete_type (context); > > > > + if (!uses_template_parms (DECL_CONTEXT (templ))) > > > > + /* If the context of the partially instantiated template is 
> > > > + already non-dependent, then we might as well use it. */ > > > > + context = DECL_CONTEXT (templ); > > > > + else > > > > + { > > > > + context = tsubst_aggr_type (context, arglist, complain, > > > > in_decl, > > > > true); > > > > + context = complete_type (context); > > > > + if (is_dependent_type && arg_depth > 1) > > > > + { > > > > + /* If this is a dependent nested specialization such > > > > as > > > > +A::B, then completion of A might have > > > > +registered this specialization of B for us, so > > > > check > > > > +the table again (33959). */ > > > >
[PATCH] libgompd: Fix sizes in OMPD support and add local ICVs functions.
libgomp/ChangeLog 2022-06-10 Mohamed Atef * ompd-helper.h (DEREFERENCE, ACCESS_VALUE): New macros. * ompd-helper.c (gompd_get_nthread, gompd_get_thread_limit, gomp_get_run_shed, gompd_get_run_sched_chunk_size, gompd_get_default_device, gompd_get_dynamic, gompd_get_max_active_levels, gompd_get_proc_bind, gompd_is_final, gompd_is_implicit, gompd_get_team_size): defined. * ompd-icv.c (ompd_get_icv_from_scope): call the previous fincions, thread_handle, task_handle and parallel handle: New variable. Fix format in ashandle definition. * ompd-init.c: call GET_VALUE with sizeof_short for gompd_state. * ompd-support.h (gompd_state): size of short instead of long. (GOMPD_FOREACH_ACCESS): Add gompd_access (gomp_task, kind) gompd_access (gomp_task, final_task) gompd_access (gomp_team, nthreads) * ompd-support.c: Define gompd_get_offset gompd_get_sizeof_member gompd_get_size. (gompd_load): Remove gompd_init_access, gompd_init_sizeof_members, gompd_init_sizes define gompd_access_gomp_thread_handle with __UINT16_TYPE__. diff --git a/libgomp/ompd-helper.c b/libgomp/ompd-helper.c index a488ba7df2e..5a79ef9581d 100644 --- a/libgomp/ompd-helper.c +++ b/libgomp/ompd-helper.c @@ -256,6 +256,350 @@ gompd_stringize_gompd_enabled (ompd_address_space_handle_t *ah, /* End of global ICVs functions. */ +/* Get per thread ICVs. */ + +ompd_rc_t +gompd_get_nthread (ompd_thread_handle_t *thread_handle, + ompd_word_t *nthreads_var) +{ + /* gomp_thread->task->gomp_task_icv.nthreads_var. */ + if (thread_handle == NULL) +return ompd_rc_stale_handle; + if (nthreads_var == NULL) +return ompd_rc_bad_input; + CHECK (thread_handle->ah); + + ompd_word_t res = 0; + ompd_address_t symbol_addr = thread_handle->th; + ompd_word_t temp_offset; + ompd_address_t temp_sym_addr; + ompd_addr_t temp_addr; + ompd_address_space_context_t *context = thread_handle->ah->context; + ompd_thread_context_t *t_context = thread_handle->thread_context; + ompd_rc_t ret; + /* gomp_thread->task. 
*/ + ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task", +temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr); + /* gomp_thread->task->task_icv. */ + ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv", temp_offset, +1, ret, symbol_addr, temp_sym_addr, temp_addr); + /* gomp_thread->task->task_icv.nthreads_var. */ + ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv_nthreads_var", +temp_offset, 0, ret, symbol_addr, temp_sym_addr, temp_addr); + DEREFERENCE (context, t_context, symbol_addr, target_sizes.sizeof_long_long, + 1, res, ret, 0); + *nthreads_var = res; + return ompd_rc_ok; +} + +ompd_rc_t +gompd_get_default_device (ompd_thread_handle_t *thread_handle, + ompd_word_t *defalut_device_var) +{ + /* gomp_thread->task->gomp_task_icv.default_device_var. */ + if (thread_handle == NULL) +return ompd_rc_stale_handle; + if (defalut_device_var == NULL) +return ompd_rc_bad_input; + CHECK (thread_handle->ah); + + ompd_word_t res = 0; + ompd_address_t symbol_addr = thread_handle->th; + ompd_word_t temp_offset; + ompd_address_t temp_sym_addr; + ompd_addr_t temp_addr; + ompd_address_space_context_t *context = thread_handle->ah->context; + ompd_thread_context_t *t_context = thread_handle->thread_context; + ompd_rc_t ret; + /* gomp_thread->task. */ + ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task", +temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr); + /* gomp_thread->task->task_icv. */ + ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv", temp_offset, +1, ret, symbol_addr, temp_sym_addr, temp_addr); + /* gomp_thread->task->task_icv.default_device_var. 
*/ + ACCESS_VALUE (context, t_context, +"gompd_access_gomp_task_icv_default_device_var", temp_offset, 0, +ret, symbol_addr, temp_sym_addr, temp_addr); + DEREFERENCE (context, t_context, symbol_addr, target_sizes.sizeof_int, 1, + res, ret, 0); + *defalut_device_var = res; + return ompd_rc_ok; +} + +ompd_rc_t +gompd_get_dynamic (ompd_thread_handle_t *thread_handle, ompd_word_t *dyn_var) +{ + /* gomp_thread->task->gomp_task_icv.dyn_var. */ + if (thread_handle == NULL) +return ompd_rc_stale_handle; + if (dyn_var == NULL) +return ompd_rc_bad_input; + CHECK (thread_handle->ah); + + ompd_word_t res = 0; + ompd_address_t symbol_addr = thread_handle->th; + ompd_word_t temp_offset; + ompd_address_t temp_sym_addr; + ompd_addr_t temp_addr; + ompd_address_space_context_t *context = thread_handle->ah->context; + ompd_thread_context_t *t_context = thread_handle->thread_context; + ompd_rc_t ret; + /* gomp_thread->task. */ + ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task", +temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr); + /*
Re: [PATCH] regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041]
Hi! On Fri, Jun 10, 2022 at 07:52:57PM +0530, Surya Kumari Jangala wrote: > In check_new_reg_p, the nregs of a du chain is computed by obtaining the MODE > of the first element in the chain, and then calling hard_regno_nregs() with > the > MODE. But the first element of the chain can be a DEBUG_INSN whose mode need > not be the same as the rest of the elements in the du chain. This > was resulting in fcompare-debug failure as check_new_reg_p was returning a > different result with -g for the same candidate register. We can instead > obtain > nregs from the du chain itself. Great, thanks for finding and fixing this! I cannot approve it, you'll have to wait for someone who can. It looks fine to me, but that does not mean so much in regrename.c :-) > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr105041.c > @@ -0,0 +1,24 @@ > +/* { dg-do compile } */ Please delete this line, it is the default. > +/* { dg-require-effective-target be } */ Is there a reason to not test this on LE? If not, please remove this line as well. > +/* { dg-options "-m32 -mdejagnu-cpu=power4 -O2 -fcompare-debug > -fharden-compares -frename-registers" } */ Aha. You check for LE because you use -m32 in the test? Don't, then! Instead, test with -m32 in your RUNTESTFLAGS, like make check-gcc-c RUNTESTFLAGS="--target_board=unix'{-m64,-m32}' powerpc.exp=pr105041.c" or similar. It's a good idea to add a comment a la /* PR rtl-optimization/105041: This test failed with -m32. */ Thanks again for the patch! Segher
[PATCH v2 4/4] xtensa: Improve constant synthesis for both integer and floating-point
This patch revises the previous implementation of constant synthesis. First, changed to use define_split machine description pattern and to run after reload pass, in order not to interfere some optimizations such as the loop invariant motion. Second, not only integer but floating-point is subject to processing. Third, several new synthesis patterns - when the constant cannot fit into a "MOVI Ax, simm12" instruction, but: I. can be represented as a power of two minus one (eg. 32767, 65535 or 0x7fffUL) => "MOVI(.N) Ax, -1" + "SRLI Ax, Ax, 1 ... 31" (or "EXTUI") II. is between -34816 and 34559 => "MOVI(.N) Ax, -2048 ... 2047" + "ADDMI Ax, Ax, -32768 ... 32512" III. (existing case) can fit into a signed 12-bit if the trailing zero bits are stripped => "MOVI(.N) Ax, -2048 ... 2047" + "SLLI Ax, Ax, 1 ... 31" The above sequences consist of 5 or 6 bytes and have latency of 2 clock cycles, in contrast with "L32R Ax, " (3 bytes and one clock latency, but may suffer additional one clock pipeline stall and implementation-specific InstRAM/ROM access penalty) plus 4 bytes of constant value. In addition, 3-instructions synthesis patterns (8 or 9 bytes, 3 clock latency) are also provided when optimizing for speed and L32R instruction has considerable access penalty: IV. 2-instructions synthesis (any of I ... III) followed by "SLLI Ax, Ax, 1 ... 31" V. 2-instructions synthesis followed by either "ADDX[248] Ax, Ax, Ax" or "SUBX8 Ax, Ax, Ax" (multiplying by 3, 5, 7 or 9) gcc/ChangeLog: * config/xtensa/xtensa-protos.h (xtensa_constantsynth): New prototype. * config/xtensa/xtensa.cc (xtensa_emit_constantsynth, xtensa_constantsynth_2insn, xtensa_constantsynth_rtx_SLLI, xtensa_constantsynth_rtx_ADDSUBX, xtensa_constantsynth): New backend functions that process the abovementioned logic. (xtensa_emit_move_sequence): Revert the previous changes. * config/xtensa/xtensa.md: New split patterns for integer and floating-point, as the frontend part. 
gcc/testsuite/ChangeLog: * gcc.target/xtensa/constsynth_2insns.c: New. * gcc.target/xtensa/constsynth_3insns.c: Ditto. * gcc.target/xtensa/constsynth_double.c: Ditto. --- gcc/config/xtensa/xtensa-protos.h | 1 + gcc/config/xtensa/xtensa.cc | 133 +++--- gcc/config/xtensa/xtensa.md | 50 +++ .../gcc.target/xtensa/constsynth_2insns.c | 44 ++ .../gcc.target/xtensa/constsynth_3insns.c | 24 .../gcc.target/xtensa/constsynth_double.c | 11 ++ 6 files changed, 247 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/gcc.target/xtensa/constsynth_2insns.c create mode 100644 gcc/testsuite/gcc.target/xtensa/constsynth_3insns.c create mode 100644 gcc/testsuite/gcc.target/xtensa/constsynth_double.c diff --git a/gcc/config/xtensa/xtensa-protos.h b/gcc/config/xtensa/xtensa-protos.h index 30e4b54394a..c2fd750cd3a 100644 --- a/gcc/config/xtensa/xtensa-protos.h +++ b/gcc/config/xtensa/xtensa-protos.h @@ -44,6 +44,7 @@ extern int xtensa_expand_block_move (rtx *); extern int xtensa_expand_block_set_unrolled_loop (rtx *); extern int xtensa_expand_block_set_small_loop (rtx *); extern void xtensa_split_operand_pair (rtx *, machine_mode); +extern int xtensa_constantsynth (rtx, HOST_WIDE_INT); extern int xtensa_emit_move_sequence (rtx *, machine_mode); extern rtx xtensa_copy_incoming_a7 (rtx); extern void xtensa_expand_nonlocal_goto (rtx *); diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc index 1769e43c7b5..b48ac5063c0 100644 --- a/gcc/config/xtensa/xtensa.cc +++ b/gcc/config/xtensa/xtensa.cc @@ -1037,6 +1037,123 @@ xtensa_split_operand_pair (rtx operands[4], machine_mode mode) } +/* Try to emit insns to load srcval (that cannot fit into signed 12-bit) + into dst with synthesizing a such constant value from a sequence of + load-immediate / arithmetic ones, instead of a L32R instruction + (plus a constant in litpool). 
*/ + +static void +xtensa_emit_constantsynth (rtx dst, enum rtx_code code, + HOST_WIDE_INT imm0, HOST_WIDE_INT imm1, + rtx (*gen_op)(rtx, HOST_WIDE_INT), + HOST_WIDE_INT imm2) +{ + gcc_assert (REG_P (dst)); + emit_move_insn (dst, GEN_INT (imm0)); + emit_move_insn (dst, gen_rtx_fmt_ee (code, SImode, + dst, GEN_INT (imm1))); + if (gen_op) +emit_move_insn (dst, gen_op (dst, imm2)); +} + +static int +xtensa_constantsynth_2insn (rtx dst, HOST_WIDE_INT srcval, + rtx (*gen_op)(rtx, HOST_WIDE_INT), + HOST_WIDE_INT op_imm) +{ + int shift = exact_log2 (srcval + 1); + + if (IN_RANGE (shift, 1, 31)) +{ + xtensa_emit_constantsynth (dst, LSHIFTRT, -1, 32 - shift, +gen_op, op_imm); + return 1;
[PATCH V2] Disable generating load/store vector pairs for block copies.
[PATCH, V2] Disable generating load/store vector pairs for block copies. Testing has found that using store vector pair for block copies can result in a slow down on power10. This patch disables using the vector pair instructions for block copies if we are tuning for power10. This is version 2 of the patch. | Date: Mon, 6 Jun 2022 20:55:55 -0400 | Subject: [PATCH 2/3] Disable generating load/store vector pairs for block copies. | Message-ID: | https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596275.html Compared to version 1, this patch is a stand-alone patch, and it doesn't depend on a new switch (-mno-store-vector-pair). Instead, this patch just sets the default for -mblock-ops-vector-pair to be off if the current cpu being tuned for is power10. It would be anticipated that it would automatically be eabled when tuning for a future cpu. I have tested this patch on: little endian power10 using --with-cpu=power10 little endian power9 using --with-cpu=power9 big endian power8 using --with-cpu=power8, both 32/64-bit tested there were no regressions. Can I apply this to the master branch, and then apply it to the GCC 12 patch after a burn-in period? 2022-06-09 Michael Meissner gcc/ * config/rs6000/rs6000.cc (rs6000_option_override_internal): Do not generate block copies with vector pair instructions if we are tuning for power10. --- gcc/config/rs6000/rs6000.cc | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 0af2085adc0..59481d9ac70 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -4141,7 +4141,10 @@ rs6000_option_override_internal (bool global_init_p) if (!(rs6000_isa_flags_explicit & OPTION_MASK_BLOCK_OPS_VECTOR_PAIR)) { - if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX) + /* Do not generate lxvp and stxvp on power10 since there are some +performance issues. 
*/ + if (TARGET_MMA && TARGET_EFFICIENT_UNALIGNED_VSX + && rs6000_tune != PROCESSOR_POWER10) rs6000_isa_flags |= OPTION_MASK_BLOCK_OPS_VECTOR_PAIR; else rs6000_isa_flags &= ~OPTION_MASK_BLOCK_OPS_VECTOR_PAIR; -- 2.35.3 -- Michael Meissner, IBM PO Box 98, Ayer, Massachusetts, USA, 01432 email: meiss...@linux.ibm.com
Re: [PATCH] Darwin: Future-proof -mmacosx-version-min
Hi Mark, > On 10 Jun 2022, at 15:56, Mark Mentovai wrote: > > f18cbc1ee1f4 (2021-12-18) updated various parts of gcc to not impose a > Darwin or macOS version maximum of the current known release. Different > parts of gcc accept, variously, Darwin version numbers matching > darwin2*, and macOS major version numbers up to 99. The current released > version is Darwin 21 and macOS 12, with Darwin 22 and macOS 13 expected > for public release later this year. With one major OS release per year, > this strategy is expected to provide another 8 years of headroom. > > However, f18cbc1ee1f4 missed config/darwin-c.c (now .cc), which > continued to impose a maximum of macOS 12 on the -mmacosx-version-min > compiler driver argument. This was last updated from 11 to 12 in > 11b967577483 (2021-10-27), but kicking the can down the road one year at > a time is not a viable strategy, and is not in line with the more recent > technique from f18cbc1ee1f4. > > Prior to 556ab5125912 (2020-11-06), config/darwin-c.c did not impose a > maximum that needed annual maintenance, as at that point, all macOS > releases had used a major version of 10. The stricter approach imposed > since then was valuable for a time until the particulars of the new > versioning scheme were established and understood, but now that they > are, it's prudent to restore a more permissive approach. OK for master and open branches, thanks Iain > > gcc/ChangeLog: > > * config/darwin-c.cc: Make -mmacosx-version-min more future-proof. > > Signed-off-by: Mark Mentovai > --- > gcc/config/darwin-c.cc | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/darwin-c.cc b/gcc/config/darwin-c.cc > index 9203c84d2c26..00fc1253e265 100644 > --- a/gcc/config/darwin-c.cc > +++ b/gcc/config/darwin-c.cc > @@ -691,7 +691,8 @@ macosx_version_as_macro (void) > if (!version_array) > goto fail; > > - if (version_array[MAJOR] < 10 || version_array[MAJOR] > 12) > + /* clang accepts up to 99. 
*/ > + if (version_array[MAJOR] < 10 || version_array[MINOR] > 99) > goto fail; > > if (version_array[MAJOR] == 10 && version_array[MINOR] < 10) > -- > 2.36.1 >
[PATCH] Darwin: Future-proof -mmacosx-version-min
f18cbc1ee1f4 (2021-12-18) updated various parts of gcc to not impose a Darwin or macOS version maximum of the current known release. Different parts of gcc accept, variously, Darwin version numbers matching darwin2*, and macOS major version numbers up to 99. The current released version is Darwin 21 and macOS 12, with Darwin 22 and macOS 13 expected for public release later this year. With one major OS release per year, this strategy is expected to provide another 8 years of headroom. However, f18cbc1ee1f4 missed config/darwin-c.c (now .cc), which continued to impose a maximum of macOS 12 on the -mmacosx-version-min compiler driver argument. This was last updated from 11 to 12 in 11b967577483 (2021-10-27), but kicking the can down the road one year at a time is not a viable strategy, and is not in line with the more recent technique from f18cbc1ee1f4. Prior to 556ab5125912 (2020-11-06), config/darwin-c.c did not impose a maximum that needed annual maintenance, as at that point, all macOS releases had used a major version of 10. The stricter approach imposed since then was valuable for a time until the particulars of the new versioning scheme were established and understood, but now that they are, it's prudent to restore a more permissive approach. gcc/ChangeLog: * config/darwin-c.cc: Make -mmacosx-version-min more future-proof. Signed-off-by: Mark Mentovai --- gcc/config/darwin-c.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/config/darwin-c.cc b/gcc/config/darwin-c.cc index 9203c84d2c26..00fc1253e265 100644 --- a/gcc/config/darwin-c.cc +++ b/gcc/config/darwin-c.cc @@ -691,7 +691,8 @@ macosx_version_as_macro (void) if (!version_array) goto fail; - if (version_array[MAJOR] < 10 || version_array[MAJOR] > 12) + /* clang accepts up to 99. */ + if (version_array[MAJOR] < 10 || version_array[MINOR] > 99) goto fail; if (version_array[MAJOR] == 10 && version_array[MINOR] < 10) -- 2.36.1
Re: [PATCH] Add optional __Bfloat16 support
On Fri, Jun 10, 2022 at 2:38 AM Florian Weimer wrote: > > * liuhongt via Libc-alpha: > > > +\subsubsection{Special Types} > > + > > +The \code{__Bfloat16} type uses an 8-bit exponent and 7-bit mantissa. > > +It is used for \code{BF16} related intrinsics, it cannot be Please mention that this is an alternate encoding format for 16-bit floating point. It has the same size and alignment as _Float16. > > +used with standard C operators. > > I think it's not necessary to specify whether the type supports certain > C operators (surely assignment will work?). If they are added later, > the ABI won't need changing. > If _Bfloat16 becomes a fundamental type, the ABI should be changed to move it together with other scalar types. -- H.J.
[committed] libstdc++: Make std::lcm and std::gcd detect overflow [PR105844]
Tested powerpc64le-linux, pushed to trunk. -- >8 -- When I fixed PR libstdc++/92978 I introduced a regression whereby std::lcm(INT_MIN, 1) and std::lcm(50000, 49999) would no longer produce errors during constant evaluation. Those calls are undefined, because they violate the preconditions that |m| and the result can be represented in the return type (which is int in both those cases). The regression occurred because __absu(INT_MIN) is well-formed, due to the explicit casts to unsigned in that new helper function, and the out-of-range multiplication is well-formed, because unsigned arithmetic wraps instead of overflowing. To fix 92978 I made std::gcd and std::lcm calculate |m| and |n| immediately, yielding a common unsigned type that was used to calculate the result. That was partly correct, but there's no need to use an unsigned type. Doing so only suppresses the overflow errors so the compiler can't detect them. This change replaces __absu with __abs_r that returns the common type (not its corresponding unsigned type). This way we can detect overflow in __abs_r when required, while still supporting the most-negative value when it can be represented in the result type. To detect LCM results that are out of range of the result type we still need explicit checks, because neither constant evaluation nor UBsan will complain about unsigned wrapping for cases such as std::lcm(50000u, 49999u). We can detect those overflows efficiently by using __builtin_mul_overflow and asserting. libstdc++-v3/ChangeLog: PR libstdc++/105844 * include/experimental/numeric (experimental::gcd): Simplify assertions. Use __abs_r instead of __absu. (experimental::lcm): Likewise. Remove use of __detail::__lcm so overflow can be detected. * include/std/numeric (__detail::__absu): Rename to __abs_r and change to allow signed result type, so overflow can be detected. (__detail::__lcm): Remove. (gcd): Simplify assertions. Use __abs_r instead of __absu. (lcm): Likewise. 
Remove use of __detail::__lcm so overflow can be detected. * testsuite/26_numerics/gcd/gcd_neg.cc: Adjust dg-error lines. * testsuite/26_numerics/lcm/lcm_neg.cc: Likewise. * testsuite/26_numerics/gcd/105844.cc: New test. * testsuite/26_numerics/lcm/105844.cc: New test. --- libstdc++-v3/include/experimental/numeric | 46 +++- libstdc++-v3/include/std/numeric | 75 +++ .../testsuite/26_numerics/gcd/105844.cc | 21 ++ .../testsuite/26_numerics/gcd/gcd_neg.cc | 10 ++- .../testsuite/26_numerics/lcm/105844.cc | 22 ++ .../testsuite/26_numerics/lcm/lcm_neg.cc | 10 ++- 6 files changed, 123 insertions(+), 61 deletions(-) create mode 100644 libstdc++-v3/testsuite/26_numerics/gcd/105844.cc create mode 100644 libstdc++-v3/testsuite/26_numerics/lcm/105844.cc diff --git a/libstdc++-v3/include/experimental/numeric b/libstdc++-v3/include/experimental/numeric index 4c6a662fdd6..426d9430dd6 100644 --- a/libstdc++-v3/include/experimental/numeric +++ b/libstdc++-v3/include/experimental/numeric @@ -56,17 +56,15 @@ inline namespace fundamentals_v2 constexpr common_type_t<_Mn, _Nn> gcd(_Mn __m, _Nn __n) noexcept { - static_assert(is_integral_v<_Mn>, - "std::experimental::gcd arguments must be integers"); - static_assert(is_integral_v<_Nn>, - "std::experimental::gcd arguments must be integers"); - static_assert(_Mn(2) != _Mn(1), - "std::experimental::gcd arguments must not be bool"); - static_assert(_Nn(2) != _Nn(1), - "std::experimental::gcd arguments must not be bool"); - using _Up = make_unsigned_t>; - return std::__detail::__gcd(std::__detail::__absu<_Up>(__m), - std::__detail::__absu<_Up>(__n)); + static_assert(is_integral_v<_Mn> && is_integral_v<_Nn>, + "std::experimental::gcd arguments must be integers"); + static_assert(_Mn(2) == 2 && _Nn(2) == 2, + "std::experimental::gcd arguments must not be bool"); + namespace __detail = std::__detail; + using _Ct = common_type_t<_Mn, _Nn>; + const _Ct __m2 = __detail::__abs_r<_Ct>(__m); + const _Ct __n2 = __detail::__abs_r<_Ct>(__n); + return 
__detail::__gcd>(__m2, __n2); } /// Least common multiple @@ -74,17 +72,25 @@ inline namespace fundamentals_v2 constexpr common_type_t<_Mn, _Nn> lcm(_Mn __m, _Nn __n) { - static_assert(is_integral_v<_Mn>, + static_assert(is_integral_v<_Mn> && is_integral_v<_Nn>, "std::experimental::lcm arguments must be integers"); - static_assert(is_integral_v<_Nn>, - "std::experimental::lcm arguments must be integers"); - static_assert(_Mn(2) != _Mn(1), + static_assert(_Mn(2) == 2 && _Nn(2) == 2, "std::experimental::lcm arguments must not be bool"); -
[committed] libstdc++: Fix lifetime bugs for non-TLS eh_globals [PR105880]
Tested powerpc64le-linux, pushed to trunk. -- >8 -- This ensures that the single-threaded fallback buffer eh_globals is not destroyed during program termination, using the same immortalization technique used for error category objects. Also ensure that init._M_init can still be read after init has been destroyed, by making it a static data member. libstdc++-v3/ChangeLog: PR libstdc++/105880 * libsupc++/eh_globals.cc (eh_globals): Ensure constant init and prevent destruction during termination. (__eh_globals_init::_M_init): Replace with static member _S_init. (__cxxabiv1::__cxa_get_globals_fast): Update. (__cxxabiv1::__cxa_get_globals): Likewise. --- libstdc++-v3/libsupc++/eh_globals.cc | 51 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/libstdc++-v3/libsupc++/eh_globals.cc b/libstdc++-v3/libsupc++/eh_globals.cc index 3a003b89edf..768425c0f40 100644 --- a/libstdc++-v3/libsupc++/eh_globals.cc +++ b/libstdc++-v3/libsupc++/eh_globals.cc @@ -64,8 +64,26 @@ __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW #else -// Single-threaded fallback buffer. -static __cxa_eh_globals eh_globals; +#if __has_cpp_attribute(clang::require_constant_initialization) +# define __constinit [[clang::require_constant_initialization]] +#endif + +namespace +{ + struct constant_init + { +union { + unsigned char unused; + __cxa_eh_globals obj; +}; +constexpr constant_init() : obj() { } + +~constant_init() { /* do nothing, union member is not destroyed */ } + }; + + // Single-threaded fallback buffer. 
+ __constinit constant_init eh_globals; +} #if __GTHREADS @@ -90,32 +108,37 @@ eh_globals_dtor(void* ptr) struct __eh_globals_init { __gthread_key_t _M_key; - bool _M_init; + static bool _S_init; - __eh_globals_init() : _M_init(false) - { + __eh_globals_init() + { if (__gthread_active_p()) - _M_init = __gthread_key_create(&_M_key, eh_globals_dtor) == 0; + _S_init = __gthread_key_create(&_M_key, eh_globals_dtor) == 0; } ~__eh_globals_init() { -if (_M_init) +if (_S_init) __gthread_key_delete(_M_key); -_M_init = false; +_S_init = false; } + + __eh_globals_init(const __eh_globals_init&) = delete; + __eh_globals_init& operator=(const __eh_globals_init&) = delete; }; +bool __eh_globals_init::_S_init = false; + static __eh_globals_init init; extern "C" __cxa_eh_globals* __cxxabiv1::__cxa_get_globals_fast() _GLIBCXX_NOTHROW { __cxa_eh_globals* g; - if (init._M_init) + if (init._S_init) g = static_cast<__cxa_eh_globals*>(__gthread_getspecific(init._M_key)); else -g = _globals; +g = _globals.obj; return g; } @@ -123,7 +146,7 @@ extern "C" __cxa_eh_globals* __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW { __cxa_eh_globals* g; - if (init._M_init) + if (init._S_init) { g = static_cast<__cxa_eh_globals*>(__gthread_getspecific(init._M_key)); if (!g) @@ -140,7 +163,7 @@ __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW } } else -g = _globals; +g = _globals.obj; return g; } @@ -148,11 +171,11 @@ __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW extern "C" __cxa_eh_globals* __cxxabiv1::__cxa_get_globals_fast() _GLIBCXX_NOTHROW -{ return _globals; } +{ return _globals.obj; } extern "C" __cxa_eh_globals* __cxxabiv1::__cxa_get_globals() _GLIBCXX_NOTHROW -{ return _globals; } +{ return _globals.obj; } #endif -- 2.34.3
[PATCH] regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041]
regrename: Fix -fcompare-debug issue in check_new_reg_p [PR105041] In check_new_reg_p, the nregs of a du chain is computed by obtaining the MODE of the first element in the chain, and then calling hard_regno_nregs() with the MODE. But the first element of the chain can be a DEBUG_INSN whose mode need not be the same as the rest of the elements in the du chain. This was resulting in fcompare-debug failure as check_new_reg_p was returning a different result with -g for the same candidate register. We can instead obtain nregs from the du chain itself. 2022-06-10 Surya Kumari Jangala gcc/ PR rtl-optimization/105041 * regrename.cc (check_new_reg_p): Use nregs value from du chain. gcc/testsuite/ PR rtl-optimization/105041 * gcc.target/powerpc/pr105041.c: New test. diff --git a/gcc/regrename.cc b/gcc/regrename.cc index 10271e1..f651351 100644 --- a/gcc/regrename.cc +++ b/gcc/regrename.cc @@ -324,8 +324,7 @@ static bool check_new_reg_p (int reg ATTRIBUTE_UNUSED, int new_reg, class du_head *this_head, HARD_REG_SET this_unavailable) { - machine_mode mode = GET_MODE (*this_head->first->loc); - int nregs = hard_regno_nregs (new_reg, mode); + int nregs = this_head->nregs; int i; struct du_chain *tmp; diff --git a/gcc/testsuite/gcc.target/powerpc/pr105041.c b/gcc/testsuite/gcc.target/powerpc/pr105041.c new file mode 100644 index 000..89eed1c --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr105041.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target be } */ +/* { dg-options "-m32 -mdejagnu-cpu=power4 -O2 -fcompare-debug -fharden-compares -frename-registers" } */ + +double m; +int n; + +unsigned int +foo (unsigned int x, int y) +{ + long long int a = y, b = !a; + int c = 0; + + if (b != x) +while ((int) m == a) + { +c = a; +a = 0; + } + + n = b = y; + + return x + c; +}
[committed] libstdc++: Make std::hash> allocator-agnostic (LWG 3705)
Tested powerpc64le-linux, pushed to trunk. -- >8 -- This new library issue was recently moved to Tentatively Ready by an LWG poll, so I'm making the change on trunk. As noted in PR libstc++/105907 the std::hash specializations for PMR strings were not treated as slow hashes by the unordered containers, so this change preserves that. The new specializations for custom allocators are also not treated as slow, for the same reason. For the versioned namespace (i.e. unstable ABI) we don't have to worry about that, so can enable hash code caching for all basic_string specializations. libstdc++-v3/ChangeLog: * include/bits/basic_string.h (__hash_str_base): New class template. (hash, A>>): Define partial specialization for each of the standard character types. (hash, hash, hash, hash) (hash): Remove explicit specializations. * include/std/string (__hash_string_base): Remove class template. (hash, hash, hash) (hash, hash): Remove explicit specializations. * testsuite/21_strings/basic_string/hash/hash.cc: Test with custom allocators. * testsuite/21_strings/basic_string/hash/hash_char8_t.cc: Likewise. --- libstdc++-v3/include/bits/basic_string.h | 108 -- libstdc++-v3/include/std/string | 33 -- .../21_strings/basic_string/hash/hash.cc | 16 +++ .../basic_string/hash/hash_char8_t.cc | 12 ++ 4 files changed, 77 insertions(+), 92 deletions(-) diff --git a/libstdc++-v3/include/bits/basic_string.h b/libstdc++-v3/include/bits/basic_string.h index 6041d05815b..f76ddf970c6 100644 --- a/libstdc++-v3/include/bits/basic_string.h +++ b/libstdc++-v3/include/bits/basic_string.h @@ -4226,86 +4226,76 @@ namespace std _GLIBCXX_VISIBILITY(default) { _GLIBCXX_BEGIN_NAMESPACE_VERSION - // DR 1182. + // _GLIBCXX_RESOLVE_LIB_DEFECTS + // 3705. 
Hashability shouldn't depend on basic_string's allocator + + template, _Alloc>> +struct __str_hash_base +: public __hash_base +{ + size_t + operator()(const _StrT& __s) const noexcept + { return _Hash_impl::hash(__s.data(), __s.length() * sizeof(_CharT)); } +}; #ifndef _GLIBCXX_COMPATIBILITY_CXX0X /// std::hash specialization for string. - template<> -struct hash -: public __hash_base -{ - size_t - operator()(const string& __s) const noexcept - { return std::_Hash_impl::hash(__s.data(), __s.length()); } -}; - - template<> -struct __is_fast_hash> : std::false_type + template +struct hash, _Alloc>> +: public __str_hash_base { }; /// std::hash specialization for wstring. - template<> -struct hash -: public __hash_base -{ - size_t - operator()(const wstring& __s) const noexcept - { return std::_Hash_impl::hash(__s.data(), - __s.length() * sizeof(wchar_t)); } -}; + template +struct hash, _Alloc>> +: public __str_hash_base +{ }; - template<> -struct __is_fast_hash> : std::false_type + template +struct __is_fast_hash, + _Alloc>>> +: std::false_type { }; #endif /* _GLIBCXX_COMPATIBILITY_CXX0X */ #ifdef _GLIBCXX_USE_CHAR8_T /// std::hash specialization for u8string. - template<> -struct hash -: public __hash_base -{ - size_t - operator()(const u8string& __s) const noexcept - { return std::_Hash_impl::hash(__s.data(), - __s.length() * sizeof(char8_t)); } -}; - - template<> -struct __is_fast_hash> : std::false_type + template +struct hash, _Alloc>> +: public __str_hash_base { }; #endif /// std::hash specialization for u16string. - template<> -struct hash -: public __hash_base -{ - size_t - operator()(const u16string& __s) const noexcept - { return std::_Hash_impl::hash(__s.data(), - __s.length() * sizeof(char16_t)); } -}; - - template<> -struct __is_fast_hash> : std::false_type + template +struct hash, _Alloc>> +: public __str_hash_base { }; /// std::hash specialization for u32string. 
- template<> -struct hash -: public __hash_base -{ - size_t - operator()(const u32string& __s) const noexcept - { return std::_Hash_impl::hash(__s.data(), - __s.length() * sizeof(char32_t)); } -}; - - template<> -struct __is_fast_hash> : std::false_type + template +struct hash, _Alloc>> +: public __str_hash_base { }; +#if ! _GLIBCXX_INLINE_VERSION + // PR libstdc++/105907 - __is_fast_hash affects unordered container ABI. + template<> struct __is_fast_hash> : std::false_type { }; + template<> struct __is_fast_hash> : std::false_type { }; + template<> struct __is_fast_hash> : std::false_type { }; + template<> struct
[PATCH] c++: improve TYPENAME_TYPE hashing [PR65328]
The reason compiling the testcase in this PR is so slow is ultimately due to our poor hashing of TYPENAME_TYPE causing a huge amount of hash table collisions in the spec_hasher and typename_hasher tables. In spec_hasher, we don't hash the components of a TYPENAME_TYPE at all, presumably because TYPENAME_TYPE equivalence as determined by structural_comptypes depends on whether the comparing_specializations flag is set. This patch fixes this by setting comparing_specializations from spec_hasher::hash, and making iterative_hash_template_arg hash the relevant components of a TYPENAME_TYPE when this flag is set. consistently. And in typename_hasher, the hash function doesn't consider the TYPENAME_TYPE_FULLNAME, which this patch fixes accordingly. After this patch, compile time for the testcase in the PR is around 34 seconds (10% faster than Clang). Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for trunk? PR c++/65328 gcc/cp/ChangeLog: * decl.cc (typename_hasher::hash): Add extra overloads. Use iterative_hash_object instead of htab_hash_pointer. Hash the TYPENAME_TYPE_FULLNAME too. (build_typename_type): Use typename_hasher::hash. * pt.cc (spec_hasher::hash): Add two-parameter overload. Set comparing_specializations around the call to hash_tmpl_and_args. (iterative_hash_template_arg) : When comparing_specializations, hash the TYPE_CONTEXT and TYPENAME_TYPE_FULLNAME. (tsubst_function_decl): Use spec_hasher::hash instead of hash_tmpl_and_args. (tsubst_template_decl): Likewise. (tsubst_decl): Likewise. --- gcc/cp/decl.cc | 26 +++--- gcc/cp/pt.cc | 28 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc index 7f3b3c3c588..b7f624ca50b 100644 --- a/gcc/cp/decl.cc +++ b/gcc/cp/decl.cc @@ -4007,14 +4007,27 @@ struct typename_hasher : ggc_ptr_hash /* Hash a TYPENAME_TYPE. 
*/ static hashval_t - hash (tree t) + hash (tree context, tree name, tree fullname) { -hashval_t hash; +hashval_t hash = 0; +hash = iterative_hash_object (context, hash); +hash = iterative_hash_object (name, hash); +hash = iterative_hash_object (fullname, hash); +return hash; + } -hash = (htab_hash_pointer (TYPE_CONTEXT (t)) - ^ htab_hash_pointer (TYPE_IDENTIFIER (t))); + static hashval_t + hash (const typename_info *ti) + { +return typename_hasher::hash (ti->scope, ti->name, ti->template_id); + } -return hash; + static hashval_t + hash (tree t) + { +return typename_hasher::hash (TYPE_CONTEXT (t), + TYPE_IDENTIFIER (t), + TYPENAME_TYPE_FULLNAME (t)); } /* Compare two TYPENAME_TYPEs. */ @@ -4053,8 +4066,7 @@ build_typename_type (tree context, tree name, tree fullname, ti.class_p = (tag_type == class_type || tag_type == record_type || tag_type == union_type); - hashval_t hash = (htab_hash_pointer (ti.scope) -^ htab_hash_pointer (ti.name)); + hashval_t hash = typename_hasher::hash (); /* See if we already have this type. */ tree *e = typename_htab->find_slot_with_hash (, hash, INSERT); diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc index 55129cf6f2c..381fc337cb0 100644 --- a/gcc/cp/pt.cc +++ b/gcc/cp/pt.cc @@ -107,6 +107,7 @@ static bool excessive_deduction_depth; struct spec_hasher : ggc_ptr_hash { static hashval_t hash (spec_entry *); + static hashval_t hash (tree, tree); static bool equal (spec_entry *, spec_entry *); }; @@ -1768,13 +1769,22 @@ hash_tmpl_and_args (tree tmpl, tree args) return iterative_hash_template_arg (args, val); } +hashval_t +spec_hasher::hash (tree tmpl, tree args) +{ + ++comparing_specializations; + hashval_t val = hash_tmpl_and_args (tmpl, args); + --comparing_specializations; + return val; +} + /* Returns a hash for a spec_entry node based on the TMPL and ARGS members, ignoring SPEC. 
*/ hashval_t spec_hasher::hash (spec_entry *e) { - return hash_tmpl_and_args (e->tmpl, e->args); + return spec_hasher::hash (e->tmpl, e->args); } /* Recursively calculate a hash value for a template argument ARG, for use @@ -1960,6 +1970,16 @@ iterative_hash_template_arg (tree arg, hashval_t val) val = iterative_hash_template_arg (DECLTYPE_TYPE_EXPR (arg), val); break; + case TYPENAME_TYPE: + if (comparing_specializations) + { + tree context = TYPE_MAIN_VARIANT (TYPE_CONTEXT (arg)); + tree fullname = TYPENAME_TYPE_FULLNAME (arg); + val = iterative_hash_template_arg (context, val); + val = iterative_hash_template_arg (fullname, val); + } + break; + default: if (tree canonical = TYPE_CANONICAL (arg)) val =
[PATCH][AArch64] Implement ACLE Data Intrinsics
Hi, This patch adds support for the ACLE Data Intrinsics to the AArch64 port. Bootstrapped and regression tested on aarch64-none-linux. OK for trunk? gcc/ChangeLog: 2022-06-10 Andre Vieira * config/aarch64/aarch64.md (rbit2): Rename this ... (@aarch64_rbit): ... this and change it in... (ffs2,ctz2): ... here. (@aarch64_rev16): New. * config/aarch64/aarch64-builtins.cc: (aarch64_builtins): Define the following enum AARCH64_REV16, AARCH64_REV16L, AARCH64_REV16LL, AARCH64_RBIT, AARCH64_RBITL, AARCH64_RBITLL. (aarch64_init_data_intrinsics): New. (handle_arm_acle_h): Add call to aarch64_init_data_intrinsics. (aarch64_expand_builtin_data_intrinsic): New. (aarch64_general_expand_builtin): Add call to aarch64_expand_builtin_data_intrinsic. * config/aarch64/arm_acle.h (__clz, __clzl, __clzll, __cls, __clsl, __clsll, __rbit, __rbitl, __rbitll, __rev, __revl, __revll, __rev16, __rev16l, __rev16ll, __ror, __rorl, __rorll, __revsh): New. gcc/testsuite/ChangeLog: 2022-06-10 Andre Vieira * gcc.target/aarch64/acle/data-intrinsics.c: New test. 
diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index e0a741ac663188713e21f457affa57217d074783..91a687dee13a27c21f0c50de9ba777aa900d6096 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -613,6 +613,12 @@ enum aarch64_builtins AARCH64_LS64_BUILTIN_ST64B, AARCH64_LS64_BUILTIN_ST64BV, AARCH64_LS64_BUILTIN_ST64BV0, + AARCH64_REV16, + AARCH64_REV16L, + AARCH64_REV16LL, + AARCH64_RBIT, + AARCH64_RBITL, + AARCH64_RBITLL, AARCH64_BUILTIN_MAX }; @@ -1664,10 +1670,41 @@ aarch64_init_ls64_builtins (void) = aarch64_general_add_builtin (data[i].name, data[i].type, data[i].code); } +static void +aarch64_init_data_intrinsics (void) +{ + tree uint32_fntype = build_function_type_list (uint32_type_node, +uint32_type_node, NULL_TREE); + tree long_fntype = build_function_type_list (long_unsigned_type_node, + long_unsigned_type_node, + NULL_TREE); + tree uint64_fntype = build_function_type_list (uint64_type_node, +uint64_type_node, NULL_TREE); + aarch64_builtin_decls[AARCH64_REV16] += aarch64_general_add_builtin ("__builtin_aarch64_rev16", uint32_fntype, + AARCH64_REV16); + aarch64_builtin_decls[AARCH64_REV16L] += aarch64_general_add_builtin ("__builtin_aarch64_rev16l", long_fntype, + AARCH64_REV16L); + aarch64_builtin_decls[AARCH64_REV16LL] += aarch64_general_add_builtin ("__builtin_aarch64_rev16ll", uint64_fntype, + AARCH64_REV16LL); + aarch64_builtin_decls[AARCH64_RBIT] += aarch64_general_add_builtin ("__builtin_aarch64_rbit", uint32_fntype, + AARCH64_RBIT); + aarch64_builtin_decls[AARCH64_RBITL] += aarch64_general_add_builtin ("__builtin_aarch64_rbitl", long_fntype, + AARCH64_RBITL); + aarch64_builtin_decls[AARCH64_RBITLL] += aarch64_general_add_builtin ("__builtin_aarch64_rbitll", uint64_fntype, + AARCH64_RBITLL); +} + /* Implement #pragma GCC aarch64 "arm_acle.h". 
*/ void handle_arm_acle_h (void) { + aarch64_init_data_intrinsics (); if (TARGET_LS64) aarch64_init_ls64_builtins (); } @@ -2393,6 +2430,32 @@ aarch64_expand_builtin_memtag (int fcode, tree exp, rtx target) emit_insn (pat); return target; } +/* Function to expand an expression EXP which calls one of the ACLE Data + Intrinsic builtins FCODE with the result going to TARGET. */ +static rtx +aarch64_expand_builtin_data_intrinsic (unsigned int fcode, tree exp, rtx target) +{ + rtx op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); + machine_mode mode = GET_MODE (op0); + rtx pat; + switch (fcode) +{ +case AARCH64_REV16: +case AARCH64_REV16L: +case AARCH64_REV16LL: + pat = gen_aarch64_rev16 (mode, target, op0); + break; +case AARCH64_RBIT: +case AARCH64_RBITL: +case AARCH64_RBITLL: + pat = gen_aarch64_rbit (mode, target, op0); + break; +default: + gcc_unreachable (); +} + emit_insn (pat); + return target; +} /* Expand an expression EXP as fpsr or fpcr setter (depending on UNSPEC) using MODE. */ @@ -2551,6 +2614,9 @@ aarch64_general_expand_builtin (unsigned int fcode, tree exp, rtx target, if (fcode >= AARCH64_MEMTAG_BUILTIN_START && fcode <= AARCH64_MEMTAG_BUILTIN_END) return aarch64_expand_builtin_memtag (fcode, exp, target); + if (fcode >= AARCH64_REV16 + && fcode <= AARCH64_RBITLL) +return aarch64_expand_builtin_data_intrinsic (fcode, exp, target);
[committed] libstdc++: Partially revert r11-9772-g6f8133689f4397 [PR105915]
I have done a partial revert on the gcc-11 branch to fix PR105915. I'll also backport it to gcc-10 after testing finishes. -- >8 -- The r11-9772-g6f8133689f4397 backport made two changes, but only one was needed on the gcc-11 branch. The other should not have been backported, and causes errors with clang. This removes the unwanted part. libstdc++-v3/ChangeLog: PR libstdc++/105915 * include/experimental/bits/fs_path.h (path::begin, path::end): Remove noexcept from declarations. --- libstdc++-v3/include/experimental/bits/fs_path.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libstdc++-v3/include/experimental/bits/fs_path.h b/libstdc++-v3/include/experimental/bits/fs_path.h index 1cc1b3bf686..a2bc931c696 100644 --- a/libstdc++-v3/include/experimental/bits/fs_path.h +++ b/libstdc++-v3/include/experimental/bits/fs_path.h @@ -425,8 +425,8 @@ namespace __detail class iterator; typedef iterator const_iterator; -iterator begin() const noexcept; -iterator end() const noexcept; +iterator begin() const; +iterator end() const; /// @cond undocumented // Create a basic_string by reading until a null character. -- 2.34.3
Re: [PATCH] c++: optimize specialization of nested class templates
On Thu, 9 Jun 2022, Patrick Palka wrote: > On Thu, 9 Jun 2022, Jason Merrill wrote: > > > On 6/8/22 14:21, Patrick Palka wrote: > > > When substituting a class template specialization, tsubst_aggr_type > > > substitutes the TYPE_CONTEXT before passing it to lookup_template_class. > > > This appears to be unnecessary, however, because the the initial value > > > of lookup_template_class's context parameter is unused outside of the > > > IDENTIFIER_NODE case, and l_t_c performs its own substitution of the > > > context, anyway. So this patch removes the redundant substitution in > > > tsubst_aggr_type. Doing so causes us to ICE on template/nested5.C > > > because during lookup_template_class for A::C::D with T=E and S=S, > > > we substitute and complete the context A::C with T=E, which in turn > > > registers the desired dependent specialization of D for us and we end up > > > trying to register it again. This patch fixes this by checking the > > > specializations table again after completion of the context. > > > > > > This patch also implements a couple of other optimizations: > > > > > >* In lookup_template_class, if the context of the partially > > > instantiated template is already non-dependent, then we could > > > reuse that instead of substituting the context of the most > > > general template. > > >* When substituting the TYPE_DECL for an injected-class-name > > > in tsubst_decl, we can avoid substituting its TREE_TYPE and > > > DECL_TI_ARGS. > > > > > > Together these optimizations improve memory usage for the range-v3 > > > testcase test/view/split.cc by about 5%. The improvement is probably > > > more significant when dealing with deeply nested class templates. > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for > > > trunk? > > > > > > gcc/cp/ChangeLog: > > > > > > * pt.cc (lookup_template_class): Remove dead stores to > > > context parameter. 
Don't substitute the context of the > > > most general template if that of the partially instantiated > > > template is non-dependent. Check the specializations table > > > again after completing the context of a nested dependent > > > specialization. > > > (tsubst_aggr_type) : Don't substitute > > > TYPE_CONTEXT or pass it to lookup_template_class. > > > (tsubst_decl) : Avoid substituting the > > > TREE_TYPE and DECL_TI_ARGS when DECL_SELF_REFERENCE_P. > > > --- > > > gcc/cp/pt.cc | 69 +++- > > > 1 file changed, 41 insertions(+), 28 deletions(-) > > > > > > diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc > > > index 59b94317e88..28023d60684 100644 > > > --- a/gcc/cp/pt.cc > > > +++ b/gcc/cp/pt.cc > > > @@ -9840,8 +9840,6 @@ lookup_template_class (tree d1, tree arglist, tree > > > in_decl, tree context, > > > if (context) > > > pop_decl_namespace (); > > > } > > > - if (templ) > > > - context = DECL_CONTEXT (templ); > > > } > > > else if (TREE_CODE (d1) == TYPE_DECL && MAYBE_CLASS_TYPE_P (TREE_TYPE > > > (d1))) > > > { > > > @@ -9868,7 +9866,6 @@ lookup_template_class (tree d1, tree arglist, tree > > > in_decl, tree context, > > > { > > > templ = d1; > > > d1 = DECL_NAME (templ); > > > - context = DECL_CONTEXT (templ); > > > } > > > else if (DECL_TEMPLATE_TEMPLATE_PARM_P (d1)) > > > { > > > @@ -10059,8 +10056,25 @@ lookup_template_class (tree d1, tree arglist, > > > tree > > > in_decl, tree context, > > > context = DECL_CONTEXT (gen_tmpl); > > > if (context && TYPE_P (context)) > > > { > > > - context = tsubst_aggr_type (context, arglist, complain, in_decl, > > > true); > > > - context = complete_type (context); > > > + if (!uses_template_parms (DECL_CONTEXT (templ))) > > > + /* If the context of the partially instantiated template is > > > +already non-dependent, then we might as well use it. 
*/ > > > + context = DECL_CONTEXT (templ); > > > + else > > > + { > > > + context = tsubst_aggr_type (context, arglist, complain, in_decl, > > > true); > > > + context = complete_type (context); > > > + if (is_dependent_type && arg_depth > 1) > > > + { > > > + /* If this is a dependent nested specialization such as > > > + A::B, then completion of A might have > > > + registered this specialization of B for us, so check > > > + the table again (33959). */ > > > + entry = type_specializations->find_with_hash (, hash); > > > + if (entry) > > > + return entry->spec; > > > + } > > > + } > > > } > > > else > > > context = tsubst (context, arglist, complain, in_decl); > > > @@ -13711,25 +13725,12 @@ tsubst_aggr_type (tree t, > > > if (TYPE_TEMPLATE_INFO (t) && uses_template_parms (t)) > > > { > > >
Fix ipa-prop wrt volatile memory accesses
Hi, this patch prevents ipa-prop from propagating aggregates when load is volatile. Martin, does this look OK? It seem to me that ipa-prop may need some additional volatile flag checks. Bootstrapped/regtested x86_64-linux, OK? Honza gcc/ChangeLog: 2022-06-10 Jan Hubicka PR ipa/105739 * ipa-prop.cc (ipa_load_from_parm_agg): Disqualify volatile memory accesses. gcc/testsuite/ChangeLog: 2022-06-10 Jan Hubicka * gcc.dg/ipa/pr105739.c: New test. diff --git a/gcc/ipa-prop.cc b/gcc/ipa-prop.cc index afd9222b5a2..c037668e7d8 100644 --- a/gcc/ipa-prop.cc +++ b/gcc/ipa-prop.cc @@ -1112,6 +1112,10 @@ ipa_load_from_parm_agg (struct ipa_func_body_info *fbi, if (!base) return false; + /* We can not propagate across volatile loads. */ + if (TREE_THIS_VOLATILE (op)) +return false; + if (DECL_P (base)) { int index = ipa_get_param_decl_index_1 (descriptors, base); diff --git a/gcc/testsuite/gcc.dg/ipa/pr105739.c b/gcc/testsuite/gcc.dg/ipa/pr105739.c new file mode 100644 index 000..8dbe8fc2494 --- /dev/null +++ b/gcc/testsuite/gcc.dg/ipa/pr105739.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + + +__attribute__((noinline)) +static int +test2(int a) +{ +if (__builtin_constant_p (a)) +__builtin_abort (); +return a; +} +static int +test(int *a) +{ +int val = *(volatile int *)a; +if (__builtin_constant_p (val)) +__builtin_abort (); +if (val) + return test2(val); +return 0; +} +int a; +int +main() +{ +a = 0; +return test (); +} +/* { dg-final { scan-tree-dump "test2" "optimized" } } */
Re: [PATCH 2/2] Add a general mapping from internal fns to target insns
On Fri, 2022-06-10 at 10:14 +0100, Richard Sandiford via Gcc-patches wrote: Several existing internal functions map directly to an instruction defined in target-insns.def. This patch makes it easier to define more such functions in future. This should help to reduce cut-&-paste, but more importantly, it allows the difference between optab functions and target-insns.def functions to be abstracted away; both are now treated as “directly-mapped”. Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install? Richard gcc/ * internal-fn.def (DEF_INTERNAL_INSN_FN): New macro. (GOMP_SIMT_ENTER_ALLOC, GOMP_SIMT_EXIT, GOMP_SIMT_LANE) (GOMP_SIMT_LAST_LANE, GOMP_SIMT_ORDERED_PRED, GOMP_SIMT_VOTE_ANY) (GOMP_SIMT_XCHG_BFLY, GOMP_SIMT_XCHG_IDX): Use it. * internal-fn.h (direct_internal_fn_info::directly_mapped): New member variable. (direct_internal_fn_info::vectorizable): Reduce to 1 bit. (direct_internal_fn_p): Also return true for internal functions that map directly to instructions defined target-insns.def. (direct_internal_fn): Adjust comment accordingly. * internal-fn.c (direct_insn, optab1, optab2, vectorizable_optab1) [...snip...] --- gcc/internal-fn.cc | 152 +++- gcc/internal-fn.def | 34 +++--- gcc/internal-fn.h | 20 +++--- 3 files changed, 87 insertions(+), 119 deletions(-) [...snip...] I can't comment on the correctness of the patch, but I happened to spot that the filename in the changelog entry needs renaming for the .c to .cc transition, or the git hooks will complain when you try to push this. Dave
Re: [PING][PATCH][WIP] have configure probe prefix for gmp/mpfr/mpc [PR44425]
On Thu, 2022-06-09 at 16:04 -0400, Eric Gallager via Gcc-patches wrote: > Hi, I'd like to ping this patch: > https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596126.html > (cc-ing the build machinery maintainers listed in MAINTAINERS this > time) > > On Thu, Jun 2, 2022 at 11:53 AM Eric Gallager > wrote: > > > > So, I'm working on fixing PR bootstrap/44425, and have this patch to > > have the top-level configure script check in the value passed to > > `--prefix=` when looking for gmp/mpfr/mpc. It "works" (in that > > configuring with just `--prefix=` and none of > > `--with-gmp=`/`--with-mpfr=`/`--with-mpc=` now works where it failed > > before), but unfortunately it results in a bunch of duplicated > > `-I`/`-L` flags stuck in ${gmplibs} and ${gmpinc}... is that > > acceptable or should I try another approach? > > Eric A patch should not edit configure directly. configure.ac should be edited and configure should be regenerated from it. -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University
[PATCH] Do not erase warning data in gimple_set_location
Hi, gimple_set_location is mostly invoked on newly built GIMPLE statements, so their location is UNKNOWN_LOCATION and setting it will clobber the warning data of the passed location, if any. Tested on x86-64/Linux, OK for mainline and 12 branch? 2022-06-10 Eric Botcazou * gimple.h (gimple_set_location): Do not copy warning data from the previous location when it is UNKNOWN_LOCATION. 2022-06-10 Eric Botcazou testsuite/ * c-c++-common/nonnull-1.c: Remove XFAIL for C++. -- Eric Botcazoudiff --git a/gcc/gimple.h b/gcc/gimple.h index 6b1e89ad74e..870629cd562 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -1913,7 +1913,8 @@ static inline void gimple_set_location (gimple *g, location_t location) { /* Copy the no-warning data to the statement location. */ - copy_warning (location, g->location); + if (g->location != UNKNOWN_LOCATION) +copy_warning (location, g->location); g->location = location; } diff --git a/gcc/testsuite/c-c++-common/nonnull-1.c b/gcc/testsuite/c-c++-common/nonnull-1.c index ea987365302..7be4e3479dd 100644 --- a/gcc/testsuite/c-c++-common/nonnull-1.c +++ b/gcc/testsuite/c-c++-common/nonnull-1.c @@ -30,5 +30,5 @@ func (char *cp1, char *cp2, char *cp3, char *cp4) __attribute__((nonnull (1))) int func2 (char *cp) { - return (cp != NULL) ? 1 : 0; /* { dg-warning "'nonnull' argument" "cp compared to NULL" { xfail c++ } } */ + return (cp != NULL) ? 1 : 0; /* { dg-warning "'nonnull' argument" "cp compared to NULL" } */ }
Re: [PATCH] Add optional __Bfloat16 support
* liuhongt via Libc-alpha: > +\subsubsection{Special Types} > + > +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa. > +It is used for \code{BF16} related intrinsics, it cannot be > +used with standard C operators. I think it's not necessary to specify whether the type supports certain C operators (surely assignment will work?). If they are added later, the ABI won't need changing. Thanks, Florian
Re: [PATCH] testsuite: Add -mtune=generic to dg-options for two testcases.
On Fri, Jun 10, 2022 at 4:45 PM Cui,Lili via Gcc-patches wrote: > > This patch is to change dg-options for two testcases. > > Use -mtune=generic to limit these two testcases. Because configuring them with > -mtune=cascadelake or znver3 will vectorize them. > > regtested on x86_64-linux-gnu{-m32,}. Ok for trunk? Ok. > > Thanks, > Lili. > > Use -mtune=generic to limit these two test cases. Because configuring them > with > -mtune=cascadelake or znver3 will vectorize them. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Add > -mtune=generic to dg-options. > * gcc.target/i386/pr84101.c: Likewise. > --- > .../gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c | 2 +- > gcc/testsuite/gcc.target/i386/pr84101.c | 2 +- > 2 files changed, 2 insertions(+), 2 deletions(-) > > diff --git > a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > index 7637cdb4a97..d060135d877 100644 > --- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ > +/* { dg-additional-options "-msse -mtune=generic -fdump-tree-slp2-details" } > */ > > struct S { unsigned long a, b; } s; > > diff --git a/gcc/testsuite/gcc.target/i386/pr84101.c > b/gcc/testsuite/gcc.target/i386/pr84101.c > index cf144894f9b..2c5a97308ca 100644 > --- a/gcc/testsuite/gcc.target/i386/pr84101.c > +++ b/gcc/testsuite/gcc.target/i386/pr84101.c > @@ -1,5 +1,5 @@ > /* { dg-do compile } */ > -/* { dg-options "-O3 -fdump-tree-slp2-details" } */ > +/* { dg-options "-O3 -mtune=generic -fdump-tree-slp2-details" } */ > > typedef struct uint64_pair uint64_pair_t ; > struct uint64_pair > -- > 2.17.1 > -- BR, Hongtao
[PATCH 2/2] Add a general mapping from internal fns to target insns
Several existing internal functions map directly to an instruction defined in target-insns.def. This patch makes it easier to define more such functions in future. This should help to reduce cut-&-paste, but more importantly, it allows the difference between optab functions and target-insns.def functions to be abstracted away; both are now treated as “directly-mapped”. Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install? Richard gcc/ * internal-fn.def (DEF_INTERNAL_INSN_FN): New macro. (GOMP_SIMT_ENTER_ALLOC, GOMP_SIMT_EXIT, GOMP_SIMT_LANE) (GOMP_SIMT_LAST_LANE, GOMP_SIMT_ORDERED_PRED, GOMP_SIMT_VOTE_ANY) (GOMP_SIMT_XCHG_BFLY, GOMP_SIMT_XCHG_IDX): Use it. * internal-fn.h (direct_internal_fn_info::directly_mapped): New member variable. (direct_internal_fn_info::vectorizable): Reduce to 1 bit. (direct_internal_fn_p): Also return true for internal functions that map directly to instructions defined target-insns.def. (direct_internal_fn): Adjust comment accordingly. * internal-fn.c (direct_insn, optab1, optab2, vectorizable_optab1) (vectorizable_optab2): New local macros. (not_direct): Initialize directly_mapped. (mask_load_direct, load_lanes_direct, mask_load_lanes_direct) (gather_load_direct, len_load_direct, mask_store_direct) (store_lanes_direct, mask_store_lanes_direct, vec_cond_mask_direct) (vec_cond_direct, scatter_store_direct, len_store_direct) (vec_set_direct, unary_direct, binary_direct, ternary_direct) (cond_unary_direct, cond_binary_direct, cond_ternary_direct) (while_direct, fold_extract_direct, fold_left_direct) (mask_fold_left_direct, check_ptrs_direct): Use the macros above. (expand_GOMP_SIMT_ENTER_ALLOC, expand_GOMP_SIMT_EXIT): Delete (expand_GOMP_SIMT_LANE, expand_GOMP_SIMT_LAST_LANE): Likewise; (expand_GOMP_SIMT_ORDERED_PRED, expand_GOMP_SIMT_VOTE_ANY): Likewise. (expand_GOMP_SIMT_XCHG_BFLY, expand_GOMP_SIMT_XCHG_IDX): Likewise. (direct_internal_fn_types): Handle functions that map to instructions defined in target-insns.def. 
(direct_internal_fn_types): Likewise. (direct_internal_fn_supported_p): Likewise. (internal_fn_expanders): Likewise. --- gcc/internal-fn.cc | 152 +++- gcc/internal-fn.def | 34 +++--- gcc/internal-fn.h | 20 +++--- 3 files changed, 87 insertions(+), 119 deletions(-) diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index ab2b1baa893..a809953ce6f 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -105,37 +105,44 @@ init_internal_fns () /* Create static initializers for the information returned by direct_internal_fn. */ -#define not_direct { -2, -2, false } -#define mask_load_direct { -1, 2, false } -#define load_lanes_direct { -1, -1, false } -#define mask_load_lanes_direct { -1, -1, false } -#define gather_load_direct { 3, 1, false } -#define len_load_direct { -1, -1, false } -#define mask_store_direct { 3, 2, false } -#define store_lanes_direct { 0, 0, false } -#define mask_store_lanes_direct { 0, 0, false } -#define vec_cond_mask_direct { 1, 0, false } -#define vec_cond_direct { 2, 0, false } -#define scatter_store_direct { 3, 1, false } -#define len_store_direct { 3, 3, false } -#define vec_set_direct { 3, 3, false } -#define unary_direct { 0, 0, true } -#define binary_direct { 0, 0, true } -#define ternary_direct { 0, 0, true } -#define cond_unary_direct { 1, 1, true } -#define cond_binary_direct { 1, 1, true } -#define cond_ternary_direct { 1, 1, true } -#define while_direct { 0, 2, false } -#define fold_extract_direct { 2, 2, false } -#define fold_left_direct { 1, 1, false } -#define mask_fold_left_direct { 1, 1, false } -#define check_ptrs_direct { 0, 0, false } +#define not_direct { -2, -2, false, false } +#define direct_insn{ -2, -2, true, false } +#define optab1(TYPE0) { TYPE0, TYPE0, true, false } +#define optab2(TYPE0, TYPE1) { TYPE0, TYPE1, true, false } +#define vectorizable_optab1(TYPE0) { TYPE0, TYPE0, true, true } + +#define mask_load_direct optab2 (-1, 2) +#define load_lanes_direct optab1 (-1) +#define mask_load_lanes_direct optab1 (-1) 
+#define gather_load_direct optab2 (3, 1) +#define len_load_directoptab1 (-1) +#define mask_store_direct optab2 (3, 2) +#define store_lanes_direct optab1 (0) +#define mask_store_lanes_directoptab1 (0) +#define vec_cond_mask_direct optab2 (1, 0) +#define vec_cond_directoptab2 (2, 0) +#define scatter_store_direct optab2 (3, 1) +#define len_store_direct optab1 (3) +#define vec_set_direct optab1 (3) +#define unary_direct
[PATCH 1/2] Factor out common internal-fn idiom
internal-fn.c has quite a few functions that simply map the result of the call to an instruction's output operand (if any) and map each argument to an instruction's input operand, in order. This patch adds a single function for doing that. It's really just a generalisation of expand_direct_optab_fn, but with the output operand being optional. Unfortunately, it isn't possible to do this for vcond_mask because the internal function has a different argument order from the optab. Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install? Richard gcc/ * internal-fn.cc (expand_fn_using_insn): New function, split out and adapted from... (expand_direct_optab_fn): ...here. (expand_GOMP_SIMT_ENTER_ALLOC): Use it. (expand_GOMP_SIMT_EXIT): Likewise. (expand_GOMP_SIMT_LANE): Likewise. (expand_GOMP_SIMT_LAST_LANE): Likewise. (expand_GOMP_SIMT_ORDERED_PRED): Likewise. (expand_GOMP_SIMT_VOTE_ANY): Likewise. (expand_GOMP_SIMT_XCHG_BFLY): Likewise. (expand_GOMP_SIMT_XCHG_IDX): Likewise. --- gcc/internal-fn.cc | 243 + 1 file changed, 89 insertions(+), 154 deletions(-) diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc index 8b1733e20c4..ab2b1baa893 100644 --- a/gcc/internal-fn.cc +++ b/gcc/internal-fn.cc @@ -140,6 +140,86 @@ const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = { not_direct }; +/* Expand STMT using instruction ICODE. The instruction has NOUTPUTS + output operands and NINPUTS input operands, where NOUTPUTS is either + 0 or 1. The output operand (if any) comes first, followed by the + NINPUTS input operands. 
*/ + +static void +expand_fn_using_insn (gcall *stmt, insn_code icode, unsigned int noutputs, + unsigned int ninputs) +{ + gcc_assert (icode != CODE_FOR_nothing); + + expand_operand *ops = XALLOCAVEC (expand_operand, noutputs + ninputs); + unsigned int opno = 0; + rtx lhs_rtx = NULL_RTX; + tree lhs = gimple_call_lhs (stmt); + + if (noutputs) +{ + gcc_assert (noutputs == 1); + if (lhs) + lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + + /* Do not assign directly to a promoted subreg, since there is no +guarantee that the instruction will leave the upper bits of the +register in the state required by SUBREG_PROMOTED_SIGN. */ + rtx dest = lhs_rtx; + if (dest && GET_CODE (dest) == SUBREG && SUBREG_PROMOTED_VAR_P (dest)) + dest = NULL_RTX; + create_output_operand ([opno], dest, +insn_data[icode].operand[opno].mode); + opno += 1; +} + else +gcc_assert (!lhs); + + for (unsigned int i = 0; i < ninputs; ++i) +{ + tree rhs = gimple_call_arg (stmt, i); + tree rhs_type = TREE_TYPE (rhs); + rtx rhs_rtx = expand_normal (rhs); + if (INTEGRAL_TYPE_P (rhs_type)) + create_convert_operand_from ([opno], rhs_rtx, +TYPE_MODE (rhs_type), +TYPE_UNSIGNED (rhs_type)); + else + create_input_operand ([opno], rhs_rtx, TYPE_MODE (rhs_type)); + opno += 1; +} + + gcc_assert (opno == noutputs + ninputs); + expand_insn (icode, opno, ops); + if (lhs_rtx && !rtx_equal_p (lhs_rtx, ops[0].value)) +{ + /* If the return value has an integral type, convert the instruction +result to that type. This is useful for things that return an +int regardless of the size of the input. If the instruction result +is smaller than required, assume that it is signed. + +If the return value has a nonintegral type, its mode must match +the instruction result. 
*/ + if (GET_CODE (lhs_rtx) == SUBREG && SUBREG_PROMOTED_VAR_P (lhs_rtx)) + { + /* If this is a scalar in a register that is stored in a wider +mode than the declared mode, compute the result into its +declared mode and then convert to the wider mode. */ + gcc_checking_assert (INTEGRAL_TYPE_P (TREE_TYPE (lhs))); + rtx tmp = convert_to_mode (GET_MODE (lhs_rtx), ops[0].value, 0); + convert_move (SUBREG_REG (lhs_rtx), tmp, + SUBREG_PROMOTED_SIGN (lhs_rtx)); + } + else if (GET_MODE (lhs_rtx) == GET_MODE (ops[0].value)) + emit_move_insn (lhs_rtx, ops[0].value); + else + { + gcc_checking_assert (INTEGRAL_TYPE_P (TREE_TYPE (lhs))); + convert_move (lhs_rtx, ops[0].value, 0); + } +} +} + /* ARRAY_TYPE is an array of vector modes. Return the associated insn for load-lanes-style optab OPTAB, or CODE_FOR_nothing if none. */ @@ -233,22 +313,8 @@ expand_GOMP_SIMT_ENTER (internal_fn, gcall *) static void expand_GOMP_SIMT_ENTER_ALLOC (internal_fn, gcall *stmt) { - rtx target; - tree lhs = gimple_call_lhs (stmt); - if (lhs) -target = expand_expr (lhs, NULL_RTX,
[PATCH] testsuite: Add -mtune=generic to dg-options for two testcases.
This patch is to change dg-options for two testcases. Use -mtune=generic to limit these two testcases. Because configuring them with -mtune=cascadelake or znver3 will vectorize them. regtested on x86_64-linux-gnu{-m32,}. Ok for trunk? Thanks, Lili. Use -mtune=generic to limit these two test cases. Because configuring them with -mtune=cascadelake or znver3 will vectorize them. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c: Add -mtune=generic to dg-options. * gcc.target/i386/pr84101.c: Likewise. --- .../gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c | 2 +- gcc/testsuite/gcc.target/i386/pr84101.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c index 7637cdb4a97..d060135d877 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr104582-2.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-additional-options "-msse -fdump-tree-slp2-details" } */ +/* { dg-additional-options "-msse -mtune=generic -fdump-tree-slp2-details" } */ struct S { unsigned long a, b; } s; diff --git a/gcc/testsuite/gcc.target/i386/pr84101.c b/gcc/testsuite/gcc.target/i386/pr84101.c index cf144894f9b..2c5a97308ca 100644 --- a/gcc/testsuite/gcc.target/i386/pr84101.c +++ b/gcc/testsuite/gcc.target/i386/pr84101.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O3 -fdump-tree-slp2-details" } */ +/* { dg-options "-O3 -mtune=generic -fdump-tree-slp2-details" } */ typedef struct uint64_pair uint64_pair_t ; struct uint64_pair -- 2.17.1
Re: [PATCH] Add optional __Bfloat16 support
On Fri, Jun 10, 2022 at 3:47 PM liuhongt via Libc-alpha wrote: > > Pass and return __Bfloat16 values in XMM registers. > > Background: > __Bfloat16 (BF16) is a new floating-point format that can accelerate machine > learning (deep learning training, in particular) algorithms. > It's first introduced by Intel AVX-512 extension called AVX-512_BF16. > __Bfloat16 has 8 bits of exponent and 7 bits of mantissa and it's different > from _Float16. > > Movivation: > Currently __bfloat16 is a typedef of short, which creates a problem where the > compiler does not raise any alarms if it is used to add, subtract, multiply > or divide, but the result of the calculation is actually meaningless. > To solve this problem, a real scalar type __Bfloat16 needs to be introduced. > It is mainly used for intrinsics, not available for C standard operators. > __Bfloat16 will also be used for movement like passing parameter, load and > store, vector initialization, vector shuffle, and .etc. It creates a need for > a corresponding psABI. > > --- > x86-64-ABI/low-level-sys-info.tex | 10 -- > 1 file changed, 8 insertions(+), 2 deletions(-) > > diff --git a/x86-64-ABI/low-level-sys-info.tex > b/x86-64-ABI/low-level-sys-info.tex > index a8b69db..ba8db0d 100644 > --- a/x86-64-ABI/low-level-sys-info.tex > +++ b/x86-64-ABI/low-level-sys-info.tex > @@ -302,6 +302,12 @@ be used to represent the type, is a family of integer > types. > This permits the use of these types in allocated arrays using the common > sizeof(Array)/sizeof(ElementType) pattern. > > +\subsubsection{Special Types} > + > +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa. > +It is used for \code{BF16} related intrinsics, it cannot be > +used with standard C operators. 
> + > \subsubsection{Aggregates and Unions} > > Structures and unions assume the alignment of their most strictly > @@ -563,8 +569,8 @@ The basic types are assigned their natural classes: > \item Arguments of types (signed and unsigned) \code{_Bool}, \code{char}, >\code{short}, \code{int}, \code{long}, \code{long long}, and >pointers are in the INTEGER class. > -\item Arguments of types \code{_Float16}, \code{float}, \code{double}, > - \code{_Decimal32}, > +\item Arguments of types \code{_Float16}, \code{__Bfloat16}, \code{float}, > + \code{double}, \code{_Decimal32}, >\code{_Decimal64} and \code{__m64} are in class SSE. > \item Arguments of types \code{__float128}, \code{_Decimal128} >and \code{__m128} are split into two halves. The least significant > -- > 2.18.1 > -- BR, Hongtao
[PATCH] Add optional __Bfloat16 support
Pass and return __Bfloat16 values in XMM registers. Background: __Bfloat16 (BF16) is a new floating-point format that can accelerate machine learning (deep learning training, in particular) algorithms. It's first introduced by Intel AVX-512 extension called AVX-512_BF16. __Bfloat16 has 8 bits of exponent and 7 bits of mantissa and it's different from _Float16. Motivation: Currently __bfloat16 is a typedef of short, which creates a problem where the compiler does not raise any alarms if it is used to add, subtract, multiply or divide, but the result of the calculation is actually meaningless. To solve this problem, a real scalar type __Bfloat16 needs to be introduced. It is mainly used for intrinsics, not available for C standard operators. __Bfloat16 will also be used for movement like passing parameter, load and store, vector initialization, vector shuffle, etc. It creates a need for a corresponding psABI. --- x86-64-ABI/low-level-sys-info.tex | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/x86-64-ABI/low-level-sys-info.tex b/x86-64-ABI/low-level-sys-info.tex index a8b69db..ba8db0d 100644 --- a/x86-64-ABI/low-level-sys-info.tex +++ b/x86-64-ABI/low-level-sys-info.tex @@ -302,6 +302,12 @@ be used to represent the type, is a family of integer types. This permits the use of these types in allocated arrays using the common sizeof(Array)/sizeof(ElementType) pattern. +\subsubsection{Special Types} + +The \code{__Bfloat16} type uses a 8-bit exponent and 7-bit mantissa. +It is used for \code{BF16} related intrinsics, it cannot be +used with standard C operators. + \subsubsection{Aggregates and Unions} Structures and unions assume the alignment of their most strictly @@ -563,8 +569,8 @@ The basic types are assigned their natural classes: \item Arguments of types (signed and unsigned) \code{_Bool}, \code{char}, \code{short}, \code{int}, \code{long}, \code{long long}, and pointers are in the INTEGER class. 
-\item Arguments of types \code{_Float16}, \code{float}, \code{double}, - \code{_Decimal32}, +\item Arguments of types \code{_Float16}, \code{__Bfloat16}, \code{float}, + \code{double}, \code{_Decimal32}, \code{_Decimal64} and \code{__m64} are in class SSE. \item Arguments of types \code{__float128}, \code{_Decimal128} and \code{__m128} are split into two halves. The least significant -- 2.18.1
Re: [PATCH] aarch64: Lower vcombine to GIMPLE
Andrew Carlotti via Gcc-patches writes: > Hi all, > > This lowers vcombine intrinsics to a GIMPLE vector constructor, which enables > better optimisation during GIMPLE passes. > > Bootstrapped and tested on aarch64-none-linux-gnu, and tested for > aarch64_be-none-linux-gnu via cross-compilation. > > > gcc/ > > * config/aarch64/aarch64-builtins.c > (aarch64_general_gimple_fold_builtin): Add combine. > > gcc/testsuite/ > > * gcc.target/aarch64/advsimd-intrinsics/combine.c: > New test. > > --- > > diff --git a/gcc/config/aarch64/aarch64-builtins.cc > b/gcc/config/aarch64/aarch64-builtins.cc > index > 5217dbdb2ac78bba0a669d22af6d769d1fe91a3d..9d52fb8c5a48c9b743defb340a85fb20a1c8f014 > 100644 > --- a/gcc/config/aarch64/aarch64-builtins.cc > +++ b/gcc/config/aarch64/aarch64-builtins.cc > @@ -2827,6 +2827,18 @@ aarch64_general_gimple_fold_builtin (unsigned int > fcode, gcall *stmt, > gimple_call_set_lhs (new_stmt, gimple_call_lhs (stmt)); > break; > > + BUILTIN_VDC (BINOP, combine, 0, AUTO_FP) > + BUILTIN_VD_I (BINOPU, combine, 0, NONE) > + BUILTIN_VDC_P (BINOPP, combine, 0, NONE) > + { > + if (BYTES_BIG_ENDIAN) > + std::swap(args[0], args[1]); We probably shouldn't do this swap in-place, since args refers directly to the gimple statement. > + tree ret_type = TREE_TYPE (gimple_call_lhs (stmt)); > + tree ctor = build_constructor_va (ret_type, 2, NULL_TREE, args[0], > NULL_TREE, args[1]); Minor formatting nit: lines should be under 80 chars. Looks good otherwise, thanks, and sorry for the slow review. Richard > + new_stmt = gimple_build_assign (gimple_call_lhs (stmt), ctor); > + } > + break; > + > /*lower store and load neon builtins to gimple. 
*/ > BUILTIN_VALL_F16 (LOAD1, ld1, 0, LOAD) > BUILTIN_VDQ_I (LOAD1_U, ld1, 0, LOAD) > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c > new file mode 100644 > index > ..d08faf7a4a160a1e83428ed9b270731bbf7b8c8a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/combine.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile { target { aarch64*-*-* } } } */ > +/* { dg-final { check-function-bodies "**" "" {-O[^0]} } } */ > +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ > + > +#include > + > +/* > +** foo: > +** umovw0, v1\.s\[1\] > +** ret > +*/ > + > +int32_t foo (int32x2_t a, int32x2_t b) > +{ > + int32x4_t c = vcombine_s32(a, b); > + return vgetq_lane_s32(c, 3); > +} > +