https://gcc.gnu.org/g:fee3adbac055c3ff2649fed866c66d44ebfcbe90

commit r15-4213-gfee3adbac055c3ff2649fed866c66d44ebfcbe90
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Wed Oct 9 13:57:36 2024 +0100

    aarch64: Fix SVE ACLE gimple folds for C++ LTO [PR116629]
    
    The SVE ACLE code has two ways of handling overloaded functions.
    One, used by C, is to define a single dummy function for each unique
    overloaded name, with resolve_overloaded_builtin then resolving calls
    to real non-overloaded functions.  The other, used by C++, is to
    define a separate function for each individual overload.
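    
    As a rough illustration (svadd is just an example here), the same
    source-level call is handled differently under the two models:
    
        #include <arm_sve.h>
    
        svint32_t
        f (svbool_t pg, svint32_t x, svint32_t y)
        {
          /* C: the front end rewrites this call to the non-overloaded
             svadd_s32_z.  C++: the call remains a call to a distinct
             function decl that exists for this particular overload.  */
          return svadd_z (pg, x, y);
        }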
    
    The builtins harness assigns integer function codes programmatically.
    However, LTO requires it to use the same assignment for every
    translation unit, regardless of language.  This means that C++ TUs
    need to create (unused) slots for the C overloads and that C TUs
    need to create (unused) slots for the C++ overloads.
    
    In many ways, it doesn't matter whether the LTO frontend itself
    uses the C approach or the C++ approach to defining overloaded
    functions, since the LTO frontend never has to resolve source-level
    overloading.  However, the C++ approach of defining a separate
    function for each overload means that C++ calls never need to
    be redirected to a different function.  Calls to an overload
    can appear in the LTO dump and survive until expand.  In contrast,
    calls to C's dummy overload functions are resolved by the front
    end and never survive to LTO (or expand).
    
    Some optimisations work by moving between sibling functions, such as _m
    to _x.  If the source function is an overload, the expected destination
    function is too.  The LTO frontend needs to define C++ overloads if it
    wants to do this optimisation properly for C++.
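    
    A sketch of the kind of fold involved (mirroring the new test below):
    
        svint32_t
        g (svint32_t x, svint32_t y)
        {
          /* All lanes are active, so this merging (_m) call can be treated
             as the "don't care" (_x) form and emitted as an unpredicated
             ADD.  */
          return svadd_m (svptrue_b32 (), x, y);
        }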
    
    The PR is about a tree checking failure caused by trying to use a
    stubbed-out C++ overload in LTO.  Dealing with that by detecting the
    stub (rather than changing which overloads are defined) would have
    turned this from an ice-on-valid to a missed optimisation.
    
    In future, it would probably make sense to redirect overloads to
    non-overloaded functions during gimple folding, in case that exposes
    more CSE opportunities.  But it'd probably be of limited benefit, since
    it should be rare for code to mix overloaded and non-overloaded uses of
    the same operation.  It also wouldn't be suitable for backports.
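    
    A hypothetical example of such mixing:
    
        svint32_t
        h (svbool_t pg, svint32_t x, svint32_t y)
        {
          /* Both additions compute the same value, but for C++ the
             overloaded and non-overloaded calls target different decls,
             so CSE does not currently unify them.  */
          svint32_t a = svadd_z (pg, x, y);      /* overloaded */
          svint32_t b = svadd_s32_z (pg, x, y);  /* non-overloaded */
          return svadd_z (pg, a, b);
        }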
    
    gcc/
            PR target/116629
            * config/aarch64/aarch64-sve-builtins.cc
            (function_builder::function_builder): Use direct overloads for LTO.
    
    gcc/testsuite/
            PR target/116629
            * gcc.target/aarch64/sve/acle/general/pr106326_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins.cc         |   2 +-
 .../aarch64/sve/acle/general/pr106326_2.c          | 381 +++++++++++++++++++++
 2 files changed, 382 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 5ff46212d18d..e7c703c987e8 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -1283,7 +1283,7 @@ function_builder::function_builder (handle_pragma_index pragma_index,
                                    bool function_nulls)
 {
   m_overload_type = build_function_type (void_type_node, void_list_node);
-  m_direct_overloads = lang_GNU_CXX ();
+  m_direct_overloads = lang_GNU_CXX () || in_lto_p;
 
   if (initial_indexes[pragma_index] == 0)
     {
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_2.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_2.c
new file mode 100644
index 000000000000..deb936cac5c3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/pr106326_2.c
@@ -0,0 +1,381 @@
+/* { dg-do link } */
+/* { dg-options "-O2 -flto -shared -fPIC --save-temps" } */
+/* { dg-require-effective-target shared } */
+/* { dg-require-effective-target fpic } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_sve.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** add1:
+**     add     z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**     ret
+*/
+svint32_t
+add1 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b8 (), x, y);
+}
+
+/*
+** add2:
+**     add     z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**     ret
+*/
+svint32_t
+add2 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b16 (), x, y);
+}
+
+/*
+** add3:
+**     add     z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**     ret
+*/
+svint32_t
+add3 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b32 (), x, y);
+}
+
+/*
+** add4:
+**     ...
+**     movprfx [^\n]+
+**     ...
+**     ret
+*/
+svint32_t
+add4 (svint32_t x, svint32_t y)
+{
+  return svadd_z (svptrue_b64 (), x, y);
+}
+
+/*
+** add5:
+**     add     z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**     ret
+*/
+svint32_t
+add5 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b8 (), x, y);
+}
+
+/*
+** add6:
+**     add     z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**     ret
+*/
+svint32_t
+add6 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b16 (), x, y);
+}
+
+/*
+** add7:
+**     add     z0\.s, (z1\.s, z0\.s|z0\.s, z1\.s)
+**     ret
+*/
+svint32_t
+add7 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b32 (), x, y);
+}
+
+/*
+** add8:
+**     ptrue   (p[0-7])\.d(?:, all)?
+**     add     z0\.s, \1/m, z0\.s, z1\.s
+**     ret
+*/
+svint32_t
+add8 (svint32_t x, svint32_t y)
+{
+  return svadd_m (svptrue_b64 (), x, y);
+}
+
+/*
+** add9:
+**     ptrue   (p[0-7])\.s(?:, all)?
+**     add     z0\.h, \1/m, z0\.h, z1\.h
+**     ret
+*/
+svint16_t
+add9 (svint16_t x, svint16_t y)
+{
+  return svadd_m (svptrue_b32 (), x, y);
+}
+
+/*
+** and1:
+**     and     z0\.s, z0\.s, #(?:0x)?1
+**     ret
+*/
+svint32_t
+and1 (svint32_t x)
+{
+  return svand_z (svptrue_b8 (), x, 1);
+}
+
+/*
+** and2:
+**     and     z0\.s, z0\.s, #(?:0x)?1
+**     ret
+*/
+svint32_t
+and2 (svint32_t x)
+{
+  return svand_z (svptrue_b16 (), x, 1);
+}
+
+/*
+** and3:
+**     and     z0\.s, z0\.s, #(?:0x)?1
+**     ret
+*/
+svint32_t
+and3 (svint32_t x)
+{
+  return svand_z (svptrue_b32 (), x, 1);
+}
+
+/*
+** and4:
+**     (?!and  z0\.s, z0\.s, #).*
+**     ret
+*/
+svint32_t
+and4 (svint32_t x)
+{
+  return svand_z (svptrue_b64 (), x, 1);
+}
+
+/*
+** and5:
+**     and     z0\.s, z0\.s, #(?:0x)?1
+**     ret
+*/
+svint32_t
+and5 (svint32_t x)
+{
+  return svand_m (svptrue_b8 (), x, 1);
+}
+
+/*
+** and6:
+**     and     z0\.s, z0\.s, #(?:0x)?1
+**     ret
+*/
+svint32_t
+and6 (svint32_t x)
+{
+  return svand_m (svptrue_b16 (), x, 1);
+}
+
+/*
+** and7:
+**     and     z0\.s, z0\.s, #(?:0x)?1
+**     ret
+*/
+svint32_t
+and7 (svint32_t x)
+{
+  return svand_m (svptrue_b32 (), x, 1);
+}
+
+/*
+** and8:
+**     (?!and  z0\.s, z0\.s, #).*
+**     ret
+*/
+svint32_t
+and8 (svint32_t x)
+{
+  return svand_m (svptrue_b64 (), x, 1);
+}
+
+/*
+** and9:
+** (
+**     and     p0\.b, p0/z, p1\.b, p1\.b
+** |
+**     and     p0\.b, p1/z, p0\.b, p0\.b
+** )
+**     ret
+*/
+svbool_t
+and9 (svbool_t x, svbool_t y)
+{
+  return svand_z (svptrue_b8 (), x, y);
+}
+
+/*
+** not1:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     not     z0\.s, \1/m, z1\.s
+**     ret
+*/
+svint32_t
+not1 (svint32_t x, svint32_t y)
+{
+  return svnot_m (x, svptrue_b8 (), y);
+}
+
+/*
+** cvt1:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     fcvtzs  z0\.s, \1/m, z0\.h
+**     ret
+*/
+svint32_t
+cvt1 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt2:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     fcvtzs  z0\.s, \1/m, z0\.h
+**     ret
+*/
+svint32_t
+cvt2 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt3:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     fcvtzs  z0\.s, \1/m, z0\.h
+**     ret
+*/
+svint32_t
+cvt3 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt4:
+**     ...
+**     movprfx [^\n]+
+**     ...
+**     ret
+*/
+svint32_t
+cvt4 (svfloat16_t x)
+{
+  return svcvt_s32_z (svptrue_b64 (), x);
+}
+
+/*
+** cvt5:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     fcvt    z0\.h, \1/m, z0\.s
+**     ret
+*/
+svfloat16_t
+cvt5 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt6:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     fcvt    z0\.h, \1/m, z0\.s
+**     ret
+*/
+svfloat16_t
+cvt6 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt7:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     fcvt    z0\.h, \1/m, z0\.s
+**     ret
+*/
+svfloat16_t
+cvt7 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt8:
+**     ...
+**     movprfx [^\n]+
+**     ...
+**     ret
+*/
+svfloat16_t
+cvt8 (svfloat32_t x)
+{
+  return svcvt_f16_z (svptrue_b64 (), x);
+}
+
+/*
+** cvt9:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     scvtf   z0\.h, \1/m, z0\.h
+**     ret
+*/
+svfloat16_t
+cvt9 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b8 (), x);
+}
+
+/*
+** cvt10:
+**     ptrue   (p[0-7])\.b(?:, all)?
+**     scvtf   z0\.h, \1/m, z0\.h
+**     ret
+*/
+svfloat16_t
+cvt10 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b16 (), x);
+}
+
+/*
+** cvt11:
+**     ...
+**     movprfx [^\n]+
+**     ...
+**     ret
+*/
+svfloat16_t
+cvt11 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b32 (), x);
+}
+
+/*
+** cvt12:
+**     ...
+**     movprfx [^\n]+
+**     ...
+**     ret
+*/
+svfloat16_t
+cvt12 (svint16_t x)
+{
+  return svcvt_f16_z (svptrue_b64 (), x);
+}
+
+#ifdef __cplusplus
+}
+#endif
