[PATCH V2] Enable small loop unrolling for O2

2022-11-01 Thread Hongyu Wang via Gcc-patches
Hi, this is the updated patch of
https://gcc.gnu.org/pipermail/gcc-patches/2022-October/604345.html,
which uses targetm.loop_unroll_adjust as gate to enable small loop unroll.

This patch does not change rs6000/s390 since I don't have machine to 
test them, but I suppose the default behavior is the same since they
enable flag_unroll_loops at O2.

Bootstrapped & regrtested on x86_64-pc-linux-gnu.

Ok for trunk?

-- Patch content 

Modern processors have multiple-way instruction decoders.
For x86, icelake/zen3 have 5 uops, so for a small loop with <= 4
instructions (usually 3 uops with a cmp/jmp pair that can be
macro-fused), the decoder would have 2 uops bubble for each iteration
and the pipeline could not be fully utilized.

Therefore, this patch enables loop unrolling for small size loop at O2
to fulfill the decoder as much as possible. It turns on rtl loop
unrolling when targetm.loop_unroll_adjust exists and O2 plus speed only.
In x86 backend the default behavior is to unroll small loops with less
than 4 insns by 1 time.

This improves 548.exchange2 by 9% on icelake and 7.4% on zen3 with
0.9% codesize increment. For other benchmarks the variations are minor
and overall codesize increased by 0.2%.

The kernel image size increased by 0.06%, and no impact on eembc.

gcc/ChangeLog:

* common/config/i386/i386-common.cc (ix86_optimization_table):
Enable small loop unroll at O2 by default.
* config/i386/i386.cc (ix86_loop_unroll_adjust): Adjust unroll
factor if -munroll-only-small-loops enabled and -funroll-loops/
-funroll-all-loops are disabled.
* config/i386/i386.opt: Add -munroll-only-small-loops,
-param=x86-small-unroll-ninsns= for loop insn limit,
-param=x86-small-unroll-factor= for unroll factor.
* doc/invoke.texi: Document -munroll-only-small-loops,
x86-small-unroll-ninsns and x86-small-unroll-factor.
* loop-init.cc (pass_rtl_unroll_loops::gate): Enable rtl
loop unrolling for -O2-speed and above if target hook
loop_unroll_adjust exists.

gcc/testsuite/ChangeLog:

* gcc.dg/guality/loop-1.c: Add additional option
  -mno-unroll-only-small-loops.
* gcc.target/i386/pr86270.c: Add -mno-unroll-only-small-loops.
* gcc.target/i386/pr93002.c: Likewise.
---
 gcc/common/config/i386/i386-common.cc   |  1 +
 gcc/config/i386/i386.cc | 18 ++
 gcc/config/i386/i386.opt| 13 +
 gcc/doc/invoke.texi | 16 
 gcc/loop-init.cc| 10 +++---
 gcc/testsuite/gcc.dg/guality/loop-1.c   |  2 ++
 gcc/testsuite/gcc.target/i386/pr86270.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr93002.c |  2 +-
 8 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index f66bdd5a2af..c6891486078 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -1724,6 +1724,7 @@ static const struct default_options 
ix86_option_optimization_table[] =
 /* The STC algorithm produces the smallest code at -Os, for x86.  */
 { OPT_LEVELS_2_PLUS, OPT_freorder_blocks_algorithm_, NULL,
   REORDER_BLOCKS_ALGORITHM_STC },
+{ OPT_LEVELS_2_PLUS_SPEED_ONLY, OPT_munroll_only_small_loops, NULL, 1 },
 /* Turn off -fschedule-insns by default.  It tends to make the
problem with not enough registers even worse.  */
 { OPT_LEVELS_ALL, OPT_fschedule_insns, NULL, 0 },
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index c0f37149ed0..0f94a3b609e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23827,6 +23827,24 @@ ix86_loop_unroll_adjust (unsigned nunroll, class loop 
*loop)
   unsigned i;
   unsigned mem_count = 0;
 
+  /* Unroll small size loop when unroll factor is not explicitly
+ specified.  */
+  if (!(flag_unroll_loops
+   || flag_unroll_all_loops
+   || loop->unroll))
+{
+  nunroll = 1;
+
+  /* Any explicit -f{no-}unroll-{all-}loops turns off
+-munroll-only-small-loops.  */
+  if (ix86_unroll_only_small_loops
+ && !OPTION_SET_P (flag_unroll_loops))
+   if (loop->ninsns <= (unsigned) ix86_small_unroll_ninsns)
+ nunroll = (unsigned) ix86_small_unroll_factor;
+
+  return nunroll;
+}
+
   if (!TARGET_ADJUST_UNROLL)
  return nunroll;
 
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 53d534f6392..6da9c8d670d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1224,3 +1224,16 @@ mavxvnniint8
 Target Mask(ISA2_AVXVNNIINT8) Var(ix86_isa_flags2) Save
 Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2 and
 AVXVNNIINT8 built-in functions and code generation.
+
+munroll-only-small-loops
+Target Var(ix86_unroll_only_small_loops) Init(0) Save
+Enable conservative small loop unrolling.
+

[r13-3596 Regression] FAIL: gcc.dg/guality/pr54693-2.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects -DPREVENT_OPTIMIZATION line 21 x == 10 - i on Linux/x86_64

2022-11-01 Thread haochen.jiang via Gcc-patches
On Linux/x86_64,

e7310e24b1c0ca67b1bb507c1330b2bf39e59e32 is the first bad commit
commit e7310e24b1c0ca67b1bb507c1330b2bf39e59e32
Author: Andrew MacLeod 
Date:   Tue Oct 25 16:42:41 2022 -0400

Make ranger vrp1 default.

caused

FAIL: gcc.dg/guality/pr54693-2.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 21 x == 10 - i

with GCC configured with

../../gcc/configure 
--prefix=/export/users/haochenj/src/gcc-bisect/master/master/r13-3596/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="guality.exp=gcc.dg/guality/pr54693-2.c 
--target_board='unix{-m32}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="guality.exp=gcc.dg/guality/pr54693-2.c --target_board='unix{-m32\ 
-march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="guality.exp=gcc.dg/guality/pr54693-2.c 
--target_board='unix{-m64}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="guality.exp=gcc.dg/guality/pr54693-2.c --target_board='unix{-m64\ 
-march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at haochen dot jiang at intel.com)


[PATCH 3/3] Update float 128-bit conversions, PR target/107299.

2022-11-01 Thread Michael Meissner via Gcc-patches
This patch fixes two tests that are still failing when long double is IEEE
128-bit after the previous 2 patches for PR target/107299 have been applied.
The tests are:

gcc.target/powerpc/convert-fp-128.c
gcc.target/powerpc/pr85657-3.c

This patch is a rewrite of the patch submitted on August 18th:

| https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599988.html

This patch reworks the conversions between 128-bit binary floating point types.
Previously, we would call rs6000_expand_float128_convert to do all conversions.
Now, we only define the conversions between the same representation that turn
into a NOP.  The appropriate extend or truncate insn is generated, and after
register allocation, it is converted to a move.

This patch also fixes two places where we want to override the external name
for the conversion function, and the wrong optab was used.  Previously,
rs6000_expand_float128_convert would handle the move or generate the call as
needed.  Now, it lets the machine independent code generate the call.  But if
we use the machine independent code to generate the call, we need to update the
name for two optabs where a truncate would be used in terms of converting
between the modes.  This patch updates those two optabs.

I tested this patch on:

1)  LE Power10 using --with-cpu=power10 --with-long-double-format=ieee
2)  LE Power10 using --with-cpu=power10 --with-long-double-format=ibm
3)  LE Power9  using --with-cpu=power9  --with-long-double-format=ibm
4)  BE Power8  using --with-cpu=power8  --with-long-double-format=ibm

In the past I have also tested this exact patch on the following systems:

1)  LE Power10 using --with-cpu=power9  --with-long-double-format=ibm
2)  LE Power10 using --with-cpu=power8  --with-long-double-format=ibm
3)  LE Power10 using --with-cpu=power10 --with-long-double-format=ibm

There were no regressions in the bootstrap process or running the tests (after
applying all 3 patches for PR target/107299).  Can I check this patch into the
trunk?

2022-11-01   Michael Meissner  

gcc/

PR target/107299
* config/rs6000/rs6000.cc (init_float128_ieee): Use the correct
float_extend or float_truncate optab based on how the machine converts
between IEEE 128-bit and IBM 128-bit.
* config/rs6000/rs6000.md (IFKF): Delete.
(IFKF_reg): Delete.
(extendiftf2): Rewrite to be a move if IFmode and TFmode are both IBM
128-bit.  Do not run if TFmode is IEEE 128-bit.
(extendifkf2): Delete.
(extendtfkf2): Delete.
(extendtfif2): Delete.
(trunciftf2): Delete.
(truncifkf2): Delete.
(trunckftf2): Delete.
(extendkftf2): Implement conversion of IEEE 128-bit types as a move.
(trunctfif2): Delete.
(trunctfkf2): Implement conversion of IEEE 128-bit types as a move.
(extendtf2_internal): Delete.
(extendtf2_internal): Delete.
---
 gcc/config/rs6000/rs6000.cc |   4 +-
 gcc/config/rs6000/rs6000.md | 177 ++--
 2 files changed, 50 insertions(+), 131 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc
index 8a8357512c0..9a5907c7130 100644
--- a/gcc/config/rs6000/rs6000.cc
+++ b/gcc/config/rs6000/rs6000.cc
@@ -11156,11 +11156,11 @@ init_float128_ieee (machine_mode mode)
   set_conv_libfunc (trunc_optab, SFmode, mode, "__trunckfsf2");
   set_conv_libfunc (trunc_optab, DFmode, mode, "__trunckfdf2");
 
-  set_conv_libfunc (sext_optab, mode, IFmode, "__trunctfkf2");
+  set_conv_libfunc (trunc_optab, mode, IFmode, "__trunctfkf2");
   if (mode != TFmode && FLOAT128_IBM_P (TFmode))
set_conv_libfunc (sext_optab, mode, TFmode, "__trunctfkf2");
 
-  set_conv_libfunc (trunc_optab, IFmode, mode, "__extendkftf2");
+  set_conv_libfunc (sext_optab, IFmode, mode, "__extendkftf2");
   if (mode != TFmode && FLOAT128_IBM_P (TFmode))
set_conv_libfunc (trunc_optab, TFmode, mode, "__extendkftf2");
 
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 3bae303086b..4880df5c51c 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -543,12 +543,6 @@ (define_mode_iterator FMOVE128_GPR [TI
 ; Iterator for 128-bit VSX types for pack/unpack
 (define_mode_iterator FMOVE128_VSX [V1TI KF])
 
-; Iterators for converting to/from TFmode
-(define_mode_iterator IFKF [IF KF])
-
-; Constraints for moving IF/KFmode.
-(define_mode_attr IFKF_reg [(IF "d") (KF "wa")])
-
 ; Whether a floating point move is ok, don't allow SD without hardware FP
 (define_mode_attr fmove_ok [(SF "")
(DF "")
@@ -9096,106 +9090,65 @@ (define_insn "*ieee_128bit_vsx_nabs2_internal"
   "xxlor %x0,%x1,%x2"
   [(set_attr "type" "veclogical")])
 
-;; Float128 conversion functions.  These expand to library function calls.
-;; We use expand to convert from IBM double double to IEEE 128-bit
-;; and trunc for 

[PATCH 2/3] Make __float128 use the _Float128 type, PR target/107299

2022-11-01 Thread Michael Meissner via Gcc-patches
This patch fixes the issue that GCC cannot build when the default long double
is IEEE 128-bit.  It fails in building libgcc, specifically when it is trying
to build the __mulkc3 function in libgcc.  It is failing in gimple-range-fold.cc
during the evrp pass.  Ultimately it is failing because the code declared the
type to use TFmode but it used F128 functions (i.e. KFmode).

typedef float TFtype __attribute__((mode (TF)));
typedef __complex float TCtype __attribute__((mode (TC)));

TCtype
__mulkc3_sw (TFtype a, TFtype b, TFtype c, TFtype d)
{
  TFtype ac, bd, ad, bc, x, y;
  TCtype res;

  ac = a * c;
  bd = b * d;
  ad = a * d;
  bc = b * c;

  x = ac - bd;
  y = ad + bc;

  if (__builtin_isnan (x) && __builtin_isnan (y))
{
  _Bool recalc = 0;
  if (__builtin_isinf (a) || __builtin_isinf (b))
{

  a = __builtin_copysignf128 (__builtin_isinf (a) ? 1 : 0, a);
  b = __builtin_copysignf128 (__builtin_isinf (b) ? 1 : 0, b);
  if (__builtin_isnan (c))
c = __builtin_copysignf128 (0, c);
  if (__builtin_isnan (d))
d = __builtin_copysignf128 (0, d);
  recalc = 1;
}
  if (__builtin_isinf (c) || __builtin_isinf (d))
{

  c = __builtin_copysignf128 (__builtin_isinf (c) ? 1 : 0, c);
  d = __builtin_copysignf128 (__builtin_isinf (d) ? 1 : 0, d);
  if (__builtin_isnan (a))
a = __builtin_copysignf128 (0, a);
  if (__builtin_isnan (b))
b = __builtin_copysignf128 (0, b);
  recalc = 1;
}
  if (!recalc
  && (__builtin_isinf (ac) || __builtin_isinf (bd)
  || __builtin_isinf (ad) || __builtin_isinf (bc)))
{

  if (__builtin_isnan (a))
a = __builtin_copysignf128 (0, a);
  if (__builtin_isnan (b))
b = __builtin_copysignf128 (0, b);
  if (__builtin_isnan (c))
c = __builtin_copysignf128 (0, c);
  if (__builtin_isnan (d))
d = __builtin_copysignf128 (0, d);
  recalc = 1;
}
  if (recalc)
{
  x = __builtin_inff128 () * (a * c - b * d);
  y = __builtin_inff128 () * (a * d + b * c);
}
}

  __real__ res = x;
  __imag__ res = y;
  return res;
}

Currently GCC uses the long double type node for __float128 if long double is
IEEE 128-bit.  It did not use the node for _Float128.

Originally this was noticed if you call the nansq function to make a signaling
NaN (nansq is mapped to nansf128).  Because the type node for _Float128 is
different from __float128, the machine independent code converts signaling NaNs
to quiet NaNs if the types are not compatible.  The following tests used to
fail when run on a system where long double is IEEE 128-bit:

gcc.dg/torture/float128-nan.c
gcc.target/powerpc/nan128-1.c

This patch makes both __float128 and _Float128 use the same type node.

One side effect of not using the long double type node for __float128 is that we
must only use KFmode for _Float128/__float128.  The libstdc++ library won't
build if we use TFmode for _Float128 and __float128 when long double is IEEE
128-bit.

Another minor side effect is that the f128 round to odd fused multiply-add
function will not merge negation with the FMA operation when the type is long
double.  If the type is __float128 or _Float128, then it will continue to do the
optimization.  The round to odd functions are defined in terms of __float128
arguments.  For example:

long double
do_fms (long double a, long double b, long double c)
{
return __builtin_fmaf128_round_to_odd (a, b, -c);
}

will generate (assuming -mabi=ieeelongdouble):

xsnegqp 4,4
xsmaddqpo 4,2,3
xxlor 34,36,36

while:

__float128
do_fms (__float128 a, __float128 b, __float128 c)
{
return __builtin_fmaf128_round_to_odd (a, b, -c);
}

will generate:

xsmsubqpo 4,2,3
xxlor 34,36,36

I tested all 3 patchs for PR target/107299 on:

1)  LE Power10 using --with-cpu=power10 --with-long-double-format=ieee
2)  LE Power10 using --with-cpu=power10 --with-long-double-format=ibm
3)  LE Power9  using --with-cpu=power9  --with-long-double-format=ibm
4)  BE Power8  using --with-cpu=power8  --with-long-double-format=ibm

Once all 3 patches have been applied, we can once again build GCC when long
double is IEEE 128-bit.  There were no other regressions 

[PATCH 1/3] Rework 128-bit complex multiply and divide, PR target/107299

2022-11-01 Thread Michael Meissner via Gcc-patches
This function reworks how the complex multiply and divide built-in functions are
done.  Previously we created built-in declarations for doing long double complex
multiply and divide when long double is IEEE 128-bit.  The old code also did not
support __ibm128 complex multiply and divide if long double is IEEE 128-bit.

In terms of history, I wrote the original code just as I was starting to test
GCC on systems where IEEE 128-bit long double was the default.  At the time, we
had not yet started mangling the built-in function names as a way to bridge
going from a system with 128-bit IBM long double to 128-bit IEEE long double.

The original code depends on there only being two 128-bit types involved.  With
the next patch in this series, this assumption will no longer be true.  When
long double is IEEE 128-bit, there will be 2 IEEE 128-bit types (one for the
explicit __float128/_Float128 type and one for long double).

The problem is we cannot create two separate built-in functions that resolve to
the same name.  This is a requirement of add_builtin_function and the C front
end.  That means for the 3 possible modes (IFmode, KFmode, and TFmode), you can
only use 2 of them.

This code does not create the built-in declaration with the changed name.
Instead, it uses the TARGET_MANGLE_DECL_ASSEMBLER_NAME hook to change the name
before it is written out to the assembler file like it now does for all of the
other long double built-in functions.

We need to disable using this mapping when we are building libgcc, specifically
when it is building the floating point 128-bit multiply and divide functions.
The flag that is used when libgcc is built (-fbuilding-libgcc) is only available
in the C/C++ front ends.  We need to remember that we are building libgcc in the
rs6000-c.cc support to be able to use this later to decide whether to mangle
the decl assembler name or not.

When I wrote these patches, I discovered that __ibm128 complex multiply and
divide had originally not been supported if long double is IEEE 128-bit as it
would generate calls to __mulic3 and __divic3.  I added tests in the testsuite
to verify that the correct name (i.e. __multc3 and __divtc3) is used in this
case.

I tested all 3 patchs for PR target/107299 on:

1)  LE Power10 using --with-cpu=power10 --with-long-double-format=ieee
2)  LE Power10 using --with-cpu=power10 --with-long-double-format=ibm
3)  LE Power9  using --with-cpu=power9  --with-long-double-format=ibm
4)  BE Power8  using --with-cpu=power8  --with-long-double-format=ibm

Once all 3 patches have been applied, we can once again build GCC when long
double is IEEE 128-bit.  There were no other regressions with these patches.
Can I check these patches into the trunk?

2022-11-01   Michael Meissner  

gcc/

PR target/107299
* config/rs6000/rs6000-c.cc (rs6000_cpu_cpp_builtins): Set
building_libgcc.
* config/rs6000/rs6000.cc (create_complex_muldiv): Delete.
(init_float128_ieee): Delete code to switch complex multiply and divide
for long double.
(complex_multiply_builtin_code): New helper function.
(complex_divide_builtin_code): Likewise.
(rs6000_mangle_decl_assembler_name): Add support for mangling the name
of complex 128-bit multiply and divide built-in functions.
* config/rs6000/rs6000.opt (building_libgcc): New target variable.

gcc/testsuite/

PR target/107299
* gcc.target/powerpc/divic3-1.c: New test.
* gcc.target/powerpc/divic3-2.c: Likewise.
* gcc.target/powerpc/mulic3-1.c: Likewise.
* gcc.target/powerpc/mulic3-2.c: Likewise.
---
 gcc/config/rs6000/rs6000-c.cc   |   8 ++
 gcc/config/rs6000/rs6000.cc | 110 +++-
 gcc/config/rs6000/rs6000.opt|   4 +
 gcc/testsuite/gcc.target/powerpc/divic3-1.c |  18 
 gcc/testsuite/gcc.target/powerpc/divic3-2.c |  17 +++
 gcc/testsuite/gcc.target/powerpc/mulic3-1.c |  18 
 gcc/testsuite/gcc.target/powerpc/mulic3-2.c |  17 +++
 7 files changed, 145 insertions(+), 47 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/divic3-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/divic3-2.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/mulic3-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/mulic3-2.c

diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 56609462629..5c2f3bcee9f 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -780,6 +780,14 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
   || DEFAULT_ABI == ABI_ELFv2
   || (DEFAULT_ABI == ABI_AIX && !rs6000_compat_align_parm))
 builtin_define ("__STRUCT_PARM_ALIGN__=16");
+
+  /* Store whether or not we are building libgcc.  This is needed to disable
+ generating the alternate names for 128-bit complex multiply and divide.
+ We need to disable generating __multc3, __divtc3, __mulkc3, and 

Patch [0/3] for PR target/107299 (GCC does not build on PowerPC when long double is IEEE 128-bit)

2022-11-01 Thread Michael Meissner via Gcc-patches
These 3 patches fix the problems with building GCC on PowerPC systems when long
double is configured to use the IEEE 128-bit format.

There are 3 patches in this patch set.  The first two patches are required to
fix the basic problem.  The third patch fixes some issue that were noticed
along the way.

The basic issue is internally within GCC there are several types for 128-bit
floating point.  The types are:

1)  The long double type (TFmode or possibly DFmode).  In the normal case,
long double is 128-bits (TFmode) and depending on the configuration
switches and the switches passed by the user at compilation time, long
double is either the 128-bit IBM double-double type or IEEE 128-bit.

2)  The type for __ibm128.  If long double is IBM 128-bit double-double,
internally within the compiler, this type is the same as the long
double type.  If long double is either IEEE 128-bit or is 64-bit, then
this type is a separate type.

3)  The type for _Float128.  This type is always IEEE 128-bit if it exists.
While it is a separate internal type, currently if long double is IEEE
128-bit, this type uses TFmode once it gets to RTL, but within Gimple
it is a separate type.  If long double is not IEEE 128-bit, then this
type uses KFmode.  All of the f128 math functions defined by the
compiler use this type.  In the past, _Float128 was a C extended type,
but now it is a part of the C/C++ 2x standards.

4)  The type for __float128.  The history is I implemented __float128
first, and several releases later, we added _Float128 as a standard C
type.  Unfortunately, I didn't think things through enough when
_Float128 came out.  Like __ibm128, it uses the long double type if
long double is IEEE 128-bit, and now it uses the _Float128 type if long
double is not IEEE 128-bit.  IMHO, this is the major problem.  The two
IEEE 128-bit types should use the same type internally (or at least one
should be a qualified type of the other).  Before we started adding
more support for _Float128, it mostly works, but now it doesn't with
more optimizations being done.

5)  The error occurs in building _mulkc3 in libgcc, when the TFmode type in
the code is defined to use attribute((mode(TF))), but the functions
that are called all have _Float128 arguments.  These are separate
types, and ultimately one of the consistency checks fails because they
are different types.

There are 3 patches in this set:

1)  The first patch rewrites how the complex 128-bit multiply and divide
functions are done in the compiler.  In the old scheme, essentially
there were only two types ever being used, the long double type, and
the not long double type.  The original code would make the names
called of these functions to be __multc3/__divtc3 or
__mulkc3/__divkc3.  This worked because there were only two types.
With straightening out the types, so __float128/_Float128 is never the
long double type, there are potentially 3-4 types.  However, the C
front end and the middle end code will not let use create two built-in
functions that have the same name.

So I ripped out this code, and I hopefully replaced it with cleaner
code that is in patch #1.  This patch needs to be in the compiler
before the second patch can be installed.

2)  The second patch fixes the problem of __float128 and _Float128 not
being the same if long double is IEEE 128-bit.  After this patch, both
_Float128 and __float128 types will always use the KFmode type.  The
stdc++ library will not build if we use TFmode for these types due to
the other changes.

There is a minor codegen issue that if you explicitly use long double
and call the F128 FMA (fused multiply-add) round to odd functions that
are defined to use __float128/_Float128 arguments.  While we might be
able to optimize these later, I don't think it is important to optimize
the use of long double instead of __float128/_Float128.  Note, if you
use the proper __float128/_Float128 types instead of long double, the
code does the optimization.

By doing this change, it also fixes two tests that have been broken on
IEEE 128-bit long double systems (float128-cmp2-runnable.c and
nan128-1.c).  These two tests use __float128 variables and call nansq
to create a signaling NaN.  Nansq is defined to be __builtin_nansf128,
which returns a _Float128 Nan.  However, since in the current
implementation before these patches, __float128 is a different type
than _Float128 when long double is IEEE 128-bit, the machine
independent code converts the signaling NaN into a non-signaling NaN.

RE: [wwwdocs] [GCC13] Mention Intel __bf16 support in AVX512BF16 intrinsics.

2022-11-01 Thread Kong, Lingling via Gcc-patches
> > diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
> > index 7c6bfa6e..cd0282f1 100644
> > --- a/htdocs/gcc-13/changes.html
> > +++ b/htdocs/gcc-13/changes.html
> > @@ -230,6 +230,8 @@ a work-in-progress.
> >For both C and C++ the __bf16 type is supported on
> >x86 systems with SSE2 and above enabled.
> >
> > +  Use __bf16 type for AVX512BF16 intrinsics.
> Could you add more explanations. Like originally it's ..., now it's ..., and 
> what's
> the difference when users compile the same source code(which contains
> avx512bf16 intrinsics) with gcc12(and before) and GCC13.
> > +  
> >  
> >
> >  
> > --
> > 2.18.2
> >
Yes,  changed it. Thanks a lot!

Subject: [PATCH] Mention Intel __bf16 support in AVX512BF16 intrinsics.

---
 htdocs/gcc-13/changes.html | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
index 7c6bfa6e..a35f4fab 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -230,6 +230,12 @@ a work-in-progress.
   For both C and C++ the __bf16 type is supported on
   x86 systems with SSE2 and above enabled.
   
+  Use the __bf16 type for AVX512BF16 intrinsics. Previously we used
+  short to represent bf16. Now we have introduced __bf16 into the x86 psABI,
+  so we switched the AVX512BF16 intrinsics to the new __bf16 type.
+  When users compile source code containing AVX512BF16 intrinsics with
+  GCC 13, SSE2 must be enabled, which differs from GCC 12 (and before).
+  
 

 
--
2.18.2

BRs,
Lingling


[OG12] [committed] amdgcn: Enable SIMD vectorization of math library functions

2022-11-01 Thread Kwok Cheung Yeung

Hello

I have committed the following patches onto the devel/omp/gcc-12 
development branch:


863579c4e30 amdgcn: Enable SIMD vectorization of math functions
bd9a6106b95 amdgcn: Add SIMD versions of math routines to libgcc
d3a2a1cc424 amdgcn: Add builtins for vector floor/floorf
a3c04a367a9 amdgcn: Fix expansion of builtin for vector fabs operation

These patches implement a vectorized version of most of the C math 
library for AMD GCN. These routines will be used when math functions are 
used in auto-vectorized code.


Note that -fno-math-errno must be specified on the command line in most 
cases before the compiler will consider using these functions.


Vectors smaller than the native 64 element ones are also supported (by 
masking off the unused lanes), which can be useful for SLP vectorized code.


Kwok Yeung


Re: [PATCH v2] RISC-V modified add3 for large stack frame optimization [PR105733]

2022-11-01 Thread Jeff Law via Gcc-patches



On 11/1/22 11:25, Kevin Lee wrote:

This is the updated patch of
https://gcc.gnu.org/pipermail/gcc-patches/2022-September/601824.html. Since
the riscv-selftest.cc has been added, this version of the patch adds the
logic in riscv-selftest.cc to also consider parallel insns.
   The patch has been tested with rv64imafdc / rv64imac / rv32imafdc /
rv32imac and no additional failures were detected in the testsuite.

gcc/ChangeLog:
Jim Wilson
Michael Collison
Kevin Lee
* config/riscv/predicates.md (const_lui_operand): New Predicate.
(add_operand): Ditto.
(reg_or_const_int_operand): Ditto.
* config/riscv/riscv-protos.h (riscv_eliminable_reg): New
function.
* config/riscv/riscv-selftests.cc (calculate_x_in_sequence):
Consider Parallel insns.
* config/riscv/riscv.cc (riscv_eliminable_reg): New function.
(riscv_adjust_libcall_cfi_prologue): Use gen_rtx_SET and
gen_rtx_fmt_ee instead of gen_add3_insn.
(riscv_adjust_libcall_cfi_epilogue): Ditto.
* config/riscv/riscv.md (addsi3): Remove.
(add3): New instruction for large stack frame
optimization.
(add3_internal): Ditto.
(adddi3): Remove.
(add3_internal2): New instruction for insns generated in
the prologue and epilogue pass.


It looks like your mailer completely messed up the formatting of the 
patch.  Please resend it as a plaintext attachment.  It's basically 
unreadable as-is.



I went back and looked at the original thread, for the saxpy example, 
the patch made a notable improvement in the setup code, but actually 
lengthened the loop by one instruction, though it has eliminated two 
memory loads in the loop, replacing them with arithmetic, which is 
probably a win.


The loop still seems a bit odd which may point to further improvements 
that could be made to this patch.  Consider this fragment of the loop:


addi a3,sp,-864
sh2add a3,a5,a3
flw fa5,864(a3)

Note the +-864.  Don't those just cancel out?


Jeff




Re: [PATCH 0/9] Add debug_annotate attributes

2022-11-01 Thread Yonghong Song via Gcc-patches

Hi, Jose and David,

Any progress on implement debug_annotate attribute in gcc?

Thanks,

Yonghong


On 6/15/22 3:56 PM, Yonghong Song wrote:



On 6/15/22 1:57 PM, David Faust wrote:



On 6/14/22 22:53, Yonghong Song wrote:



On 6/7/22 2:43 PM, David Faust wrote:

Hello,

This patch series adds support for:

- Two new C-language-level attributes that allow to associate (to 
"annotate" or
    to "tag") particular declarations and types with arbitrary 
strings. As
    explained below, this is intended to be used to, for example, 
characterize

    certain pointer types.

- The conveyance of that information in the DWARF output in the form 
of a new

    DIE: DW_TAG_GNU_annotation.

- The conveyance of that information in the BTF output in the form 
of two new

    kinds of BTF objects: BTF_KIND_DECL_TAG and BTF_KIND_TYPE_TAG.

All of these facilities are being added to the eBPF ecosystem, and 
support for

them exists in some form in LLVM.

Purpose
===

1)  Addition of C-family language constructs (attributes) to specify 
free-text

  tags on certain language elements, such as struct fields.

  The purpose of these annotations is to provide additional 
information about
  types, variables, and function parameters of interest to the 
kernel. A
  driving use case is to tag pointer types within the linux 
kernel and eBPF
  programs with additional semantic information, such as 
'__user' or '__rcu'.


  For example, consider the linux kernel function do_execve with 
the

  following declaration:

    static int do_execve(struct filename *filename,
   const char __user *const __user *__argv,
   const char __user *const __user *__envp);

  Here, __user could be defined with these annotations to record 
semantic
  information about the pointer parameters (e.g., they are 
user-provided) in
  DWARF and BTF information. Other kernel facilities such as the 
eBPF verifier

  can read the tags and make use of the information.

2)  Conveying the tags in the generated DWARF debug info.

  The main motivation for emitting the tags in DWARF is that the 
Linux kernel
  generates its BTF information via pahole, using DWARF as a 
source:


  ++  BTF  BTF   +--+
  | pahole |---> vmlinux.btf --->| verifier |
  ++ +--+
  ^    ^
  |    |
    DWARF |    BTF |
  |    |
   vmlinux  +-+
   module1.ko   | BPF program |
   module2.ko   +-+
 ...

  This is because:

  a)  Unlike GCC, LLVM will only generate BTF for BPF programs.

  b)  GCC can generate BTF for whatever target with -gbtf, but 
there is no

  support for linking/deduplicating BTF in the linker.

  In the scenario above, the verifier needs access to the 
pointer tags of
  both the kernel types/declarations (conveyed in the DWARF and 
translated
  to BTF by pahole) and those of the BPF program (available 
directly in BTF).


  Another motivation for having the tag information in DWARF, 
unrelated to
  BPF and BTF, is that the drgn project (another DWARF consumer) 
also wants
  to benefit from these tags in order to differentiate between 
different

  kinds of pointers in the kernel.

3)  Conveying the tags in the generated BTF debug info.

  This is easy: the main purpose of having this info in BTF is 
for the
  compiled eBPF programs. The kernel verifier can then access 
the tags

  of pointers used by the eBPF programs.


For more information about these tags and the motivation behind 
them, please

refer to the following linux kernel discussions:

    https://lore.kernel.org/bpf/20210914223004.244411-1-...@fb.com/
    https://lore.kernel.org/bpf/20211012164838.3345699-1-...@fb.com/
    https://lore.kernel.org/bpf/2022012604.1504583-1-...@fb.com/


Implementation Overview
===

To enable these annotations, two new C language attributes are added:
__attribute__((debug_annotate_decl("foo"))) and
__attribute__((debug_annotate_type("bar"))). Both attributes accept 
a single
arbitrary string constant argument, which will be recorded in the 
generated
DWARF and/or BTF debug information. They have no effect on code 
generation.


Note that we are not using the same attribute names as LLVM 
(btf_decl_tag and
btf_type_tag, respectively). While these attributes are functionally 
very
similar, they have grown beyond purely BTF-specific uses, so 
inclusion of "btf"

in the attribute name seems misleading.

DWARF support is enabled via a new DW_TAG_GNU_annotation. When 
generating DWARF,
declarations and types 

[PATCH] c++: Quash -Wdangling-reference for member operator* [PR107488]

2022-11-01 Thread Marek Polacek via Gcc-patches
-Wdangling-reference complains here:

  std::vector<int> v = ...;
  std::vector<int>::const_iterator it = v.begin();
  while (it != v.end()) {
    const int &r = *it++; // warning
  }

because it sees a call to
__gnu_cxx::__normal_iterator<const int *, std::vector<int> >::operator*
which returns a reference and its argument is a TARGET_EXPR representing
the result of
__gnu_cxx::__normal_iterator<const int *, std::vector<int> >::operator++
But 'r' above refers to one of the int elements of the vector 'v', not
to a temporary object.  Therefore the warning is a false positive.

I suppose code like the above is relatively common (the warning broke
cppunit-1.15.1 and a few other projects), so presumably it makes sense
to suppress the warning when it comes to member operator*.  In this case
it's defined as

  reference
  operator*() const _GLIBCXX_NOEXCEPT
  { return *_M_current; }

and I'm guessing a lot of member operator* are like that, at least when
it comes to iterators.  I've looked at _Fwd_list_iterator,
_Fwd_list_const_iterator, __shared_ptr_access, _Deque_iterator,
istream_iterator, etc, and they're all like that, so adding #pragmas
would be quite tedious.  :/

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

PR c++/107488

gcc/cp/ChangeLog:

* call.cc (do_warn_dangling_reference): Quash -Wdangling-reference
for member operator*.

gcc/testsuite/ChangeLog:

* g++.dg/warn/Wdangling-reference5.C: New test.
---
 gcc/cp/call.cc| 12 +-
 .../g++.dg/warn/Wdangling-reference5.C| 22 +++
 2 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/warn/Wdangling-reference5.C

diff --git a/gcc/cp/call.cc b/gcc/cp/call.cc
index c7c7a122045..2c0fa37f53a 100644
--- a/gcc/cp/call.cc
+++ b/gcc/cp/call.cc
@@ -13467,7 +13467,17 @@ do_warn_dangling_reference (tree expr)
   can be e.g.
 const int& z = std::min({1, 2, 3, 4, 5, 6, 7});
   which doesn't dangle: std::min here returns an int.  */
-   || !TYPE_REF_OBJ_P (TREE_TYPE (TREE_TYPE (fndecl
+   || !TYPE_REF_OBJ_P (TREE_TYPE (TREE_TYPE (fndecl)))
+   /* Don't emit a false positive for:
+   std::vector<int> v = ...;
+   std::vector<int>::const_iterator it = v.begin();
+   const int &r = *it++;
+  because R refers to one of the int elements of V, not to
+  a temporary object.  Member operator* may return a reference
+  but probably not to one of its arguments.  */
+   || (DECL_NONSTATIC_MEMBER_FUNCTION_P (fndecl)
+   && DECL_OVERLOADED_OPERATOR_P (fndecl)
+   && DECL_OVERLOADED_OPERATOR_IS (fndecl, INDIRECT_REF)))
  return NULL_TREE;
 
/* Here we're looking to see if any of the arguments is a temporary
diff --git a/gcc/testsuite/g++.dg/warn/Wdangling-reference5.C 
b/gcc/testsuite/g++.dg/warn/Wdangling-reference5.C
new file mode 100644
index 000..59b5538aee5
--- /dev/null
+++ b/gcc/testsuite/g++.dg/warn/Wdangling-reference5.C
@@ -0,0 +1,22 @@
+// PR c++/107488
+// { dg-do compile }
+// { dg-options "-Wdangling-reference" }
+
+#include <vector>
+
+int
+do_sum (std::vector<int>& v)
+{
+  int sum = 0;
+
+  std::vector<int>::const_iterator it = v.begin();
+  while (it != v.end())
+{
+  // R refers to one of the int elements of V, not to a temporary
+  // object, so no dangling reference here.
+  const int &r = *it++; // { dg-bogus "dangling reference" }
+  sum += r;
+}
+
+  return sum;
+}

base-commit: 2b0e81d5cc2f7e1d773f6c502bd65b097f392675
-- 
2.38.1



Re: [PATCH v2 06/11] OpenMP: lvalue parsing for map clauses (C++)

2022-11-01 Thread Julian Brown
Hi,

On Tue, 24 May 2022 16:15:31 +0200
Jakub Jelinek via Fortran  wrote:

> On Fri, Mar 18, 2022 at 09:26:47AM -0700, Julian Brown wrote:
> > --- a/gcc/cp/parser.cc
> > +++ b/gcc/cp/parser.cc
> > @@ -4266,6 +4266,9 @@ cp_parser_new (cp_lexer *lexer)
> >parser->omp_declare_simd = NULL;
> >parser->oacc_routine = NULL;
> >  
> > +  /* Allow array slice in expression.  */  
> 
> Better /* Disallow OpenMP array sections in expressions.  */

Fixed.

> > +  parser->omp_array_section_p = false;
> > +
> >/* Not declaring an implicit function template.  */
> >parser->auto_is_implicit_function_template_parm_p = false;
> >parser->fully_implicit_function_template_p = false;  
> 
> I think we should figure out when we should temporarily disable
>   parser->omp_array_section_p = false;
> and restore it afterwards to a saved value.  E.g.
> cp_parser_lambda_expression seems like a good candidate, the fact that
> OpenMP array sections are allowed say in map clause doesn't mean they
> are allowed inside of lambdas and it would be especially hard when
> the lambda is defining a separate function and the search for
> OMP_ARRAY_SECTION probably wouldn't be able to discover those.
> Other spots to consider might be statement expressions, perhaps type
> definitions etc.

I've had a go at doing this -- several expression types now forbid
array-section syntax (see new "bad-array-section-*" tests added). I'm
afraid my C++ isn't quite up to figuring out how it's possible to
define a type inside an expression (inside a map clause) if we forbid
lambdas and statement expressions though -- can you give an example?

> > @@ -8021,6 +8024,7 @@ cp_parser_postfix_open_square_expression
> > (cp_parser *parser, releasing_vec expression_list = NULL;
> >location_t loc = cp_lexer_peek_token (parser->lexer)->location;
> >bool saved_greater_than_is_operator_p;
> > +  bool saved_colon_corrects_to_scope_p;
> >  
> >/* Consume the `[' token.  */
> >cp_lexer_consume_token (parser->lexer);
> > @@ -8028,6 +8032,9 @@ cp_parser_postfix_open_square_expression
> > (cp_parser *parser, saved_greater_than_is_operator_p =
> > parser->greater_than_is_operator_p;
> > parser->greater_than_is_operator_p = true; 
> > +  saved_colon_corrects_to_scope_p =
> > parser->colon_corrects_to_scope_p;
> > +  parser->colon_corrects_to_scope_p = false;  
> 
> I think the last above line should be guarded on
>   if (parser->omp_array_section_p)
> There is no reason to get worse diagnostics in non-OpenMP code or
> even in OpenMP code where array sections aren't allowed.

Fixed.

> > +
> > +  /* NOTE: We are reusing using the type of the whole array as
> > the type of
> > +the array section here, which isn't necessarily entirely
> > correct.
> > +Might need revisiting.  */  
> 
> "reusing using" looks weird.
> As for the type of OMP_ARRAY_SECTION trees, perhaps we could
> initially use an incomplete array (so array element would be
> meaningful) and when we figure out the details and the array section
> is contiguous change its type to array type covering it.

This version of the patch makes a best-effort attempt to create an
exact-sized array type at parse time, else falls back to an incomplete
array type if there are e.g. variable bounds. The type is essentially
only used for diagnostics anyway, I think, so that should hopefully be
good enough.

> > +  return build3_loc (input_location, OMP_ARRAY_SECTION,
> > +TREE_TYPE (postfix_expression),
> > +postfix_expression, index, length);
> > +}
> > +
> > +  parser->colon_corrects_to_scope_p =
> > saved_colon_corrects_to_scope_p; +
> >/* Look for the closing `]'.  */
> >cp_parser_require (parser, CPP_CLOSE_SQUARE, RT_CLOSE_SQUARE);
> >  
> > @@ -36536,7 +36570,7 @@ struct omp_dim
> >  static tree
> >  cp_parser_omp_var_list_no_open (cp_parser *parser, enum
> > omp_clause_code kind, tree list, bool *colon,
> > -   bool allow_deref = false)
> > +   bool map_lvalue = false)
> >  {
> >auto_vec dims;
> >bool array_section_p;
> > @@ -36547,12 +36581,95 @@ cp_parser_omp_var_list_no_open (cp_parser
> > *parser, enum omp_clause_code kind,
> > parser->colon_corrects_to_scope_p = false; *colon = false;
> >  }
> > +  begin_scope (sk_omp, NULL);  
> 
> Why?  Base-language-wise, clauses don't introduce a new scope
> for name-lookup.

I think this was in aid of a particular test case
(c-c++-common/gomp/map-6.c) that tests various bad usages of "always"
and "close" modifiers, together with variables called literally
"always" and "close".  Parse failures during earlier tests could make
later tests fail without the scope.  I've moved the scope-creation to
the appropriate caller.  (Is there a better way?  Discarding
newly-created symbols on error, perhaps?)

> And if it is really needed, I'd strongly prefer to either do it solely
> for the clauses that might need it, or do begin_scope before 

c++: per-scope, per-signature lambda discriminators

2022-11-01 Thread Nathan Sidwell via Gcc-patches


This implements ABI-compliant lambda discriminators.  Not only do we
have per-scope counters, but we also distinguish by lambda signature.
Only lambdas with the same signature will need non-zero
discriminators.  As the discriminator is signature-dependent, we have
to process the lambda function's declaration before we can determine
it.  For templated and generic lambdas the signature is that of the
uninstantiated lambda -- not separate for each instantiation.

With this change, gcc and clang now produce the same lambda manglings
for all these testcases.

nathan

--
Nathan Sidwell

From 2b0e81d5cc2f7e1d773f6c502bd65b097f392675 Mon Sep 17 00:00:00 2001
From: Nathan Sidwell 
Date: Mon, 31 Oct 2022 06:11:28 -0400
Subject: [PATCH] c++: per-scope, per-signature lambda discriminators

This implements ABI-compliant lambda discriminators.  Not only do we
have per-scope counters, but we also distinguish by lambda signature.
Only lambdas with the same signature will need non-zero
discriminators.  As the discriminator is signature-dependent, we have
to process the lambda function's declaration before we can determine
it.  For templated and generic lambdas the signature is that of the
uninstantiated lambda -- not separate for each instantiation.

With this change, gcc and clang now produce the same lambda manglings
for all these testcases.

	gcc/cp/
	* cp-tree.h (LAMBDA_EXPR_SCOPE_SIG_DISCRIMINATOR): New.
	(struct tree_lambda_expr): Add discriminator_sig bitfield.
	(record_lambda_scope_sig_discriminator): Declare.
	* lambda.cc (struct lambda_sig_count): New.
	(lambda_discriminator): Add signature vector.
	(start_lambda_scope): Adjust.
	(compare_lambda_template_head, compare_lambda_sig): New.
	(record_lambda_scope_sig_discriminator): New.
	* mangle.cc (write_closure_type): Use the scope-sig discriminator for
	ABI >= 18.  Emit abi mangling warning if needed.
	* module.cc (trees_out::core_vals): Stream the new discriminator.
	(trees_in::core_vals): Likewise.
	* parser.cc (cp_parser_lambda_declarator_opt): Call
	record_lambda_scope_sig_discriminator.
	* pt.cc (tsubst_lambda_expr): Likewise.
	libcc1/
	* libcp1plugin.cc (plugin_start_lambda_closure_class_type):
	Initialize the per-scope, per-signature discriminator.
	gcc/testsuite/
	* g++.dg/abi/lambda-sig1-18.C: New.
	* g++.dg/abi/lambda-sig1-18vs17.C: New.
	* g++.dg/cpp1y/lambda-mangle-1-18.C: New.
---
 gcc/cp/cp-tree.h  |   7 +-
 gcc/cp/lambda.cc  | 148 +-
 gcc/cp/mangle.cc  |   8 +-
 gcc/cp/module.cc  |   2 +
 gcc/cp/parser.cc  |   1 +
 gcc/cp/pt.cc  |   1 +
 gcc/testsuite/g++.dg/abi/lambda-sig1-18.C |  34 
 gcc/testsuite/g++.dg/abi/lambda-sig1-18vs17.C |  40 +
 .../g++.dg/cpp1y/lambda-mangle-1-18.C |  26 +++
 libcc1/libcp1plugin.cc|   1 +
 10 files changed, 265 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/abi/lambda-sig1-18.C
 create mode 100644 gcc/testsuite/g++.dg/abi/lambda-sig1-18vs17.C
 create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-mangle-1-18.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 4c0bacb91da..d13bb3d4c0e 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -1501,9 +1501,12 @@ enum cp_lambda_default_capture_mode_type {
   (((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->extra_scope)
 
 /* Lambdas in the same extra scope might need a discriminating count.
-   This is a single per-scope count.  */
+   For ABI 17, we have single per-scope count, for ABI 18, we have
+   per-scope, per-signature numbering.  */
 #define LAMBDA_EXPR_SCOPE_ONLY_DISCRIMINATOR(NODE) \
   (((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->discriminator_scope)
+#define LAMBDA_EXPR_SCOPE_SIG_DISCRIMINATOR(NODE) \
+  (((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->discriminator_sig)
 
 /* During parsing of the lambda, a vector of capture proxies which need
to be pushed once we're done processing a nested lambda.  */
@@ -1533,6 +1536,7 @@ struct GTY (()) tree_lambda_expr
   location_t locus;
   enum cp_lambda_default_capture_mode_type default_capture_mode : 2;
   unsigned discriminator_scope : 15; // Per-scope discriminator
+  unsigned discriminator_sig : 15; // Per-scope, per-signature discriminator
 };
 
 /* Non-zero if this template specialization has access violations that
@@ -7783,6 +7787,7 @@ extern void start_lambda_scope			(tree decl);
 extern void finish_lambda_scope			(void);
 extern void record_lambda_scope			(tree lambda);
 extern void record_lambda_scope_discriminator	(tree lambda);
+extern void record_lambda_scope_sig_discriminator (tree lambda, tree fn);
 extern tree start_lambda_function		(tree fn, tree lambda_expr);
 extern void finish_lambda_function		(tree body);
 extern bool regenerated_lambda_fn_p		(tree);
diff --git a/gcc/cp/lambda.cc b/gcc/cp/lambda.cc
index 

[PATCH] libstdc++: Fix ERANGE behavior for fallback FP std::from_chars

2022-11-01 Thread Patrick Palka via Gcc-patches
The fallback implementation of floating-point std::from_chars for e.g.
float80 just calls the C library's strtod family of functions.  In case
of overflow of the parsed result, the behavior of these functions is
rigidly specified:

  If the correct value overflows and default rounding is in effect, plus
  or minus HUGE_VAL, HUGE_VALF, or HUGE_VALL is returned (according to
  the return type and sign of the value), and the value of the macro
  ERANGE is stored in errno.

But in case of underflow, implementations are given more leeway:

  If the result underflows the functions return a value whose magnitude
  is no greater than the smallest normalized positive number in the
  return type; whether errno acquires the value ERANGE is
  implementation-defined.

Thus we can (and do) portably detect overflow, but we can't portably
detect underflow.  However, glibc (and presumably other high-quality C
library implementations) will reliably set errno to ERANGE in case of
underflow too, and it will also return the nearest denormal number to
the parsed result (including zero in case of true underflow).

Since we can't be perfect here, this patch takes the best effort
approach of assuming a high quality C library implementation that
allows us to distinguish between a denormal parsed result and true
underflow by inspecting the return value.

Tested on x86_64-pc-linux-gnu, does this look OK for trunk?  Dunno
if we should backport this too.  No test because we can't portably
test this IIUC.

libstdc++-v3/ChangeLog:

* src/c++17/floating_from_chars.cc (from_chars_impl): In the
ERANGE case, also check for a 0 return value before returning
result_out_of_range; otherwise assume it's a denormal
number.
---
 libstdc++-v3/src/c++17/floating_from_chars.cc | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc 
b/libstdc++-v3/src/c++17/floating_from_chars.cc
index a25ac5ce3aa..939c751f861 100644
--- a/libstdc++-v3/src/c++17/floating_from_chars.cc
+++ b/libstdc++-v3/src/c++17/floating_from_chars.cc
@@ -637,8 +637,13 @@ namespace
  {
if (__builtin_isinf(tmpval)) // overflow
  ec = errc::result_out_of_range;
-   else // underflow (LWG 3081 wants to set value = tmpval here)
+   else if (tmpval == 0) // underflow (LWG 3081 wants to set value = 
tmpval here)
  ec = errc::result_out_of_range;
+   else // denormal value
+ {
+   value = tmpval;
+   ec = errc();
+ }
  }
else if (n)
  {
-- 
2.38.1.381.gc03801e19c



Re: [PATCH] x86: Replace ne:CCC/ne:CCO with UNSPEC_CC_NE in neg patterns

2022-11-01 Thread Eric Botcazou via Gcc-patches
> Yes.  But it is all the same: neither signed overflow nor unsigned
> overflow (of an addition, say) can be described as the result of an
> RTL comparison.

I disagree, see for example the implementation of the addvdi4_sp3 pattern (for 
which we indeed use an UNSPEC) and of the uaddvdi4_sp32 pattern (for which we 
describe the overflow with a COMPARE) in the SPARC back-end.  And that's even 
simpler for an unsigned subtraction, where we do not need a special CC mode.

Sure there is a technical difficulty for unsigned negation because of the 
canonicalization rules, hence the trick used in the SPARC back-end, but 
unsigned overflow is much easier to deal with than signed overflow.

-- 
Eric Botcazou




Re: [PATCH] x86: Replace ne:CCC/ne:CCO with UNSPEC_CC_NE in neg patterns

2022-11-01 Thread Segher Boessenkool
On Fri, Oct 28, 2022 at 11:55:35PM +0200, Eric Botcazou wrote:
> > You mean in CCV?  That works yes, but only because (or if) the setter
> > and getter of the CC reg both use CCV (so never use any other flag at
> > the same time; CCV has an empty intersection with all other CC modes).
> 
> We're talking about CCC here AFAIK, i.e. the carry, not CCV.

Yes.  But it is all the same: neither signed overflow nor unsigned
overflow (of an addition, say) can be described as the result of an
RTL comparison.

The point is that all of this is put completely outside of all other
MODE_CC handling, and only works because of that.  And a small
modification to the backend, completely elsewhere, can make that house
of cards collapse.  It is much more robust to use a different relation,
not EQ, to describe this.  Something with an unspec is fine.

But what the sparc backend does does work.


Segher


Re: GCC systemtap cache variable

2022-11-01 Thread Jeff Law via Gcc-patches



On 7/5/22 06:20, David Seifert wrote:

Hi Jeff,
thanks for merging my OBJDUMP patch. Could you please also merge the
following patch, which Jakub approved on IRC? I have tried contacting
the build system maintainers, but they are all AWOL

https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591734.html

Thanks for your help!


I went ahead and pushed the sdt.h patch too.

jeff



Re: [PATCH] gcc: honour -ffile-prefix-map in ASM_MAP [PR93371]

2022-11-01 Thread Jeff Law via Gcc-patches



On 8/29/22 03:29, Rasmus Villemoes wrote:

-ffile-prefix-map is supposed to be a superset of -fmacro-prefix-map
and -fdebug-prefix-map. However, when building .S or .s files, gas is
not called with the appropriate --debug-prefix-map option when
-ffile-prefix-map is used.

While the user can specify -fdebug-prefix-map when building assembly
files via gcc, it's more ergonomic to also support -ffile-prefix-map;
especially since for .S files that could contain the __FILE__ macro,
one would then also have to specify -fmacro-prefix-map.

gcc:
PR driver/93371
* gcc.cc (ASM_MAP): Honour -ffile-prefix-map.


OK.  Sorry for the long delay.

jeff




Re: [PATCH 1/2][GCC][AArch64] Implement hint intrinsics for AArch64

2022-11-01 Thread Andrew Pinski via Gcc-patches
On Thu, Jan 10, 2019 at 11:20 AM Srinath Parvathaneni
 wrote:
>
> Hi All,
>
> This patch implements the ACLE hint intrinsics (nop, yield, wfe, wfi,
> sev and sevl), for AArch64.

Hmm, this (and the corresponding arm patch) was never reviewed.
It might be useful to get an updated version which could be reviewed
and merged in for GCC 13.

Thanks,
Andrew

>
> The instructions are documented in the ArmARM[1] and the intrinsics
> specification will be
> published on the Arm website [2].
>
> [1]
> https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile
> [2]
> http://infocenter.arm.com/help/topic/com.arm.doc.ihi0053c/IHI0053C_acle_2_0.pdf
>
> Bootstrapped on aarch64-none-linux-gnu and regression tested on
> aarch64-none-elf with no regressions.
>
> Ok for trunk? If ok, could someone commit the patch on my behalf, I
> don't have commit rights.
>
> Thanks,
> Srinath
>
> gcc/ChangeLog:
>
> 2019-01-10  Srinath Parvathaneni  
>
> * config/aarch64/aarch64.md (yield): New pattern name.
> (wfe): Likewise.
> (wfi): Likewise.
> (sev): Likewise.
> (sevl): Likewise.
> (UNSPECV_YIELD): New volatile unspec.
> (UNSPECV_WFE): Likewise.
> (UNSPECV_WFI): Likewise.
> (UNSPECV_SEV): Likewise.
> (UNSPECV_SEVL): Likewise.
> * config/aarch64/aarch64-builtins.c (aarch64_builtins):
> AARCH64_SYSHINTOP_BUILTIN_NOP: New builtin.
> AARCH64_SYSHINTOP_BUILTIN_YIELD: Likewise.
> AARCH64_SYSHINTOP_BUILTIN_WFE: Likewise.
> AARCH64_SYSHINTOP_BUILTIN_WFI: Likewise.
> AARCH64_SYSHINTOP_BUILTIN_SEV: Likewise.
> AARCH64_SYSHINTOP_BUILTIN_SEVL: Likewise.
> (aarch64_init_syshintop_builtins): New function.
> (aarch64_init_builtins): New call statement.
> (aarch64_expand_builtin): New case.
> * config/aarch64/arm_acle.h (__nop ): New inline function.
> (__yield): Likewise.
> (__sev): Likewise.
> (__sevl): Likewise.
> (__wfi): Likewise.
> (__wfe): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 2019-01-10  Srinath Parvathaneni  
>
> * gcc.target/aarch64/acle/hint-1.c: New test.
> * gcc.target/aarch64/acle/hint-2.c: Likewise.
>
>
>
>
>
>


Re: [PATCH 1/2] gcc/file-prefix-map: Allow remapping of relative paths

2022-11-01 Thread Jeff Law via Gcc-patches



On 8/17/22 06:15, Richard Purdie via Gcc-patches wrote:

Relative paths currently aren't remapped by -ffile-prefix-map and friends.
When cross compiling with separate 'source' and 'build' directories, the same
relative paths between directories may not be available on target as compared
to build time.

In order to be able to remap these relative build paths to paths that would
work on target, resolve paths within the file-prefix-map function using
realpath().


Understood.




This does cause a change of behaviour if users were previously relying upon
symlinks or absolute paths not being resolved.


I'm not too worried about this scenario.




Use basename to ensure plain filenames don't have paths added.

gcc/ChangeLog:

 * file-prefix-map.cc (remap_filename): Allow remapping of relative paths


Basically OK.  Just formatting nit:





Signed-off-by: Richard Purdie 
---
  gcc/file-prefix-map.cc | 15 ---
  1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/gcc/file-prefix-map.cc b/gcc/file-prefix-map.cc
index 24733f831d6..50d5d724a8f 100644
--- a/gcc/file-prefix-map.cc
+++ b/gcc/file-prefix-map.cc
@@ -70,19 +70,28 @@ remap_filename (file_prefix_map *maps, const char *filename)
file_prefix_map *map;
char *s;
const char *name;
+  char *realname;
size_t name_len;
  
+  if (lbasename (filename) == filename)

+return filename;
+
+  realname = lrealpath (filename);
+
for (map = maps; map; map = map->next)
-if (filename_ncmp (filename, map->old_prefix, map->old_len) == 0)
+if (filename_ncmp (realname, map->old_prefix, map->old_len) == 0)
break;
-  if (!map)
+  if (!map) {
+free (realname);
  return filename;
-  name = filename + map->old_len;
+  }



Put the the curley braces go on their own lines, indented two 
positions.  The code inside the curleys is indented two more 
positions.   I fixed that and pushed this change to the trunk.



Thanks,

jeff




Re: [PATCH 2/2] libcpp: Avoid remapping filenames within directives

2022-11-01 Thread Jeff Law via Gcc-patches



On 8/17/22 06:15, Richard Purdie via Gcc-patches wrote:

Code such as:

#include __FILE__


can interact poorly with file-prefix-map options when cross compiling. In
general you want to remap filenames for use in a target context, but the
local paths should be used to find include files at compile time. Ignoring
filename remapping for directives is one way to avoid such failures.

libcpp/ChangeLog:

 * macro.cc (_cpp_builtin_macro_text): Don't remap filenames within 
directives


So I went back and reviewed the old PR which introduced this code.  It 
was actually the Yocto project that got this code in to begin with :-)  
There wasn't really any discussion AFAICT about whether or not to remap 
in directives that I saw in the PR.



ISTM that given the change in behavior, we should probably document that 
we don't remap in directives.  Probably doc/invoke.texi.


With suitable documentation, this should be fine.  It seems like it 
ought to be independent of the first patch in this series which adds 
support for remapping relative paths.



jeff



Re: [PATCH 1/2] ivopts: Revert computation of address cost complexity.

2022-11-01 Thread Jeff Law via Gcc-patches



On 10/28/22 01:00, Richard Biener wrote:

On Fri, Oct 28, 2022 at 8:43 AM Dimitrije Milosevic
 wrote:

Hi Jeff,


THe part I don't understand is, if you only have BASE+OFF, why does
preventing the calculation of more complex addressing modes matter?  ie,
what's the point of computing the cost of something like base + off +
scaled index when the target can't utilize it?

Well, the complexities of all addressing modes other than BASE + OFFSET are
equal to 0. For targets like Mips, which only has BASE + OFFSET, it would still
be more complex to use a candidate with BASE + INDEX << SCALE + OFFSET
than a candidate with BASE + INDEX, for example, as it has to compensate
the lack of other addressing modes somehow. If complexities for both of
those are equal to 0, in cases where complexities decide which candidate is
to be chosen, a more complex candidate may be picked.

But something is wrong then - it shouldn't ever pick a candidate with
an addressing
mode that isn't supported?  So you say that the cost of expressing
'BASE + INDEX << SCALE + OFFSET' as 'BASE + OFFSET' is not computed
accurately?


This is exactly what I was trying to get to.   If the addressing mode 
isn't supported, then we shouldn't be picking it as a candidate.  If it 
is, then we've probably got a problem somewhere else in this code and 
this patch is likely papering over it.



Jeff



Re: Re: [PATCH] RISC-V: Fix RVV testcases.

2022-11-01 Thread Palmer Dabbelt

On Mon, 31 Oct 2022 16:52:25 PDT (-0700), juzhe.zh...@rivai.ai wrote:

These cases actually don't care about -mabi, they just need 'v' in -march.
Can you tell me how to fix these testcases for "fails on targets without 
ilp32d" ?
These failures are bogus failures since if you specify -mabi=ilp32d when you are using 
a GNU toolchain which is built with "--arch=ilp32", let's say.
It will fail and report there is no "ilp32d". So I fixed these testcases by replacing "ilp32d" 
with "ilp32".


So the problem is this just moves the failures around, rather than 
failing on toolchains that lack ilp32d support it'll fail on toolchains 
that lack ilp32 support.  The ABI naming scheme sort of makes them look 
like extensions, but they're just incompatible with each other.


I can see a handful of ways to fix this:

* Add some sort of automatic ABI scheme to GCC.  LLVM already does this 
 and there was a GCC patch for it that had some issues, but IMO having 
 something like -mabi=auto-{min,max} would be useful as users keep 
 running into this problem.  We could also add something to DejaGNU 
 that does this.
* Add some sort of -march=+v to GCC, along the lines of the .option 
 arch,+v stuff in assembly but from the command line.  I seem to 
 remember proposals for that floating around somewhere, but can't find 
 anything.  This could probably also to DejaGNU.
* Decorate all these V functions with the +arch attributes.  That 
 wouldn't require any compiler changes, but it's kind of clunky.
* Add some sort of test suite logic (maybe in DejaGNU?) to check and see 
 if the desired ABI is linkable before attempting to do so.  That might 
 be generically useful.



Thank you.



juzhe.zh...@rivai.ai
 
From: Palmer Dabbelt

Date: 2022-11-01 06:30
To: gcc-patches
CC: juzhe.zhong; gcc-patches; schwab; Kito Cheng
Subject: Re: [PATCH] RISC-V: Fix RVV testcases.
On Mon, 31 Oct 2022 15:00:49 PDT (-0700), gcc-patches@gcc.gnu.org wrote:


On 10/30/22 19:40, juzhe.zh...@rivai.ai wrote:

From: Ju-Zhe Zhong 

gcc/testsuite/ChangeLog:

 * gcc.target/riscv/rvv/base/abi-2.c: Change ilp32d to ilp32.
 * gcc.target/riscv/rvv/base/abi-3.c: Ditto.
 * gcc.target/riscv/rvv/base/abi-4.c: Ditto.
 * gcc.target/riscv/rvv/base/abi-5.c: Ditto.
 * gcc.target/riscv/rvv/base/abi-6.c: Ditto.
 * gcc.target/riscv/rvv/base/abi-7.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-1.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-10.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-11.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-12.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-13.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-2.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-3.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-4.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-5.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-6.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-7.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-8.c: Ditto.
 * gcc.target/riscv/rvv/base/mov-9.c: Ditto.
 * gcc.target/riscv/rvv/base/pragma-1.c: Ditto.
 * gcc.target/riscv/rvv/base/user-1.c: Ditto.
 * gcc.target/riscv/rvv/base/user-2.c: Ditto.
 * gcc.target/riscv/rvv/base/user-3.c: Ditto.
 * gcc.target/riscv/rvv/base/user-4.c: Ditto.
 * gcc.target/riscv/rvv/base/user-5.c: Ditto.
 * gcc.target/riscv/rvv/base/user-6.c: Ditto.
 * gcc.target/riscv/rvv/base/vsetvl-1.c: Ditto.


I'm pretty new to the RISC-V world, but don't some of the cases
(particularly the abi-* tests) verify that the ABI specification does
not override the arch specification WRT availability of types?
 
I think that depends on what the ABI specification says here, as it 
could really go many ways.  Most of the RISC-V targets just use -mabi to 
control how arguments end up passed in functions, not the availability 
of types.  I can't find the ABI spec for these, though, so I'm not 
entirely sure how they're supposed to work...
 
That said, I'm not sure why we need any of these -mabi changes?  Just 
from spot checking some of the examples it doesn't look like there 
should be any functional difference between ilp32 and ilp32d here: 
-march is always specified so ilp32d looks valid.  If this is just to 
fix the "fails on targets without ilp32d" [1], then IMO it's not really 
a fix: we're essentially just changing that to "fails on targets without 
ilp32", we either need some sort of automatic march/mabi setting or a 
dependency on the availiable multilibs.  Some of these can probably 
avoid linking, but we'll have execution tests at some point.
 
1: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/604644.html
 


[COMMITTED] [PR tree-optimization/107490] Handle NANs in op[12]_range.

2022-11-01 Thread Aldy Hernandez via Gcc-patches
None of the build_ functions in range-op handle NANs.  This is by
design in order to force us to handle NANs specially, because
"x relop NAN" makes no sense.  This patch fixes a handful of
op[12]_range entries that weren't handling NANs.

PR tree-optimization/107490

gcc/ChangeLog:

* range-op-float.cc (foperator_unordered_lt::op1_range): Handle
NANs.
(foperator_unordered_lt::op2_range): Same.
(foperator_unordered_le::op1_range): Same.
(foperator_unordered_le::op2_range): Same.
(foperator_unordered_gt::op1_range): Same.
(foperator_unordered_gt::op2_range): Same.
(foperator_unordered_ge::op1_range): Same.
(foperator_unordered_ge::op2_range): Same.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/pr107490.c: New test.
---
 gcc/range-op-float.cc| 40 +++-
 gcc/testsuite/gcc.dg/tree-ssa/pr107490.c | 28 +
 2 files changed, 60 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr107490.c

diff --git a/gcc/range-op-float.cc b/gcc/range-op-float.cc
index 04208c88dd1..a1f372997bf 100644
--- a/gcc/range-op-float.cc
+++ b/gcc/range-op-float.cc
@@ -1332,7 +1332,10 @@ foperator_unordered_lt::op1_range (frange , tree type,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_lt (r, type, op2);
+  if (op2.known_isnan ())
+   r.set_varying (type);
+  else
+   build_lt (r, type, op2);
   break;
 
 case BRS_FALSE:
@@ -1359,7 +1362,10 @@ foperator_unordered_lt::op2_range (frange , tree type,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_gt (r, type, op1);
+  if (op1.known_isnan ())
+   r.set_varying (type);
+  else
+   build_gt (r, type, op1);
   break;
 
 case BRS_FALSE:
@@ -1420,7 +1426,10 @@ foperator_unordered_le::op1_range (frange &r, tree type,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_le (r, type, op2);
+  if (op2.known_isnan ())
+   r.set_varying (type);
+  else
+   build_le (r, type, op2);
   break;
 
 case BRS_FALSE:
@@ -1448,7 +1457,10 @@ foperator_unordered_le::op2_range (frange &r,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_ge (r, type, op1);
+  if (op1.known_isnan ())
+   r.set_varying (type);
+  else
+   build_ge (r, type, op1);
   break;
 
 case BRS_FALSE:
@@ -1511,7 +1523,10 @@ foperator_unordered_gt::op1_range (frange &r,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_gt (r, type, op2);
+  if (op2.known_isnan ())
+   r.set_varying (type);
+  else
+   build_gt (r, type, op2);
   break;
 
 case BRS_FALSE:
@@ -1539,7 +1554,10 @@ foperator_unordered_gt::op2_range (frange &r,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_lt (r, type, op1);
+  if (op1.known_isnan ())
+   r.set_varying (type);
+  else
+   build_lt (r, type, op1);
   break;
 
 case BRS_FALSE:
@@ -1602,7 +1620,10 @@ foperator_unordered_ge::op1_range (frange &r,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_ge (r, type, op2);
+  if (op2.known_isnan ())
+   r.set_varying (type);
+  else
+   build_ge (r, type, op2);
   break;
 
 case BRS_FALSE:
@@ -1629,7 +1650,10 @@ foperator_unordered_ge::op2_range (frange &r, tree type,
   switch (get_bool_state (r, lhs, type))
 {
 case BRS_TRUE:
-  build_le (r, type, op1);
+  if (op1.known_isnan ())
+   r.set_varying (type);
+  else
+   build_le (r, type, op1);
   break;
 
 case BRS_FALSE:
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr107490.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr107490.c
new file mode 100644
index 000..87c7f0aacdd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr107490.c
@@ -0,0 +1,28 @@
+// { dg-do compile }
+// { dg-options "-Os -fno-trapping-math -w" }
+
+extern void abort (void);
+
+#define MIN2(a,b) (((a)<(b)) ? (a) : (b))
+#define MAX2(a,b) (((a)>(b)) ? (a) : (b))
+
+double p[2] = { 4.f, 5.f };
+
+int main()
+{
+  long j;
+  double R, n, x;
+  n = 1.e300f;
+  x = -1.e300f;
+  for( j=0; j < 2; j++ )
+{
+  x = MAX2(x,p[j]);
+  n = MIN2(n,p[j]);
+}
+  R = x-n;
+
+  if( R < 0.1 )
+  abort ();
+
+  return 0;
+}
-- 
2.38.1



[COMMITTED] PR tree-optimization/107497 - Make sure ssa-name is valid.

2022-11-01 Thread Andrew MacLeod via Gcc-patches

pushed as obvious.   Testing running.

Andrew

commit 82b0345f6137b112728590d7c010dcd2cef08514
Author: Andrew MacLeod 
Date:   Tue Nov 1 13:18:33 2022 -0400

Make sure ssa-name is valid.

PR tree-optimization/107497
* tree-vrp.cc (remove_unreachable::remove_and_update_globals):
Check that ssa-name still exists before accessing it.

diff --git a/gcc/tree-vrp.cc b/gcc/tree-vrp.cc
index f0e4d37bef0..39f7eb7a75e 100644
--- a/gcc/tree-vrp.cc
+++ b/gcc/tree-vrp.cc
@@ -180,7 +180,7 @@ remove_unreachable::remove_and_update_globals (bool final_p)
   bitmap_copy (dce, all_exports);
   // Don't attempt to DCE parameters.
   EXECUTE_IF_SET_IN_BITMAP (all_exports, 0, i, bi)
-if (SSA_NAME_IS_DEFAULT_DEF (ssa_name (i)))
+if (!ssa_name (i) || SSA_NAME_IS_DEFAULT_DEF (ssa_name (i)))
   bitmap_clear_bit (dce, i);
   simple_dce_from_worklist (dce);
 


[PATCH v2] RISC-V modified add&lt;mode&gt;3 for large stack frame optimization [PR105733]

2022-11-01 Thread Kevin Lee
This is the updated patch of
https://gcc.gnu.org/pipermail/gcc-patches/2022-September/601824.html. Since
the riscv-selftest.cc has been added, this version of the patch adds the
logic in riscv-selftest.cc to also consider parallel insns.
  The patch has been tested with rv64imafdc / rv64imac / rv32imafdc /
rv32imac and no additional failures were detected in the testsuite.

gcc/ChangeLog:
Jim Wilson 
Michael Collison 
Kevin Lee 
* config/riscv/predicates.md (const_lui_operand): New Predicate.
(add_operand): Ditto.
(reg_or_const_int_operand): Ditto.
* config/riscv/riscv-protos.h (riscv_eliminable_reg): New
function.
* config/riscv/riscv-selftests.cc (calculate_x_in_sequence):
Consider Parallel insns.
* config/riscv/riscv.cc (riscv_eliminable_reg): New function.
(riscv_adjust_libcall_cfi_prologue): Use gen_rtx_SET and
gen_rtx_fmt_ee instead of gen_add3_insn.
(riscv_adjust_libcall_cfi_epilogue): Ditto.
* config/riscv/riscv.md (addsi3): Remove.
(add&lt;mode&gt;3): New instruction for large stack frame
optimization.
(add&lt;mode&gt;3_internal): Ditto.
(adddi3): Remove.
(add&lt;mode&gt;3_internal2): New instruction for insns generated in
the prologue and epilogue pass.
---
gcc/config/riscv/predicates.md | 13 +
gcc/config/riscv/riscv-protos.h | 1 +
gcc/config/riscv/riscv-selftests.cc | 3 ++
gcc/config/riscv/riscv.cc | 20 +--
gcc/config/riscv/riscv.md | 84 -
5 files changed, 104 insertions(+), 17 deletions(-)

diff --git a/gcc/config/riscv/predicates.md b/gcc/config/riscv/predicates.md
index c2ff41bb0fd..3149f7227ac 100644
--- a/gcc/config/riscv/predicates.md
+++ b/gcc/config/riscv/predicates.md
@@ -35,6 +35,14 @@
(ior (match_operand 0 "arith_operand")
(match_operand 0 "lui_operand")))
+(define_predicate "const_lui_operand"
+ (and (match_code "const_int")
+ (match_test "(INTVAL (op) & 0xFFF) == 0 && INTVAL (op) != 0")))
+
+(define_predicate "add_operand"
+ (ior (match_operand 0 "arith_operand")
+ (match_operand 0 "const_lui_operand")))
+
(define_predicate "const_csr_operand"
(and (match_code "const_int")
(match_test "IN_RANGE (INTVAL (op), 0, 31)")))
@@ -59,6 +67,11 @@
(ior (match_operand 0 "const_0_operand")
(match_operand 0 "register_operand")))
+;; For use in adds, when adding to an eliminable register.
+(define_predicate "reg_or_const_int_operand"
+ (ior (match_code "const_int")
+ (match_operand 0 "register_operand")))
+
;; Only use branch-on-bit sequences when the mask is not an ANDI immediate.
(define_predicate "branch_on_bit_operand"
(and (match_code "const_int")
diff --git a/gcc/config/riscv/riscv-protos.h
b/gcc/config/riscv/riscv-protos.h
index 5a718bb62b4..9348ac71956 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -63,6 +63,7 @@ extern void riscv_expand_conditional_move (rtx, rtx, rtx,
rtx_code, rtx, rtx);
extern rtx riscv_legitimize_call_address (rtx);
extern void riscv_set_return_address (rtx, rtx);
extern bool riscv_expand_block_move (rtx, rtx, rtx);
+extern bool riscv_eliminable_reg (rtx);
extern rtx riscv_return_addr (int, rtx);
extern poly_int64 riscv_initial_elimination_offset (int, int);
extern void riscv_expand_prologue (void);
diff --git a/gcc/config/riscv/riscv-selftests.cc
b/gcc/config/riscv/riscv-selftests.cc
index 636874ebc0f..50457db708e 100644
--- a/gcc/config/riscv/riscv-selftests.cc
+++ b/gcc/config/riscv/riscv-selftests.cc
@@ -116,6 +116,9 @@ calculate_x_in_sequence (rtx reg)
rtx pat = PATTERN (insn);
rtx dest = SET_DEST (pat);
+ if (GET_CODE (pat) == PARALLEL)
+ dest = SET_DEST (XVECEXP (pat, 0, 0));
+
if (GET_CODE (pat) == CLOBBER)
continue;
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 32f9ef9ade9..de9344b37a3 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -4686,6 +4686,16 @@ riscv_initial_elimination_offset (int from, int to)
return src - dest;
}
+/* Return true if X is a register that will be eliminated later on. */
+bool
+riscv_eliminable_reg (rtx x)
+{
+ return REG_P (x) && (REGNO (x) == FRAME_POINTER_REGNUM
+ || REGNO (x) == ARG_POINTER_REGNUM
+ || (REGNO (x) >= FIRST_VIRTUAL_REGISTER
+ && REGNO (x) <= LAST_VIRTUAL_REGISTER));
+}
+
/* Implement RETURN_ADDR_RTX. We do not support moving back to a
previous frame. */
@@ -4887,8 +4897,9 @@ riscv_adjust_libcall_cfi_prologue ()
}
/* Debug info for adjust sp. */
- adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
- stack_pointer_rtx, GEN_INT (-saved_size));
+ adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
+ gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
+ stack_pointer_rtx, GEN_INT (saved_size)));
dwarf = alloc_reg_note (REG_CFA_ADJUST_CFA, adjust_sp_rtx,
dwarf);
return dwarf;
@@ -4990,8 +5001,9 @@ riscv_adjust_libcall_cfi_epilogue ()
int saved_size = cfun->machine->frame.save_libcall_adjustment;
/* Debug info for adjust sp. */
- adjust_sp_rtx = gen_add3_insn (stack_pointer_rtx,
- stack_pointer_rtx, GEN_INT (saved_size));
+ adjust_sp_rtx = gen_rtx_SET (stack_pointer_rtx,
+ gen_rtx_fmt_ee (PLUS, GET_MODE (stack_pointer_rtx),
+ 

[PATCH] c++: Disable -Wignored-qualifiers for template args [PR107492]

2022-11-01 Thread Marek Polacek via Gcc-patches
It seems wrong to issue a -Wignored-qualifiers warning for code like:

  static_assert(!is_same_v&lt;void(*)(), const void(*)()&gt;);

because there the qualifier matters.  Likewise in template
specialization:

  template&lt;class T&gt; struct S { };
  template&lt;&gt; struct S&lt;void(*)()&gt; { };
  template&lt;&gt; struct S&lt;const void(*)()&gt; { }; // OK, not a redefinition

I'm of the mind that we should disable the warning for template
arguments, as in the patch below.

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

PR c++/107492

gcc/cp/ChangeLog:

* parser.cc (cp_parser_template_type_arg): Suppress -Wignored-qualifiers
warning.

gcc/testsuite/ChangeLog:

* g++.dg/warn/Wignored-qualifiers3.C: New test.
---
 gcc/cp/parser.cc |  4 
 gcc/testsuite/g++.dg/warn/Wignored-qualifiers3.C | 12 
 2 files changed, 16 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/warn/Wignored-qualifiers3.C

diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index e0e3cf3eaf6..54ad4b98ed3 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -24334,6 +24334,10 @@ cp_parser_template_type_arg (cp_parser *parser)
   const char *saved_message = parser->type_definition_forbidden_message;
   parser->type_definition_forbidden_message
 = G_("types may not be defined in template arguments");
+  /* It's wrong to issue a -Wignored-qualifiers warning for
+  static_assert(!is_same_v&lt;void(*)(), const void(*)()&gt;);
+ because there the qualifier matters.  */
+  warning_sentinel w (warn_ignored_qualifiers);
   r = cp_parser_type_id_1 (parser, CP_PARSER_FLAGS_NONE, true, false, NULL);
   parser->type_definition_forbidden_message = saved_message;
   if (cxx_dialect >= cxx14 && !flag_concepts && type_uses_auto (r))
diff --git a/gcc/testsuite/g++.dg/warn/Wignored-qualifiers3.C 
b/gcc/testsuite/g++.dg/warn/Wignored-qualifiers3.C
new file mode 100644
index 000..5696feaaefe
--- /dev/null
+++ b/gcc/testsuite/g++.dg/warn/Wignored-qualifiers3.C
@@ -0,0 +1,12 @@
+// PR c++/107492
+// { dg-do compile { target c++14 } }
+// { dg-additional-options "-Wignored-qualifiers" }
+
+template&lt;class T&gt; struct S { };
+template&lt;&gt; struct S&lt;void(*)()&gt; { };
+template&lt;&gt; struct S&lt;const void(*)()&gt; { }; // { dg-bogus "ignored" }
+
+template&lt;class T, class U&gt; constexpr bool is_same_v = false;
+template&lt;class T&gt; constexpr bool is_same_v&lt;T, T&gt; = true;
+
+static_assert( ! is_same_v< void(*)(), const void(*)() >, ""); // { dg-bogus 
"ignored" }

base-commit: e7310e24b1c0ca67b1bb507c1330b2bf39e59e32
-- 
2.38.1



Re: [PATCH 1/2]middle-end: Add new tbranch optab to add support for bit-test-and-branch operations

2022-11-01 Thread Jeff Law via Gcc-patches



On 11/1/22 09:53, Tamar Christina wrote:



   from the machine description.

+@cindex @code{tbranch@var{mode}4} instruction pattern @item
+@samp{tbranch@var{mode}4} Conditional branch instruction combined
+with a bit test-and-compare instruction. Operand 0 is a comparison
+operator.  Operand 1 is the operand of the comparison. Operand 2 is
+the bit position of Operand 1 to test.
+Operand 3 is the @code{code_label} to jump to.

Should we refine/document the set of comparison operators allowed?    Is
operand 1 an arbitrary RTL expression or more limited?  I'm guessing its
relatively arbitrary given how you've massaged the existing branch-on-bit
patterns from the aarch backend.

It can be any expression in theory. However in practical terms we usually force
the values to registers before calling the expansion.  My assumption is that 
this
is for CSE purposes but that's only a guess.


Understood.  And generally yes, forcing expressions into regs is good 
for CSE.






Do we have enough information lying around from Ranger to avoid the need
to walk the def-use chain to discover that we're masking off all but one bit?


That's an interesting thought.  I'll try to see if I can figure out how to query
Ranger here.  It would be nice to do so here.


Reach out to Aldy, I suspect he can probably give you the necessary 
pseudocode pretty quickly.



Jeff




Re: [PATCH] Rename nonzero_bits to known_zero_bits.

2022-11-01 Thread Aldy Hernandez via Gcc-patches
Folks.  I have decided to put this aside until the next release.  I
originally wanted a simple rename, and reimplementing things to align
with rtl, etc, is beyond what I want to tackle on this late.

I'll archive this away, and revisit it when we implement the
irange::known_ones mask.

Thanks for your input.
Aldy

On Fri, Oct 21, 2022 at 8:01 PM Segher Boessenkool
 wrote:
>
> On Fri, Oct 21, 2022 at 06:54:32PM +0200, Jakub Jelinek wrote:
> > On Fri, Oct 21, 2022 at 06:51:19PM +0200, Jakub Jelinek wrote:
> > > Agreed.
> > >
> > > I think maybe_nonzero_bits would be fine.
> >
> > Or yet another option is to change what we track and instead of
> > having just one bitmask have 2 as tree-ssa-ccp.cc does,
> > one bitmask says which bits are known to be always the same
> > and the other which specifies the values of those bits.
> > "For X with a CONSTANT lattice value X & ~mask == value & ~mask.  The
> > zero bits in the mask cover constant values.  The ones mean no
> > information."
>
> I am still working on making the RTL nonzero_bits use DF (and indeed I
> do a known_zero instead :-) ).  This makes the special version in
> combine unnecessary: instead of working better than the generic version
> it is strictly weaker then.  This change then makes it possible to use
> nonzero_bits in instruction conditions (without causing ICEs as now --
> passes after combine return a subset of the nonzero_bits the version in
> combine does, which can make insns no longer match in later passes).
>
> My fear is tracking twice as many bits might become expensive.  OTOH
> ideally we can get rid of combine's reg_stat completely at some point
> in the future (which has all the same problems as combine's version of
> nonzero_bits: the values it returns depend on the order combine tried
> possible combinations).
>
> Storage requirements are the same for known_zero_bits and known_one_bits
> vs. known_bits and known_bit_values, but the latter is a bit more
> costly to compute, but more importantly it is usually a lot less
> convenient in use.  (A third option is known_bits and known_zero_bits?)
>
>
> Segher
>



[PATCH 2/2] i386: correct x87 multiplication modeling in znver.md

2022-11-01 Thread Alexander Monakov
All multiplication instructions are fully pipelined, except AVX256
instructions on Zen 1, which issue over two cycles on a 128-bit unit.
Correct the model accordingly to reduce combinatorial explosion in
automaton tables.

Top znver table sizes in insn-automata.o:

Before:

30056 r znver1_fp_min_issue_delay
120224 r znver1_fp_transitions

After:

6720 r znver1_fp_min_issue_delay
53760 r znver1_fp_transitions

gcc/ChangeLog:

PR target/87832
* config/i386/znver.md: (znver1_fp_op_mul): Correct cycles in
the reservation.
(znver1_fp_op_mul_load): Ditto.
(znver1_mmx_mul): Ditto.
(znver1_mmx_load): Ditto.
(znver1_ssemul_ss_ps): Ditto.
(znver1_ssemul_ss_ps_load): Ditto.
(znver1_ssemul_avx256_ps): Ditto.
(znver1_ssemul_avx256_ps_load): Ditto.
(znver1_ssemul_sd_pd): Ditto.
(znver1_ssemul_sd_pd_load): Ditto.
(znver2_ssemul_sd_pd): Ditto.
(znver2_ssemul_sd_pd_load): Ditto.
(znver1_ssemul_avx256_pd): Ditto.
(znver1_ssemul_avx256_pd_load): Ditto.
(znver1_sseimul): Ditto.
(znver1_sseimul_avx256): Ditto.
(znver1_sseimul_load): Ditto.
(znver1_sseimul_avx256_load): Ditto.
(znver1_sseimul_di): Ditto.
(znver1_sseimul_load_di): Ditto.
---
 gcc/config/i386/znver.md | 40 
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/gcc/config/i386/znver.md b/gcc/config/i386/znver.md
index c52f8b532..882f250f1 100644
--- a/gcc/config/i386/znver.md
+++ b/gcc/config/i386/znver.md
@@ -573,13 +573,13 @@ (define_insn_reservation "znver1_fp_op_mul" 5
 (and (eq_attr "cpu" "znver1,znver2,znver3")
  (and (eq_attr "type" "fop,fmul")
   (eq_attr "memory" "none")))
-"znver1-direct,znver1-fp0*5")
+"znver1-direct,znver1-fp0")
 
 (define_insn_reservation "znver1_fp_op_mul_load" 12 
 (and (eq_attr "cpu" "znver1,znver2,znver3")
  (and (eq_attr "type" "fop,fmul")
   (eq_attr "memory" "load")))
-"znver1-direct,znver1-load,znver1-fp0*5")
+"znver1-direct,znver1-load,znver1-fp0")
 
 (define_insn_reservation "znver1_fp_op_imul_load" 16
 (and (eq_attr "cpu" "znver1,znver2,znver3")
@@ -684,13 +684,13 @@ (define_insn_reservation "znver1_mmx_mul" 3
 (and (eq_attr "cpu" "znver1,znver2,znver3")
  (and (eq_attr "type" "mmxmul")
   (eq_attr "memory" "none")))
- "znver1-direct,znver1-fp0*3")
+ "znver1-direct,znver1-fp0")
 
 (define_insn_reservation "znver1_mmx_load" 10
 (and (eq_attr "cpu" "znver1,znver2,znver3")
  (and (eq_attr "type" "mmxmul")
   (eq_attr "memory" "load")))
-"znver1-direct,znver1-load,znver1-fp0*3")
+"znver1-direct,znver1-load,znver1-fp0")
 
 ;; TODO
 (define_insn_reservation "znver1_avx256_log" 1
@@ -1161,7 +1161,7 @@ (define_insn_reservation "znver1_ssemul_ss_ps" 3
  (eq_attr "mode" 
"V8SF,V4SF,SF,V4DF,V2DF,DF")))
  (and (eq_attr "type" "ssemul")
   (eq_attr "memory" "none")))
-"znver1-direct,(znver1-fp0|znver1-fp1)*3")
+"znver1-direct,znver1-fp0|znver1-fp1")
 
 (define_insn_reservation "znver1_ssemul_ss_ps_load" 10 
 (and (ior (and (eq_attr "cpu" "znver1")
@@ -1172,47 +1172,47 @@ (define_insn_reservation "znver1_ssemul_ss_ps_load" 10
  (eq_attr "mode" "V8SF,V4SF,SF")))
  (and (eq_attr "type" "ssemul")
   (eq_attr "memory" "load")))
-"znver1-direct,znver1-load,(znver1-fp0|znver1-fp1)*3")
+"znver1-direct,znver1-load,znver1-fp0|znver1-fp1")
 
 (define_insn_reservation "znver1_ssemul_avx256_ps" 3
 (and (eq_attr "cpu" "znver1")
  (and (eq_attr "mode" "V8SF")
   (and (eq_attr "type" "ssemul")
(eq_attr "memory" "none"
-"znver1-double,(znver1-fp0|znver1-fp1)*3")
+"znver1-double,znver1-fp0*2|znver1-fp1*2")
 
 (define_insn_reservation "znver1_ssemul_avx256_ps_load" 10
 (and (eq_attr "cpu" "znver1")
  (and (eq_attr "mode" "V8SF")
   (and (eq_attr "type" "ssemul")
(eq_attr 

[PATCH 1/2] i386: correct x87 division modeling in znver.md

2022-11-01 Thread Alexander Monakov
Correct modeling of division instructions in the SIMD/FP domain for
AMD Zen architectures and avoid combinatorial explosion of automaton
tables by modeling the separate floating-point division unit and
correcting reservations to reflect reciprocal throughput of the
corresponding instructions, similar to earlier commit
5cee5f94000 ("i386: correct integer division modeling in znver.md").

Division is partially pipelined and some instructions have fractional
throughput (e.g. Zen 3 can issue divss and divsd each 3.5 and 4.5
cycles on average, respectively). Considering these CPUs implement
out-of-order execution, the model doesn't need to be exact to the last
cycle, so simplify it by using 4/5 cycles for SF/DF modes, and not
modeling the fact that FP3 pipe is occupied for one cycle.

Top znver table sizes in insn-automata.o:

Before:

428108 r znver1_fp_min_issue_delay
856216 r znver1_fp_transitions

After:

30056 r znver1_fp_min_issue_delay
120224 r znver1_fp_transitions

gcc/ChangeLog:

PR target/87832
* config/i386/znver.md (znver1_fdiv): New automaton.
(znver1-fdiv): New unit.
(znver1_fp_op_div): Correct unit and cycles in the reservation.
(znver1_fp_op_div_load): Ditto.
(znver1_fp_op_idiv_load): Ditto.
(znver2_fp_op_idiv_load): Ditto.
(znver1_ssediv_ss_ps): Ditto.
(znver1_ssediv_ss_ps_load): Ditto.
(znver1_ssediv_sd_pd): Ditto.
(znver1_ssediv_sd_pd_load): Ditto.
(znver1_ssediv_avx256_ps): Ditto.
(znver1_ssediv_avx256_ps_load): Ditto.
(znver1_ssediv_avx256_pd): Ditto.
(znver1_ssediv_avx256_pd_load): Ditto.
---
 gcc/config/i386/znver.md | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/znver.md b/gcc/config/i386/znver.md
index 4aa098fd8..c52f8b532 100644
--- a/gcc/config/i386/znver.md
+++ b/gcc/config/i386/znver.md
@@ -24,7 +24,7 @@ (define_attr "znver1_decode" "direct,vector,double"
 ;; AMD znver1, znver2 and znver3 Scheduling
 ;; Modeling automatons for zen decoders, integer execution pipes,
 ;; SIMD/FP domain, AGU pipes, and dividers.
-(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv")
+(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv, 
znver1_fdiv")
 
 ;; Decoders unit has 4 decoders and all of them can decode fast path
 ;; and vector type instructions.
@@ -95,6 +95,7 @@ (define_reservation "znver2-fvector" "znver1-fp0+znver1-fp1
 
 ;; Dividers
 (define_cpu_unit "znver1-idiv" "znver1_idiv")
+(define_cpu_unit "znver1-fdiv" "znver1_fdiv")
 
 ;; Call instruction
 (define_insn_reservation "znver1_call" 1
@@ -591,27 +592,27 @@ (define_insn_reservation "znver1_fp_op_div" 15
 (and (eq_attr "cpu" "znver1,znver2,znver3")
  (and (eq_attr "type" "fdiv")
   (eq_attr "memory" "none")))
-"znver1-direct,znver1-fp3*15")
+"znver1-direct,znver1-fdiv*6")
 
 (define_insn_reservation "znver1_fp_op_div_load" 22
 (and (eq_attr "cpu" "znver1,znver2,znver3")
  (and (eq_attr "type" "fdiv")
   (eq_attr "memory" "load")))
-"znver1-direct,znver1-load,znver1-fp3*15")
+"znver1-direct,znver1-load,znver1-fdiv*6")
 
 (define_insn_reservation "znver1_fp_op_idiv_load" 27
 (and (eq_attr "cpu" "znver1")
  (and (eq_attr "type" "fdiv")
   (and (eq_attr "fp_int_src" "true")
(eq_attr "memory" "load"
-"znver1-double,znver1-load,znver1-fp3*19")
+"znver1-double,znver1-load,znver1-fdiv*6")
 
 (define_insn_reservation "znver2_fp_op_idiv_load" 26
 (and (eq_attr "cpu" "znver2,znver3")
  (and (eq_attr "type" "fdiv")
   (and (eq_attr "fp_int_src" "true")
(eq_attr "memory" "load"
-"znver1-double,znver1-load,znver1-fp3*19")
+"znver1-double,znver1-load,znver1-fdiv*6")
 
 
 ;; MMX, SSE, SSEn.n, AVX, AVX2 instructions
@@ -1088,7 +1089,7 @@ (define_insn_reservation "znver1_ssediv_ss_ps" 10
  (eq_attr "mode" "V8SF,V4SF,SF")))
  (and (eq_attr "type" "ssediv")
   (eq_attr "memory" "none")))
-"znver1-direct,znver1-fp3*10")
+"znver1-direct,znver1-fdiv*4")
 
 (define_insn_reservation "znver1_ssediv_ss_ps_load" 17
 (and (ior (and (eq_attr "cpu" "znver1")
@@ -1099,7 +1100,7 @@ (define_insn_reservation "znver1_ssediv_ss_ps_load" 17
  

[PATCH 0/2] i386: slim down insn-automata [PR 87832]

2022-11-01 Thread Alexander Monakov
Hi,

I'm sending followup fixes for combinatorial explosion of znver scheduling
automaton tables as described in the earlier thread:

https://inbox.sourceware.org/gcc-patches/23c795d6-403c-5927-e610-f0f1215f5...@ispras.ru/T/#m36e069d43d07d768d4842a779e26b4a0915cc543

I think lujiazui.md and b[dt]ver[123].md have similar issues.

Alexander Monakov (2):
  i386: correct x87 division modeling in znver.md
  i386: correct x87 multiplication modeling in znver.md

 gcc/config/i386/znver.md | 67 
 1 file changed, 34 insertions(+), 33 deletions(-)

-- 
2.37.2



Re: [PATCH v2 3/3] p1689r5: initial support

2022-11-01 Thread Ben Boeckel via Gcc-patches
On Tue, Nov 01, 2022 at 08:57:37 -0600, Tom Tromey wrote:
> > "Ben" == Ben Boeckel via Gcc-patches  writes:
> 
> Ben> - `-fdeps-file=` specifies the path to the file to write the format to.
> 
> I don't know how this output is intended to be used, but one mistake
> made with the other dependency-tracking options was that the output file
> isn't created atomically.  As a consequence, Makefiles normally have to
> work around this to be robust.  If that's a possible issue here then it
> would be best to handle it in this patch.

I don't think there'll be any race here because it's the "output" of the
rule as far as the build graph is concerned. It's also JSON, so anything
reading it "early" will get a partial object and easily detect
"something went wrong". And for clarity, the `-o` flag used in CMake
with this is just a side effect of the `-E` mechanism used and is
completely ignored in the CMake usage of this.

--Ben


RE: [PATCH 1/2]middle-end: Add new tbranch optab to add support for bit-test-and-branch operations

2022-11-01 Thread Tamar Christina via Gcc-patches
> -Original Message-
> From: Jeff Law 
> Sent: Monday, October 31, 2022 9:16 PM
> To: Tamar Christina ; gcc-patches@gcc.gnu.org
> Cc: nd ; rguent...@suse.de
> Subject: Re: [PATCH 1/2]middle-end: Add new tbranch optab to add support
> for bit-test-and-branch operations
> 
> 
> On 10/31/22 05:53, Tamar Christina wrote:
> > Hi All,
> >
> > This adds a new test-and-branch optab that can be used to do a conditional
> test
> > of a bit and branch.   This is similar to the cbranch optab but instead can
> > test any arbitrary bit inside the register.
> >
> > This patch recognizes boolean comparisons and single bit mask tests.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * dojump.cc (do_jump): Pass along value.
> > (do_jump_by_parts_greater_rtx): Likewise.
> > (do_jump_by_parts_zero_rtx): Likewise.
> > (do_jump_by_parts_equality_rtx): Likewise.
> > (do_compare_rtx_and_jump): Likewise.
> > (do_compare_and_jump): Likewise.
> > * dojump.h (do_compare_rtx_and_jump): New.
> > * optabs.cc (emit_cmp_and_jump_insn_1): Refactor to take optab
> to check.
> > (validate_test_and_branch): New.
> > (emit_cmp_and_jump_insns): Optiobally take a value, and when
> value is
> > supplied then check if it's suitable for tbranch.
> > * optabs.def (tbranch$a4): New.
> > * doc/md.texi (tbranch@var{mode}4): Document it.
> > * optabs.h (emit_cmp_and_jump_insns):
> > * tree.h (tree_zero_one_valued_p): New.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index
> >
> c08691ab4c9a4bfe55ae81e5e228a414d6242d78..f8b32ec12f46d3fb3815f121a1
> 6b
> > 5a8a1819b66a 100644
> > --- a/gcc/doc/md.texi
> > +++ b/gcc/doc/md.texi
> > @@ -6972,6 +6972,13 @@ case, you can and should make operand 1's
> predicate reject some operators
> >   in the @samp{cstore@var{mode}4} pattern, or remove the pattern
> altogether
> >   from the machine description.
> >
> > +@cindex @code{tbranch@var{mode}4} instruction pattern @item
> > +@samp{tbranch@var{mode}4} Conditional branch instruction combined
> > +with a bit test-and-compare instruction. Operand 0 is a comparison
> > +operator.  Operand 1 is the operand of the comparison. Operand 2 is
> > +the bit position of Operand 1 to test.
> > +Operand 3 is the @code{code_label} to jump to.
> 
> Should we refine/document the set of comparison operators allowed?    Is
> operand 1 an arbitrary RTL expression or more limited?  I'm guessing its
> relatively arbitrary given how you've massaged the existing branch-on-bit
> patterns from the aarch backend.

It can be any expression in theory. However in practical terms we usually force
the values to registers before calling the expansion.  My assumption is that 
this
is for CSE purposes but that's only a guess.

> 
> 
> > +
> > +  if (TREE_CODE (val) != SSA_NAME)
> > +return false;
> > +
> > +  gimple *def = SSA_NAME_DEF_STMT (val);  if (!is_gimple_assign (def)
> > +  || gimple_assign_rhs_code (def) != BIT_AND_EXPR)
> > +return false;
> > +
> > +  tree cst = gimple_assign_rhs2 (def);
> > +
> > +  if (!tree_fits_uhwi_p (cst))
> > +return false;
> > +
> > +  tree op0 = gimple_assign_rhs1 (def);
> > +  if (TREE_CODE (op0) == SSA_NAME)
> > +{
> > +  def = SSA_NAME_DEF_STMT (op0);
> > +  if (gimple_assign_cast_p (def))
> > +   op0 = gimple_assign_rhs1 (def);
> > +}
> > +
> > +  wide_int wcst = wi::uhwi (tree_to_uhwi (cst),
> > +   TYPE_PRECISION (TREE_TYPE (op0)));
> > +  int bitpos;
> > +
> > +  if ((bitpos = wi::exact_log2 (wcst)) == -1)
> > +return false;
> 
> Do we have enough information lying around from Ranger to avoid the need
> to walk the def-use chain to discover that we're masking off all but one bit?
> 

That's an interesting thought.  I'll try to see if I can figure out how to query
Ranger here.  It would be nice to do so here.

Cheers,
Tamar

> 
> 
> >
> >
> > diff --git a/gcc/tree.h b/gcc/tree.h
> > index
> >
> 8f8a9660c9e0605eb516de194640b8c1b531b798..be3d2dee82f692e81082cf21c
> 878
> > c10f9fe9e1f1 100644
> > --- a/gcc/tree.h
> > +++ b/gcc/tree.h
> > @@ -4690,6 +4690,7 @@ extern tree signed_or_unsigned_type_for (int,
> tree);
> >   extern tree signed_type_for (tree);
> >   extern tree unsigned_type_for (tree);
> >   extern bool is_truth_type_for (tree, tree);
> > +extern bool tree_zero_one_valued_p (tree);
> 
> I don't see a definition of this anywhere.
> 
> 
> jeff
> 



Re: [PATCH] riscv/RTEMS: Add RISCV_GCOV_TYPE_SIZE

2022-11-01 Thread Jeff Law via Gcc-patches



On 10/27/22 23:47, Sebastian Huber wrote:

On 28/10/2022 01:05, Palmer Dabbelt wrote:

On Thu, 27 Oct 2022 15:56:17 PDT (-0700), gcc-patches@gcc.gnu.org wrote:


On 10/26/22 01:49, Sebastian Huber wrote:
The RV32A extension does not support 64-bit atomic operations.  For 
RTEMS, use

a 32-bit gcov type for RV32.

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_gcov_type_size): New.
(TARGET_GCOV_TYPE_SIZE): Likewise.
* config/riscv/rtems.h (RISCV_GCOV_TYPE_SIZE): New.


Why make this specific to rtems?  ISTM the logic behind this change
would apply independently of the os.


Reducing the gcov type to 32-bit has the drawback that the program 
runtime is reduced. I am not sure if this is generally acceptable.


Right, but if you're limited by RV32A, then we're architecturally 
limited to 32bit atomics.  So something has to give.



I'm not objecting to this for rtems.  I'm just noting that if we're 
dealing with an architectural limitation, then the issue is likely to 
show up in other operating systems, so we should at least ponder if we 
want to do an OS specific change or something more general.



Jeff




Re: [committed] libstdc++: Fix compare_exchange_padding.cc test for std::atomic_ref

2022-11-01 Thread Eric Botcazou via Gcc-patches
> Do those loads still get scalarized at -O0?

Presumably not at the GIMPLE level, but possibly at the RTL level.

-- 
Eric Botcazou




RE: [PATCH 8/8]AArch64: Have reload not choose to do add on the scalar side if both values exist on the SIMD side.

2022-11-01 Thread Tamar Christina via Gcc-patches
> -Original Message-
> From: Richard Sandiford 
> Sent: Tuesday, November 1, 2022 3:05 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; Kyrylo Tkachov 
> Subject: Re: [PATCH 8/8]AArch64: Have reload not choose to do add on the
> scalar side if both values exist on the SIMD side.
> 
> Tamar Christina  writes:
> > Hi All,
> >
> > Currently we often times generate an r -> r add even if it means we
> > need two reloads to perform it, i.e. in the case that the values are on the
> SIMD side.
> >
> > The pairwise operations expose these more now and so we get suboptimal
> codegen.
> >
> > Normally I would have liked to use ^ or $ here, but while this works
> > for the simple examples, reload inexplicably falls apart on examples
> > that should have been trivial. It forces a move to r -> w to use the w
> > ADD, which is counter to what ^ and $ should do.
> >
> > However ! seems to fix all the regression and still maintains the good
> codegen.
> >
> > I have tried looking into whether it's our costings that are off, but
> > I can't seem anything logical here.  So I'd like to push this change
> > instead along with test that augment the other testcases that guard the r ->
> r variants.
> 
> This feels like a hack though.  r<-r+r is one of the simplest thing the 
> processor
> can do, so I don't think it makes logical sense to mark it with !, which means
> "prohibitively expensive".  It's likely to push operations that require 
> reloads
> onto the SIMD side.

I agree. Though at the moment, reload isn't behaving as it should. It's almost 
as if
the register transfer costs are not taken into account when deciding on an 
alternative.

It seems to think that an r->r and w->w are as cheap even when the value has 
been assigned
to w before.  For instance, some of the testcases below don't work correctly 
because of this.

I don't think I can influence this costing, and as I mentioned ^ works for the 
simple example
But then somehow makes w->w cheaper even though the value was assigned to r.

I'm not really sure where to look here, but the current version is also equally 
broken..
It basically always forces to r.

Thanks,
Tamar

> 
> Thanks,
> Richard
> 
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64.md (*add3_aarch64): Add ! to the
> r -> r
> > alternative.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/simd/scalar_addp.c: New test.
> > * gcc.target/aarch64/simd/scalar_faddp.c: New test.
> > * gcc.target/aarch64/simd/scalar_faddp2.c: New test.
> > * gcc.target/aarch64/simd/scalar_fmaxp.c: New test.
> > * gcc.target/aarch64/simd/scalar_fminp.c: New test.
> > * gcc.target/aarch64/simd/scalar_maxp.c: New test.
> > * gcc.target/aarch64/simd/scalar_minp.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64.md
> > b/gcc/config/aarch64/aarch64.md index
> >
> 09ae1118371f82ca63146fceb953eb9e820d05a4..c333fb1f72725992bb304c560f
> 12
> > 45a242d5192d 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -2043,7 +2043,7 @@ (define_expand "add3"
> >
> >  (define_insn "*add3_aarch64"
> >[(set
> > -(match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk")
> > +(match_operand:GPI 0 "register_operand" "=rk,!rk,w,rk,r,r,rk")
> >  (plus:GPI
> >   (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk")
> >   (match_operand:GPI 2 "aarch64_pluslong_operand"
> > "I,r,w,J,Uaa,Uai,Uav")))] diff --git
> > a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> > b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> > new file mode 100644
> > index
> >
> ..5b8d40f19884fc7b4e7decd80
> 758
> > bc36fa76d058
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> > @@ -0,0 +1,70 @@
> > +/* { dg-do assemble } */
> > +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef long long v2di __attribute__((vector_size (16))); typedef
> > +unsigned long long v2udi __attribute__((vector_size (16))); typedef
> > +int v2si __attribute__((vector_size (16))); typedef unsigned int
> > +v2usi __attribute__((vector_size (16)));
> > +
> > +/*
> > +** foo:
> > +** addpd0, v0.2d
> > +** fmovx0, d0
> > +** ret
> > +*/
> > +long long
> > +foo (v2di x)
> > +{
> > +  return x[1] + x[0];
> > +}
> > +
> > +/*
> > +** foo1:
> > +** saddlp  v0.1d, v0.2s
> > +** fmovx0, d0
> > +** ret
> > +*/
> > +long long
> > +foo1 (v2si x)
> > +{
> > +  return x[1] + x[0];
> > +}
> > +
> > +/*
> > +** foo2:
> > +** uaddlp  v0.1d, v0.2s
> > +** fmovx0, d0
> > +** ret
> > +*/
> > 

RE: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.

2022-11-01 Thread Tamar Christina via Gcc-patches
> -Original Message-
> From: Richard Sandiford 
> Sent: Tuesday, November 1, 2022 2:59 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw
> ; Marcus Shawcroft
> ; Kyrylo Tkachov 
> Subject: Re: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.
> 
> Tamar Christina  writes:
> > Hi All,
> >
> > The backend has an existing V2HFmode that is used by pairwise operations.
> > This mode was however never made fully functional.  Amongst other
> > things it was never declared as a vector type which made it unusable from
> the mid-end.
> >
> > It's also lacking an implementation for load/stores so reload ICEs if
> this mode is ever used.  This finishes the implementation by providing the
> above.
> >
> > Note that I have created a new iterator VHSDF_P instead of extending
> > VHSDF because the previous iterator is used in far more things than just
> load/stores.
> >
> > It's also used for instance in intrinsics and extending this would
> > force me to provide support for mangling the type while we never
> > expose it through intrinsics.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
> > (mov, movmisalign, aarch64_dup_lane,
> > aarch64_store_lane0, aarch64_simd_vec_set,
> > @aarch64_simd_vec_copy_lane, vec_set,
> > reduc__scal_, reduc__scal_,
> > aarch64_reduc__internal,
> aarch64_get_lane,
> > vec_init, vec_extract): Support V2HF.
> > * config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
> > Add E_V2HFmode.
> > * config/aarch64/iterators.md (VHSDF_P): New.
> > (V2F, VALL_F16_FULL, nunits, Vtype, Vmtype, Vetype, stype, VEL,
> > Vel, q, vp): Add V2HF.
> > * config/arm/types.md (neon_fp_reduc_add_h): New.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/sve/slp_1.c: Update testcase.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/config/aarch64/aarch64-simd.md
> > b/gcc/config/aarch64/aarch64-simd.md
> > index
> >
> 25aed74f8cf939562ed65a578fe32ca76605b58a..93a2888f567460ad10ec050ea7
> d4
> > f701df4729d1 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -19,10 +19,10 @@
> >  ;; .
> >
> >  (define_expand "mov"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -   (match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> > +   (match_operand:VALL_F16_FULL 1 "general_operand"))]
> >"TARGET_SIMD"
> > -  "
> > +{
> >/* Force the operand into a register if it is not an
> >   immediate whose use can be replaced with xzr.
> >   If the mode is 16 bytes wide, then we will be doing @@ -46,12
> > +46,11 @@ (define_expand "mov"
> >aarch64_expand_vector_init (operands[0], operands[1]);
> >DONE;
> >  }
> > -  "
> > -)
> > +})
> >
> >  (define_expand "movmisalign"
> > -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> > -(match_operand:VALL_F16 1 "general_operand"))]
> > +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> > +(match_operand:VALL_F16_FULL 1 "general_operand"))]
> >"TARGET_SIMD && !STRICT_ALIGNMENT"
> >  {
> >/* This pattern is not permitted to fail during expansion: if both
> > arguments @@ -85,10 +84,10 @@ (define_insn
> "aarch64_simd_dup"
> >  )
> >
> >  (define_insn "aarch64_dup_lane"
> > -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> > -   (vec_duplicate:VALL_F16
> > +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
> > +   (vec_duplicate:VALL_F16_FULL
> >   (vec_select:
> > -   (match_operand:VALL_F16 1 "register_operand" "w")
> > +   (match_operand:VALL_F16_FULL 1 "register_operand" "w")
> > (parallel [(match_operand:SI 2 "immediate_operand" "i")])
> >)))]
> >"TARGET_SIMD"
> > @@ -142,6 +141,29 @@ (define_insn
> "*aarch64_simd_mov"
> >  mov_reg, neon_move")]
> >  )
> >
> > +(define_insn "*aarch64_simd_movv2hf"
> > +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> > +   "=w, m,  m,  w, ?r, ?w, ?r, w, w")
> > +   (match_operand:V2HF 1 "general_operand"
> > +   "m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> > +  "TARGET_SIMD_F16INST
> > +   && (register_operand (operands[0], V2HFmode)
> > +   || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> > +   "@
> > +ldr\\t%s0, %1
> > +str\\twzr, %0
> > +str\\t%s1, %0
> > +mov\\t%0.2s[0], %1.2s[0]
> > +umov\\t%w0, %1.s[0]
> > +fmov\\t%s0, %1
> > +mov\\t%0, %1
> > +movi\\t%d0, 0
> > +* return aarch64_output_simd_mov_immediate (operands[1], 32);"
> > +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> > +neon_logic, neon_to_gp, f_mcr,\
> > +mov_reg, 

Re: [PATCH 8/8]AArch64: Have reload not choose to do add on the scalar side if both values exist on the SIMD side.

2022-11-01 Thread Richard Sandiford via Gcc-patches
Tamar Christina  writes:
> Hi All,
>
> Currently we often times generate an r -> r add even if it means we need two
> reloads to perform it, i.e. in the case that the values are on the SIMD side.
>
> The pairwise operations expose these more now and so we get suboptimal 
> codegen.
>
> Normally I would have liked to use ^ or $ here, but while this works for the
> simple examples, reload inexplicably falls apart on examples that should have
> been trivial. It forces a move to r -> w to use the w ADD, which is counter to
> what ^ and $ should do.
>
> However ! seems to fix all the regression and still maintains the good 
> codegen.
>
> I have tried looking into whether it's our costings that are off, but I can't
> see anything logical here.  So I'd like to push this change instead along 
> with
> test that augment the other testcases that guard the r -> r variants.

This feels like a hack though.  r<-r+r is one of the simplest things
the processor can do, so I don't think it makes logical sense to mark
it with !, which means "prohibitively expensive".  It's likely to
push operations that require reloads onto the SIMD side.

Thanks,
Richard

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64.md (*add3_aarch64): Add ! to the r -> r
>   alternative.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/simd/scalar_addp.c: New test.
>   * gcc.target/aarch64/simd/scalar_faddp.c: New test.
>   * gcc.target/aarch64/simd/scalar_faddp2.c: New test.
>   * gcc.target/aarch64/simd/scalar_fmaxp.c: New test.
>   * gcc.target/aarch64/simd/scalar_fminp.c: New test.
>   * gcc.target/aarch64/simd/scalar_maxp.c: New test.
>   * gcc.target/aarch64/simd/scalar_minp.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 
> 09ae1118371f82ca63146fceb953eb9e820d05a4..c333fb1f72725992bb304c560f1245a242d5192d
>  100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -2043,7 +2043,7 @@ (define_expand "add3"
>  
>  (define_insn "*add3_aarch64"
>[(set
> -(match_operand:GPI 0 "register_operand" "=rk,rk,w,rk,r,r,rk")
> +(match_operand:GPI 0 "register_operand" "=rk,!rk,w,rk,r,r,rk")
>  (plus:GPI
>   (match_operand:GPI 1 "register_operand" "%rk,rk,w,rk,rk,0,rk")
>   (match_operand:GPI 2 "aarch64_pluslong_operand" 
> "I,r,w,J,Uaa,Uai,Uav")))]
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c 
> b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> new file mode 100644
> index 
> ..5b8d40f19884fc7b4e7decd80758bc36fa76d058
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> @@ -0,0 +1,70 @@
> +/* { dg-do assemble } */
> +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef long long v2di __attribute__((vector_size (16)));
> +typedef unsigned long long v2udi __attribute__((vector_size (16)));
> +typedef int v2si __attribute__((vector_size (16)));
> +typedef unsigned int v2usi __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +**   addpd0, v0.2d
> +**   fmovx0, d0
> +**   ret
> +*/
> +long long
> +foo (v2di x)
> +{
> +  return x[1] + x[0];
> +}
> +
> +/*
> +** foo1:
> +**   saddlp  v0.1d, v0.2s
> +**   fmovx0, d0
> +**   ret
> +*/
> +long long
> +foo1 (v2si x)
> +{
> +  return x[1] + x[0];
> +}
> +
> +/*
> +** foo2:
> +**   uaddlp  v0.1d, v0.2s
> +**   fmovx0, d0
> +**   ret
> +*/
> +unsigned long long
> +foo2 (v2usi x)
> +{
> +  return x[1] + x[0];
> +}
> +
> +/*
> +** foo3:
> +**   uaddlp  v0.1d, v0.2s
> +**   add d0, d0, d1
> +**   fmovx0, d0
> +**   ret
> +*/
> +unsigned long long
> +foo3 (v2usi x, v2udi y)
> +{
> +  return (x[1] + x[0]) + y[0];
> +}
> +
> +/*
> +** foo4:
> +**   saddlp  v0.1d, v0.2s
> +**   add d0, d0, d1
> +**   fmovx0, d0
> +**   ret
> +*/
> +long long
> +foo4 (v2si x, v2di y)
> +{
> +  return (x[1] + x[0]) + y[0];
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c 
> b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> new file mode 100644
> index 
> ..ff455e060fc833b2f63e89c467b91a76fbe31aff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> @@ -0,0 +1,66 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> +/* { dg-additional-options "-save-temps -O1" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef double v2df __attribute__((vector_size (16)));
> +typedef float v4sf __attribute__((vector_size (16)));
> +typedef __fp16 v8hf __attribute__((vector_size (16)));
> +
> +/*
> +** foo:
> +**   faddp   

Re: [PATCH 5/8]AArch64 aarch64: Make existing V2HF be usable.

2022-11-01 Thread Richard Sandiford via Gcc-patches
Tamar Christina  writes:
> Hi All,
>
> The backend has an existing V2HFmode that is used by pairwise operations.
> This mode was however never made fully functional.  Amongst other things it 
> was
> never declared as a vector type which made it unusable from the mid-end.
>
> It's also lacking an implementation for load/stores so reload ICEs if this 
> mode
> is ever used.  This finishes the implementation by providing the above.
>
> Note that I have created a new iterator VHSDF_P instead of extending VHSDF
> because the previous iterator is used in far more things than just 
> load/stores.
>
> It's also used for instance in intrinsics and extending this would force me to
> provide support for mangling the type while we never expose it through
> intrinsics.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64-simd.md (*aarch64_simd_movv2hf): New.
>   (mov, movmisalign, aarch64_dup_lane,
>   aarch64_store_lane0, aarch64_simd_vec_set,
>   @aarch64_simd_vec_copy_lane, vec_set,
>   reduc__scal_, reduc__scal_,
>   aarch64_reduc__internal, aarch64_get_lane,
>   vec_init, vec_extract): Support V2HF.
>   * config/aarch64/aarch64.cc (aarch64_classify_vector_mode):
>   Add E_V2HFmode.
>   * config/aarch64/iterators.md (VHSDF_P): New.
>   (V2F, VALL_F16_FULL, nunits, Vtype, Vmtype, Vetype, stype, VEL,
>   Vel, q, vp): Add V2HF.
>   * config/arm/types.md (neon_fp_reduc_add_h): New.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/sve/slp_1.c: Update testcase.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> 25aed74f8cf939562ed65a578fe32ca76605b58a..93a2888f567460ad10ec050ea7d4f701df4729d1
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -19,10 +19,10 @@
>  ;; .
>  
>  (define_expand "mov"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> - (match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> + (match_operand:VALL_F16_FULL 1 "general_operand"))]
>"TARGET_SIMD"
> -  "
> +{
>/* Force the operand into a register if it is not an
>   immediate whose use can be replaced with xzr.
>   If the mode is 16 bytes wide, then we will be doing
> @@ -46,12 +46,11 @@ (define_expand "mov"
>aarch64_expand_vector_init (operands[0], operands[1]);
>DONE;
>  }
> -  "
> -)
> +})
>  
>  (define_expand "movmisalign"
> -  [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
> -(match_operand:VALL_F16 1 "general_operand"))]
> +  [(set (match_operand:VALL_F16_FULL 0 "nonimmediate_operand")
> +(match_operand:VALL_F16_FULL 1 "general_operand"))]
>"TARGET_SIMD && !STRICT_ALIGNMENT"
>  {
>/* This pattern is not permitted to fail during expansion: if both 
> arguments
> @@ -85,10 +84,10 @@ (define_insn "aarch64_simd_dup"
>  )
>  
>  (define_insn "aarch64_dup_lane"
> -  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
> - (vec_duplicate:VALL_F16
> +  [(set (match_operand:VALL_F16_FULL 0 "register_operand" "=w")
> + (vec_duplicate:VALL_F16_FULL
> (vec_select:
> - (match_operand:VALL_F16 1 "register_operand" "w")
> + (match_operand:VALL_F16_FULL 1 "register_operand" "w")
>   (parallel [(match_operand:SI 2 "immediate_operand" "i")])
>)))]
>"TARGET_SIMD"
> @@ -142,6 +141,29 @@ (define_insn "*aarch64_simd_mov"
>mov_reg, neon_move")]
>  )
>  
> +(define_insn "*aarch64_simd_movv2hf"
> +  [(set (match_operand:V2HF 0 "nonimmediate_operand"
> + "=w, m,  m,  w, ?r, ?w, ?r, w, w")
> + (match_operand:V2HF 1 "general_operand"
> + "m,  Dz, w,  w,  w,  r,  r, Dz, Dn"))]
> +  "TARGET_SIMD_F16INST
> +   && (register_operand (operands[0], V2HFmode)
> +   || aarch64_simd_reg_or_zero (operands[1], V2HFmode))"
> +   "@
> +ldr\\t%s0, %1
> +str\\twzr, %0
> +str\\t%s1, %0
> +mov\\t%0.2s[0], %1.2s[0]
> +umov\\t%w0, %1.s[0]
> +fmov\\t%s0, %1
> +mov\\t%0, %1
> +movi\\t%d0, 0
> +* return aarch64_output_simd_mov_immediate (operands[1], 32);"
> +  [(set_attr "type" "neon_load1_1reg, store_8, neon_store1_1reg,\
> +  neon_logic, neon_to_gp, f_mcr,\
> +  mov_reg, neon_move, neon_move")]
> +)
> +
>  (define_insn "*aarch64_simd_mov"
>[(set (match_operand:VQMOV 0 "nonimmediate_operand"
>   "=w, Umn,  m,  w, ?r, ?w, ?r, w")
> @@ -182,7 +204,7 @@ (define_insn "*aarch64_simd_mov"
>  
>  (define_insn "aarch64_store_lane0"
>[(set (match_operand: 0 "memory_operand" "=m")
> - (vec_select: (match_operand:VALL_F16 1 "register_operand" "w")
> + (vec_select: (match_operand:VALL_F16_FULL 1 "register_operand" "w")
>   

Re: [PATCH v2 3/3] p1689r5: initial support

2022-11-01 Thread Tom Tromey
> "Ben" == Ben Boeckel via Gcc-patches  writes:

Ben> - `-fdeps-file=` specifies the path to the file to write the format to.

I don't know how this output is intended to be used, but one mistake
made with the other dependency-tracking options was that the output file
isn't created atomically.  As a consequence, Makefiles normally have to
work around this to be robust.  If that's a possible issue here then it
would be best to handle it in this patch.

Tom


Re: [PATCH 4/8]AArch64 aarch64: Implement widening reduction patterns

2022-11-01 Thread Richard Sandiford via Gcc-patches
Tamar Christina  writes:
> Hi All,
>
> This implements the new widening reduction optab in the backend.
> Instead of introducing a duplicate definition for the same thing I have
> renamed the intrinsics defintions to use the same optab.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64-simd-builtins.def (saddlv, uaddlv): Rename to
>   reduc_splus_widen_scal_ and reduc_uplus_widen_scal_ respectively.
>   * config/aarch64/aarch64-simd.md (aarch64_addlv): Renamed to
>   ...
>   (reduc_plus_widen_scal_): ... This.
>   * config/aarch64/arm_neon.h (vaddlv_s8, vaddlv_s16, vaddlv_u8,
>   vaddlv_u16, vaddlvq_s8, vaddlvq_s16, vaddlvq_s32, vaddlvq_u8,
>   vaddlvq_u16, vaddlvq_u32, vaddlv_s32, vaddlv_u32): Use it.

OK, thanks.

Richard

> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def 
> b/gcc/config/aarch64/aarch64-simd-builtins.def
> index 
> cf46b31627b84476a25762ffc708fd84a4086e43..a4b21e1495c5699d8557a4bcb9e73ef98ae60b35
>  100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -190,9 +190,9 @@
>BUILTIN_VDQV_L (UNOP, saddlp, 0, NONE)
>BUILTIN_VDQV_L (UNOPU, uaddlp, 0, NONE)
>  
> -  /* Implemented by aarch64_addlv.  */
> -  BUILTIN_VDQV_L (UNOP, saddlv, 0, NONE)
> -  BUILTIN_VDQV_L (UNOPU, uaddlv, 0, NONE)
> +  /* Implemented by reduc_plus_widen_scal_.  */
> +  BUILTIN_VDQV_L (UNOP, reduc_splus_widen_scal_, 10, NONE)
> +  BUILTIN_VDQV_L (UNOPU, reduc_uplus_widen_scal_, 10, NONE)
>  
>/* Implemented by aarch64_abd.  */
>BUILTIN_VDQ_BHSI (BINOP, sabd, 0, NONE)
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> cf8c094bd4b76981cef2dd5dd7b8e6be0d56101f..25aed74f8cf939562ed65a578fe32ca76605b58a
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3455,7 +3455,7 @@ (define_expand "reduc_plus_scal_v4sf"
>DONE;
>  })
>  
> -(define_insn "aarch64_addlv"
> +(define_insn "reduc_plus_widen_scal_"
>   [(set (match_operand: 0 "register_operand" "=w")
> (unspec: [(match_operand:VDQV_L 1 "register_operand" "w")]
>   USADDLV))]
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 
> cf6af728ca99dae1cb6ab647466cfec32f7e913e..7b2c4c016191bcd6c3e075d27810faedb23854b7
>  100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -3664,70 +3664,70 @@ __extension__ extern __inline int16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlv_s8 (int8x8_t __a)
>  {
> -  return __builtin_aarch64_saddlvv8qi (__a);
> +  return __builtin_aarch64_reduc_splus_widen_scal_v8qi (__a);
>  }
>  
>  __extension__ extern __inline int32_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlv_s16 (int16x4_t __a)
>  {
> -  return __builtin_aarch64_saddlvv4hi (__a);
> +  return __builtin_aarch64_reduc_splus_widen_scal_v4hi (__a);
>  }
>  
>  __extension__ extern __inline uint16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlv_u8 (uint8x8_t __a)
>  {
> -  return __builtin_aarch64_uaddlvv8qi_uu (__a);
> +  return __builtin_aarch64_reduc_uplus_widen_scal_v8qi_uu (__a);
>  }
>  
>  __extension__ extern __inline uint32_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlv_u16 (uint16x4_t __a)
>  {
> -  return __builtin_aarch64_uaddlvv4hi_uu (__a);
> +  return __builtin_aarch64_reduc_uplus_widen_scal_v4hi_uu (__a);
>  }
>  
>  __extension__ extern __inline int16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlvq_s8 (int8x16_t __a)
>  {
> -  return __builtin_aarch64_saddlvv16qi (__a);
> +  return __builtin_aarch64_reduc_splus_widen_scal_v16qi (__a);
>  }
>  
>  __extension__ extern __inline int32_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlvq_s16 (int16x8_t __a)
>  {
> -  return __builtin_aarch64_saddlvv8hi (__a);
> +  return __builtin_aarch64_reduc_splus_widen_scal_v8hi (__a);
>  }
>  
>  __extension__ extern __inline int64_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlvq_s32 (int32x4_t __a)
>  {
> -  return __builtin_aarch64_saddlvv4si (__a);
> +  return __builtin_aarch64_reduc_splus_widen_scal_v4si (__a);
>  }
>  
>  __extension__ extern __inline uint16_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlvq_u8 (uint8x16_t __a)
>  {
> -  return __builtin_aarch64_uaddlvv16qi_uu (__a);
> +  return __builtin_aarch64_reduc_uplus_widen_scal_v16qi_uu (__a);
>  }
>  
>  __extension__ extern __inline uint32_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vaddlvq_u16 (uint16x8_t __a)
>  {
> -  return __builtin_aarch64_uaddlvv8hi_uu (__a);
> +  return 

Re: [PATCH 3/8]middle-end: Support extractions of subvectors from arbitrary element position inside a vector

2022-11-01 Thread Richard Sandiford via Gcc-patches
Tamar Christina via Gcc-patches  writes:
> Hi All,
>
> The current vector extract pattern can only extract from a vector when the
> position to extract is a multiple of the vector bitsize as a whole.
>
> That means extract something like a V2SI from a V4SI vector from position 32
> isn't possible as 32 is not a multiple of 64.  Ideally this optab should have
> worked on multiple of the element size, but too many targets rely on this
> semantic now.
>
> So instead add a new case which allows any extraction as long as the bit pos
> is a multiple of the element size.  We use a VEC_PERM to shuffle the elements
> into the bottom parts of the vector and then use a subreg to extract the 
> values
> out.  This now allows various vector operations that before were being
> decomposed into very inefficient scalar operations.
>
> NOTE: I added 3 testcases, I only fixed the 3rd one.
>
> The 1st one missed because we don't optimize VEC_PERM expressions into
> bitfields.  The 2nd one is missed because extract_bit_field only works on
> vector modes.  In this case the intermediate extract is DImode.
>
> On targets where the scalar mode is tiable to vector modes the extract should
> work fine.
>
> However I ran out of time to fix the first two and so will do so in GCC 14.
> For now this catches the case that my pattern now introduces more easily.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   * expmed.cc (extract_bit_field_1): Add support for vector element
>   extracts.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/ext_1.c: New.
>
> --- inline copy of patch -- 
> diff --git a/gcc/expmed.cc b/gcc/expmed.cc
> index 
> bab020c07222afa38305ef8d7333f271b1965b78..ffdf65210d17580a216477cfe4ac1598941ac9e4
>  100644
> --- a/gcc/expmed.cc
> +++ b/gcc/expmed.cc
> @@ -1718,6 +1718,45 @@ extract_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, 
> poly_uint64 bitnum,
> return target;
>   }
>   }
> +  else if (!known_eq (bitnum, 0U)
> +&& multiple_p (GET_MODE_UNIT_BITSIZE (tmode), bitnum, ))
> + {
> +   /* The encoding has a single stepped pattern.  */
> +   poly_uint64 nunits = GET_MODE_NUNITS (new_mode);
> +   int nelts = nunits.to_constant ();
> +   vec_perm_builder sel (nunits, nelts, 1);
> +   int delta = -pos.to_constant ();
> +   for (int i = 0; i < nelts; ++i)
> + sel.quick_push ((i - delta) % nelts);
> +   vec_perm_indices indices (sel, 1, nunits);

Thanks for doing this, looks good.  But I don't think the to_constant
calls are safe.  new_mode and pos could in principle be nonconstant.

To build a stepped pattern, we just need:

vec_perm_builder sel (nunits, 1, 3);

and then push pos, pos + 1, and pos + 2 to it.  There's no need to
clamp the position to nelts, it happens automatically.

> +
> +   if (can_vec_perm_const_p (new_mode, new_mode, indices, false))
> + {
> +   class expand_operand ops[4];
> +   machine_mode outermode = new_mode;
> +   machine_mode innermode = tmode;
> +   enum insn_code icode
> + = direct_optab_handler (vec_perm_optab, outermode);
> +   target = gen_reg_rtx (outermode);
> +   if (icode != CODE_FOR_nothing)
> + {
> +   rtx sel = vec_perm_indices_to_rtx (outermode, indices);
> +   create_output_operand ([0], target, outermode);
> +   ops[0].target = 1;
> +   create_input_operand ([1], op0, outermode);
> +   create_input_operand ([2], op0, outermode);
> +   create_input_operand ([3], sel, outermode);

I think this should be GET_MODE (sel).  Looks like the current
version would ICE for float vectors.  That said...

> +   if (maybe_expand_insn (icode, 4, ops))
> + return simplify_gen_subreg (innermode, target, outermode, 
> 0);
> + }
> +   else if (targetm.vectorize.vec_perm_const != NULL)
> + {
> +   if (targetm.vectorize.vec_perm_const (outermode, outermode,
> + target, op0, op0, 
> indices))
> + return simplify_gen_subreg (innermode, target, outermode, 
> 0);
> + }

...can we use expand_vec_perm_const here?  It will try the constant
expansion first, which is the preferred order.  It also has a few
variations up its sleeve.

Thanks,
Richard


> + }
> + }
>  }
>  
>/* See if we can get a better vector mode before extracting.  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/ext_1.c 
> b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> new file mode 100644
> index 
> ..18a10a14f1161584267a8472e571b3bc2ddf887a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/ext_1.c
> @@ -0,0 +1,54 @@
> +/* { dg-do compile } */
> +/* { 

[PATCH] doc: Remove outdated reference to "core" and front-end downloads

2022-11-01 Thread Jonathan Wakely via Gcc-patches
This will just confuse most users, the separate tarballs haven't existed
for years (as already noted elsewhere in install.texi).

OK for trunk?

-- >8 --

gcc/ChangeLog:

* doc/install.texi: Remove anachronism about separate source
tarballs.
---
 gcc/doc/install.texi | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gcc/doc/install.texi b/gcc/doc/install.texi
index c1876f24a84..e2c397c538a 100644
--- a/gcc/doc/install.texi
+++ b/gcc/doc/install.texi
@@ -3102,9 +3102,7 @@ but it can give you confidence in your new GCC 
installation or point out
 problems before you install and start using your new GCC@.
 
 First, you must have @uref{download.html,,downloaded the testsuites}.
-These are part of the full distribution, but if you downloaded the
-``core'' compiler plus any front ends, you must download the testsuites
-separately.
+These are included in the source tarball.
 
 Second, you must have the testing tools installed.  This includes
 @uref{https://www.gnu.org/software/dejagnu/,,DejaGnu}, Tcl, and Expect;
-- 
2.38.1



[COMMITTED] Make ranger the vrp1 default.

2022-11-01 Thread Andrew MacLeod via Gcc-patches

This patch turns ranger on by default for the VRP1 pass.

I needed to adjust gcc.dg/pr68217.c to scan for a better range ([-INF, 
-INF][0, 0]) than the original [-INF, 0] it was looking for.


This also triggers the new __builtin_unreachable code in the ranger VRP 
pass, so I added a new testcase to show accumulated unreachables are 
combined properly.  tree-ssa/pr107009.c also verifies that a 
non-dominated unreachable call doesn't incorrectly affect the global range.


Bootstrapped on x86_64-pc-linux-gnu with no regressions*. Pushed.

* OK. no regressions is debatable.  I've been seeing the following 
spurious failure for the past few weeks.  Ranger made them pass for a 
while, then made them fail, so Ive been ignoring them. I took a quick 
look.  Basically, we optimize away an unnecessary statement feeding a 
condition based on a combination of what ranger calculates and results 
from loop analysis.  (The statement is not referenced in any debug 
statement).  That later causes a PHI to no longer been needed, and DCE2 
removed the PHI and some other stuff, which then causes the debug_stmt 
to lose its reference.  /blame dce2 :-)


< FAIL: gcc.dg/guality/pr54693-2.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 21 x == 10 - i
< FAIL: gcc.dg/guality/pr54693-2.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 21 y == 20 - 2 * i
< FAIL: gcc.dg/guality/pr54693-2.c   -O2 -flto -fuse-linker-plugin 
-fno-fat-lto-objects  -DPREVENT_OPTIMIZATION line 21 z == 30 - 3 * i


I choose to ignore this for now...  I want to get VRP1 turned to ranger 
by default and see if any issues show up.  I believe this removes the 
last remaining use of legacy vrp.


Andrew
commit e7310e24b1c0ca67b1bb507c1330b2bf39e59e32
Author: Andrew MacLeod 
Date:   Tue Oct 25 16:42:41 2022 -0400

Make ranger vrp1 default.

Turn on ranger as the default vrp1 pass and adjust testcases.

gcc/
* params.opt (param_vrp1_mode): Make ranger default.

gcc/testsuite/
* gcc.dg/pr68217.c: Test [-INF, -INF][0, 0] instead of [-INF, 0].
* gcc.dg/tree-ssa/vrp-unreachable.c: New.  Test unreachable removal.

diff --git a/gcc/params.opt b/gcc/params.opt
index 3001566e641..a34fee193fc 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1166,7 +1166,7 @@ Common Joined UInteger Var(param_vect_induction_float) 
Init(1) IntegerRage(0, 1)
 Enable loop vectorization of floating point inductions.
 
 -param=vrp1-mode=
-Common Joined Var(param_vrp1_mode) Enum(vrp_mode) Init(VRP_MODE_VRP) Param 
Optimization
+Common Joined Var(param_vrp1_mode) Enum(vrp_mode) Init(VRP_MODE_RANGER) Param 
Optimization
 --param=vrp1-mode=[vrp|ranger] Specifies the mode VRP1 should operate in.
 
 -param=vrp2-mode=
diff --git a/gcc/testsuite/gcc.dg/pr68217.c b/gcc/testsuite/gcc.dg/pr68217.c
index eb4f15e048f..60c80106760 100644
--- a/gcc/testsuite/gcc.dg/pr68217.c
+++ b/gcc/testsuite/gcc.dg/pr68217.c
@@ -10,4 +10,4 @@ int foo (void)
 return 0;
 }
 
-/* { dg-final { scan-tree-dump "\\\[-INF, 0\\\]" "vrp1" } } */
+/* { dg-final { scan-tree-dump "\\\[-INF, -INF\\\]\\\[0, 0\\\]" "vrp1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vrp-unreachable.c 
b/gcc/testsuite/gcc.dg/tree-ssa/vrp-unreachable.c
new file mode 100644
index 000..cdc57403c6e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vrp-unreachable.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-vrp1-alias -fdump-tree-vrp2-alias" } */
+
+void dead (unsigned n);
+void alive (unsigned n);
+
+void func (unsigned n, unsigned m)
+{
+  if (n == 0)
+__builtin_unreachable();
+  if (n == 1)
+__builtin_unreachable();
+  if (n & 0x1)
+__builtin_unreachable();
+  if (n == 2)
+__builtin_unreachable();
+  if (n == 3)
+__builtin_unreachable();
+  if (n & 0x2)
+__builtin_unreachable();
+  if (n == 4)
+__builtin_unreachable();
+  if (n == 5)
+__builtin_unreachable();
+  if (n & 0x4)
+__builtin_unreachable();
+  if (n == 6)
+__builtin_unreachable();
+  if (n == 7)
+__builtin_unreachable();
+ if (n <8)
+   dead (n);
+ if (n != m)
+__builtin_unreachable();
+ alive (n);
+ alive (m);
+}
+
+/* { dg-final { scan-tree-dump-not "dead" "vrp1" } } */
+/* { dg-final { scan-tree-dump-times "builtin_unreachable" 1 "vrp1" } } */
+/* { dg-final { scan-tree-dump-not "builtin_unreachable" "vrp2" } } */
+/* { dg-final { scan-tree-dump-times "fff8" 4 "vrp2" } } */


Re: [committed] libstdc++: Fix compare_exchange_padding.cc test for std::atomic_ref

2022-11-01 Thread Jonathan Wakely via Gcc-patches
On Mon, 31 Oct 2022 at 17:05, Jonathan Wakely  wrote:
>
> On Mon, 31 Oct 2022 at 17:03, Eric Botcazou  wrote:
> >
> > > I suppose we could use memcmp on the as variable itself, to inspect
> > > the actual stored padding rather than the returned copy of it.
> >
> > Yes, that's probably the only safe stance when optimization is enabled.
>
>
> Strictly speaking, it's not safe, because it's undefined to use memcmp
> on an object of a non-trivial type. But it should work.

Do those loads still get scalarized at -O0?



Re: [PATCH] libstdc++: Shortest denormal hex std::to_chars

2022-11-01 Thread Patrick Palka via Gcc-patches
On Tue, 1 Nov 2022, Jonathan Wakely wrote:

> On Tue, 1 Nov 2022 at 12:18, Jakub Jelinek  wrote:
> >
> > On Fri, Oct 28, 2022 at 12:52:44PM -0400, Patrick Palka wrote:
> > > > The following patch on top of
> > > > https://gcc.gnu.org/pipermail/libstdc++/2022-October/054849.html
> > > > adds std::{,b}float16_t support for std::to_chars.
> > > > When precision is specified (or for std::bfloat16_t for hex mode even 
> > > > if not),
> > > > I believe we can just use the std::to_chars float (when float is mode
> > > > compatible with std::float32_t) overloads, both formats are proper 
> > > > subsets
> > > > of std::float32_t.
> > > > Unfortunately when precision is not specified and we are supposed to 
> > > > emit
> > > > shortest string, the std::{,b}float16_t strings are usually much 
> > > > shorter.
> > > > E.g. 1.e7p-14f16 shortest fixed representation is
> > > > 0.0001161 and shortest scientific representation is
> > > > 1.161e-04 while 1.e7p-14f32 (same number promoted to std::float32_t)
> > > > 0.00011610985 and
> > > > 1.1610985e-04.
> > > > Similarly for 1.38p-112bf16,
> > > > 0.0235
> > > > 2.35e-34 vs. 1.38p-112f32
> > > > 0.023472271
> > > > 2.3472271e-34
> > > > For std::float16_t there are differences even in the shortest hex, say:
> > > > 0.01p-14 vs. 1p-22
> > > > but only for denormal std::float16_t values (where all std::float16_t
> > > > denormals converted to std::float32_t are normal), __FLT16_MIN__ and
> > > > everything larger in absolute value than that is the same.  Unless
> > > > that is a bug and we should try to discover shorter representations
> > > > even for denormals...
> > >
> > > IIRC for hex formatting of denormals I opted to be consistent with how
> > > glibc printf formats them, instead of outputting the truly shortest
> > > form.
> > >
> > > I wouldn't be against using the float32 overloads even for shortest hex
> > > formatting of float16.  The output is shorter but equivalent so it
> > > shouldn't cause any problems.
> >
> > The following patch changes the behavior of the shortest hex denormals,
> > such that they are printed like normals (so for has_implicit_leading_bit
> > with 1p-149 instead of 0.02p-126 etc., otherwise (Intel extended)
> > with the leading digit before dot being [89abcdef]).  I think for all the
> > supported format it is never longer, it can be equal length e.g. for
> > 0.fep-126 vs. 1.fcp-127 but fortunately no largest subnormal
in any format has the unbiased exponent like -9, -99, -999, -9999 because
> > then it would be longer and often it is shorter, sometimes much shorter.
> >
> > For the cases with precision it keeps the handling as is.
> >
> > While for !has_implicit_leading_bit we for normals or with this patch
> > even denormals have really shortest representation, for other formats
> > we sometimes do not, but this patch doesn't deal with that (we
> > always use 1.NNN while we could use 1.NNN up to f.NNN and by that shortening
> > by the last hexit if the last hexit doesn't have least significant bit set
and unbiased exponent is not -9, -99, -999 or -9999.
> >
> > Tested on x86_64-linux (on top of the 3 to/from_chars {,b}float16_t
> > patches).
> 
> This looks good to me. Please give Patrick a chance to comment, but
> it's approved for trunk unless he objects. Thanks!

LGTM.  This'll mean the output of to_chars(denormal, hex, precision)
will no longer be based on the shortest form to_chars(denormal, hex)
which slightly bothers me, but doesn't seem to be nonconforming either.

> 
> 
> >
> > 2022-11-01  Jakub Jelinek  
> >
> > * src/c++17/floating_to_chars.cc (__floating_to_chars_hex): Drop 
> > const
> > from unbiased_exponent.  Canonicalize denormals such that they have
> > the leading bit set by shifting effective mantissa up and decreasing
> > unbiased_exponent.
> > (__floating_to_chars_shortest): Don't instantiate
> > __floating_to_chars_hex for float16_t either and use float instead.
> > * testsuite/20_util/to_chars/float.cc (float_to_chars_test_cases):
> > Adjust testcases for shortest hex denormals.
> > * testsuite/20_util/to_chars/double.cc (double_to_chars_test_cases):
> > Likewise.
> >
> > --- libstdc++-v3/src/c++17/floating_to_chars.cc.jj  2022-10-31 
> > 22:20:35.881121902 +0100
> > +++ libstdc++-v3/src/c++17/floating_to_chars.cc 2022-11-01 
> > 12:16:14.352652455 +0100
> > @@ -844,9 +844,9 @@ template
> >  const bool is_normal_number = (biased_exponent != 0);
> >
> >  // Calculate the unbiased exponent.
> > -const int32_t unbiased_exponent = (is_normal_number
> > -  ? biased_exponent - exponent_bias
> > -  : 1 - exponent_bias);
> > +int32_t unbiased_exponent = (is_normal_number
> > +? biased_exponent - exponent_bias
> > + 

[COMMITTED] Remove builtin_unreachable in ranger VRP.

2022-11-01 Thread Andrew MacLeod via Gcc-patches
Removal of __builtin_unreachable calls was being handled in an 
inconsistent way, and I'm not convinced it was always correct.   This removes 
them in the ranger VRP pass, and sets the global range appropriately.


This new approach should be consistent. After VRP runs, it uses ranger 
to query all the uses of every export affected by an edge leading to an 
unreachable builtin. It calculates their range at those use locations, 
and then verifies that the range at the end of the function also 
reflects those restrictions.


If that all holds true, then the unreachable call is removed and the 
global range updated.  If that does not hold true, then the global range 
is not set as the condition guarding the unreachable is contextual. If 
this is also the final VRP pass, the unreachable call is removed 
regardless. Otherwise it is left so other passes can still pick up the 
contextual ranges.


This will only happen when ranger is the default VRP1 pass (not enabled 
yet), although this code will trigger to ensure all unreachables are 
removed in VRP2.


Bootstrapped on x86_64-pc-linux-gnu with no regressions.  Pushed.

Andrew
From 7b1cdca6d6d594a8a9d88062252212e145f2f4eb Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Mon, 31 Oct 2022 15:18:00 -0400
Subject: [PATCH 3/3] Remove builtin_unreachable in VRP

Removal of __builtin_unreachable calls was handled in an inconsistent
way.  This removes them in the VRP pass, and sets the global range
appropriately.

	* tree-vrp.cc (class remove_unreachable): New.
	(remove_unreachable::maybe_register_block): New.
	(remove_unreachable::remove_and_update_globals): New.
	(rvrp_folder::rvrp_folder): Initialize m_unreachable.
	(rvrp_folder::post_fold_bb): Maybe register unreachable block.
	(rvrp_folder::m_unreachable): New member.
	(execute_ranger_vrp): Add final_pass flag, remove unreachables.
---
 gcc/tree-vrp.cc | 190 +++-
 1 file changed, 187 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vrp.cc b/gcc/tree-vrp.cc
index e5a292bb875..f0e4d37bef0 100644
--- a/gcc/tree-vrp.cc
+++ b/gcc/tree-vrp.cc
@@ -51,6 +51,183 @@ along with GCC; see the file COPYING3.  If not see
 #include "value-pointer-equiv.h"
 #include "gimple-fold.h"
 #include "tree-dfa.h"
+#include "tree-ssa-dce.h"
+
+// This class is utilized by VRP and ranger to remove __builtin_unreachable
+// calls, and reflect any resulting global ranges.
+//
+// maybe_register_block () is called on basic blocks, and if that block
+// matches the pattern of one branch being a builtin_unreachable, register
+// the resulting executable edge in a list.
+//
+// After all blocks have been processed, remove_and_update_globals() will
+// - check all exports from registered blocks
+// - ensure the cache entry of each export is set with the appropriate range
+// - rewrite the conditions to take the executable edge
+// - perform DCE on any feeding instructions to those rewritten conditions
+//
+// Then each of the immediate use chain of each export is walked, and a new
+// global range created by unioning the ranges at all remaining use locations.
+
+class remove_unreachable {
+public:
+  remove_unreachable (gimple_ranger ) : m_ranger (r) { m_list.create (30); }
+  ~remove_unreachable () { m_list.release (); }
+  void maybe_register_block (basic_block bb);
+  bool remove_and_update_globals (bool final_p);
+  vec m_list;
+  gimple_ranger _ranger;
+};
+
+// Check if block BB has a __builtin_unreachable () call on one arm, and
+// register the executable edge if so.
+
+void
+remove_unreachable::maybe_register_block (basic_block bb)
+{
+  gimple *s = gimple_outgoing_range_stmt_p (bb);
+  if (!s || gimple_code (s) != GIMPLE_COND)
+return;
+
+  edge e0 = EDGE_SUCC (bb, 0);
+  basic_block bb0 = e0->dest;
+  bool un0 = EDGE_COUNT (bb0->succs) == 0
+	 && gimple_seq_unreachable_p (bb_seq (bb0));
+  edge e1 = EDGE_SUCC (bb, 1);
+  basic_block bb1 = e1->dest;
+  bool un1 = EDGE_COUNT (bb1->succs) == 0
+	 && gimple_seq_unreachable_p (bb_seq (bb1));
+
+  // If the 2 blocks are not different, ignore.
+  if (un0 == un1)
+return;
+
+  if (un0)
+m_list.safe_push (e1);
+  else
+m_list.safe_push (e0);
+}
+
+// Process the edges in the list, change the conditions and removing any
+// dead code feeding those conditions.  Calculate the range of any
+// names that may have been exported from those blocks, and determine if
+// there is any updates to their global ranges..
+// FINAL_P indicates all builtin_unreachable calls should be removed.
+// Return true if any builtin_unreachables/globals eliminated/updated.
+
+bool
+remove_unreachable::remove_and_update_globals (bool final_p)
+{
+  if (m_list.length () == 0)
+return false;
+
+  bool change = false;
+  tree name;
+  unsigned i;
+  bitmap_iterator bi;
+  auto_bitmap all_exports;
+  for (i = 0; i < m_list.length (); i++)
+{
+  edge e = m_list[i];
+  gimple *s = gimple_outgoing_range_stmt_p (e->src);
+  

[COMMITTED] Allow ranger queries on exit block.

2022-11-01 Thread Andrew MacLeod via Gcc-patches
Ranger was not allowing the exit block to be queried for range_on_entry 
or exit, for no good reason.  This removes that restriction.


Interestingly, it seems that when we calculate dominance info, GCC does 
not set the dominators for the EXIT_BLOCK?  I worked around it by 
starting with a single pred of the exit block for my queries, but as a 
result it doesn't support multiple exit blocks.


For the record:

  get_immediate_dominator (CDI_DOMINATORS, EXIT_BLOCK_PTR_FOR_FN (cfun))

returns NULL.   Is this actually working as intended?  It was unexpected 
on my part.


Bootstrapped on x86_64-pc-linux-gnu with no regressions.  Pushed.

Andrew
From 592bbe3d7eb3cff656c731e84ad872719a4a9d16 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Mon, 31 Oct 2022 10:56:25 -0400
Subject: [PATCH 2/3] Allow queries on exit block.

Ranger was not allowing the exit block to be queried for range_on_entry
or exit.  This removes that restriction.

	* gimple-range-cache.cc (ranger_cache::fill_block_cache): Allow
	exit block to be specified.
	(ranger_cache::range_from_dom): If exit block is specified, use
	the immediate predecessor instead of the dominator to start.
	* gimple-range.cc (gimple_ranger::range_on_exit): Allow query
	for exit block.
---
 gcc/gimple-range-cache.cc | 16 ++--
 gcc/gimple-range.cc   |  1 -
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/gcc/gimple-range-cache.cc b/gcc/gimple-range-cache.cc
index f279371948a..89e2403acce 100644
--- a/gcc/gimple-range-cache.cc
+++ b/gcc/gimple-range-cache.cc
@@ -1193,9 +1193,8 @@ ranger_cache::fill_block_cache (tree name, basic_block bb, basic_block def_bb)
   Value_Range block_result (type);
   Value_Range undefined (type);
 
-  // At this point we shouldn't be looking at the def, entry or exit block.
-  gcc_checking_assert (bb != def_bb && bb != ENTRY_BLOCK_PTR_FOR_FN (cfun) &&
-		   bb != EXIT_BLOCK_PTR_FOR_FN (cfun));
+  // At this point we shouldn't be looking at the def, entry block.
+  gcc_checking_assert (bb != def_bb && bb != ENTRY_BLOCK_PTR_FOR_FN (cfun));
   gcc_checking_assert (m_workback.length () == 0);
 
   // If the block cache is set, then we've already visited this block.
@@ -1434,10 +1433,15 @@ ranger_cache::range_from_dom (vrange , tree name, basic_block start_bb,
   // Default value is global range.
   get_global_range (r, name);
 
+  // The dominator of EXIT_BLOCK doesn't seem to be set, so at least handle
+  // the common single exit cases.
+  if (start_bb == EXIT_BLOCK_PTR_FOR_FN (cfun) && single_pred_p (start_bb))
+bb = single_pred_edge (start_bb)->src;
+  else
+bb = get_immediate_dominator (CDI_DOMINATORS, start_bb);
+
   // Search until a value is found, pushing blocks which may need calculating.
-  for (bb = get_immediate_dominator (CDI_DOMINATORS, start_bb);
-   bb;
-   prev_bb = bb, bb = get_immediate_dominator (CDI_DOMINATORS, bb))
+  for ( ; bb; prev_bb = bb, bb = get_immediate_dominator (CDI_DOMINATORS, bb))
 {
   // Accumulate any block exit inferred ranges.
   m_exit.maybe_adjust_range (infer, name, bb);
diff --git a/gcc/gimple-range.cc b/gcc/gimple-range.cc
index 058439733ee..110cf574454 100644
--- a/gcc/gimple-range.cc
+++ b/gcc/gimple-range.cc
@@ -167,7 +167,6 @@ void
 gimple_ranger::range_on_exit (vrange , basic_block bb, tree name)
 {
   // on-exit from the exit block?
-  gcc_checking_assert (bb != EXIT_BLOCK_PTR_FOR_FN (cfun));
   gcc_checking_assert (gimple_range_ssa_p (name));
 
   unsigned idx;
-- 
2.37.3



[COMMITTED] Irange::intersect with nonzero bits can indicate change incorrectly.

2022-11-01 Thread Andrew MacLeod via Gcc-patches
irange::intersect returns true if the intersection operation changes the 
value. If both ranges had nonzero bits set, intersect_nonzero_bits was 
not checking to see if the operation actually changed the bits or not; 
it changed the mask and returned true.


Bootstrapped on x86_64-pc-linux-gnu with no regressions.  Pushed.

Andrew
From 7cc2824e39440dd71a9d2832c51ef260bb36d8ca Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Mon, 31 Oct 2022 09:53:01 -0400
Subject: [PATCH 1/3] Intersect with nonzero bits can indicate change
 incorrectly.

	* value-range.cc (irange::intersect_nonzero_bits): If new
	non-zero mask is the same as original, flag no change.
---
 gcc/value-range.cc | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/value-range.cc b/gcc/value-range.cc
index 03b3c4b4a65..3743ec714b3 100644
--- a/gcc/value-range.cc
+++ b/gcc/value-range.cc
@@ -3017,6 +3017,10 @@ irange::intersect_nonzero_bits (const irange )
   if (mask_to_wi (m_nonzero_mask, t) != mask_to_wi (r.m_nonzero_mask, t))
 {
   wide_int nz = get_nonzero_bits () & r.get_nonzero_bits ();
+  // If the nonzero bits did not change, return false.
+  if (nz == get_nonzero_bits ())
+	return false;
+
   m_nonzero_mask = wide_int_to_tree (t, nz);
   if (set_range_from_nonzero_bits ())
 	return true;
-- 
2.37.3



[PATCH][AArch64] Cleanup move immediate code

2022-11-01 Thread Wilco Dijkstra via Gcc-patches
Hi Richard,

Here is the immediate cleanup splitoff from the previous patch:

Simplify, refactor and improve various move immediate functions.
Allow 32-bit MOVZ/N as a valid 64-bit immediate which removes special
cases in aarch64_internal_mov_immediate.  Add new constraint so the movdi
pattern only needs a single alternative for move immediate.

Passes bootstrap and regress, OK for commit?

gcc/ChangeLog:

* config/aarch64/aarch64.cc (aarch64_bitmask_imm): Use unsigned type.
(aarch64_zeroextended_move_imm): New function.
(aarch64_move_imm): Refactor, assert mode is SImode or DImode.
(aarch64_internal_mov_immediate): Assert mode is SImode or DImode.
Simplify special cases.
(aarch64_uimm12_shift): Simplify code.
(aarch64_clamp_to_uimm12_shift): Likewise.
(aarch64_movw_imm): Remove.
(aarch64_float_const_rtx_p): Pass either SImode or DImode to
aarch64_internal_mov_immediate.
(aarch64_rtx_costs): Likewise.
* config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
constraints into single 'O'.
(mov_aarch64): Likewise.
* config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
(aarch64_bitmask_imm): Likewise.
(aarch64_uimm12_shift): Likewise.
(aarch64_zeroextended_move_imm): New prototype.
* config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
limit 'N' to 64-bit only moves.

---

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 
1a71f02284137c64e7115b26e6aa00447596f105..a73bfa20acb9b92ae0475794c3f11c67d22feb97
 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
 poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
 int aarch64_get_condition_code (rtx);
 bool aarch64_address_valid_for_prefetch_p (rtx, bool);
-bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
+bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
 unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
 unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode 
mode);
@@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, 
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT,
unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
-bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
+bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
 machine_mode aarch64_sve_pred_mode (machine_mode);
@@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
 bool aarch64_sve_float_mul_immediate_p (rtx);
 bool aarch64_split_dimode_const_store (rtx, rtx);
 bool aarch64_symbolic_address_p (rtx);
-bool aarch64_uimm12_shift (HOST_WIDE_INT);
+bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
 int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
+bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
 bool aarch64_use_return_insn_p (void);
 const char *aarch64_output_casesi (rtx *);
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
5d1ab5aa42b2cda0a655d2bc69c4df19da457ab3..798363bcc449c414de5bbb4f26b8e1c64a0cf71a
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -5558,12 +5558,10 @@ aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
 
 /* Return true if VAL is a valid bitmask immediate for MODE.  */
 bool
-aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
+aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
 {
   if (mode == DImode)
-return aarch64_bitmask_imm (val_in);
-
-  unsigned HOST_WIDE_INT val = val_in;
+return aarch64_bitmask_imm (val);
 
   if (mode == SImode)
 return aarch64_bitmask_imm ((val & 0x) | (val << 32));
@@ -5602,51 +5600,60 @@ aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
 }
 
 
-/* Return true if val is an immediate that can be loaded into a
-   register by a MOVZ instruction.  */
-static bool
-aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
+/* Return true if immediate VAL can only be created by using a 32-bit
+   zero-extended move immediate, not by a 64-bit move.  */
+bool
+aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT val)
 {
-  if (GET_MODE_SIZE (mode) > 4)
-{
-  if ((val & (((HOST_WIDE_INT) 0x) << 32)) == val
-  || (val & (((HOST_WIDE_INT) 0x) << 48)) == val)
-   return 1;
-}
-  else
-{
-  /* Ignore sign extension.  */
-  val &= (HOST_WIDE_INT) 0x;
-}
-  return ((val & (((HOST_WIDE_INT) 0x) 

Re: [PATCH] libstdc++: Implement ranges::as_rvalue_view from P2446R2

2022-11-01 Thread Jonathan Wakely via Gcc-patches
On Mon, 31 Oct 2022 at 20:33, Patrick Palka via Libstdc++
 wrote:
>
> Tested on x86_64-pc-linux-gnu, does this look OK for trunk?

OK, thanks.

We should update https://gcc.gnu.org/gcc-13/changes.html#libstdcxx again soon.

>
> libstdc++-v3/ChangeLog:
>
> * include/std/ranges (as_rvalue_view): Define.
> (enable_borrowed_range): Define.
> (views::__detail::__can_as_rvalue_view): Define.
> (views::_AsRvalue, views::as_rvalue): Define.
> * testsuite/std/ranges/adaptors/as_rvalue/1.cc: New test.
> ---
>  libstdc++-v3/include/std/ranges   | 88 +++
>  .../std/ranges/adaptors/as_rvalue/1.cc| 47 ++
>  2 files changed, 135 insertions(+)
>  create mode 100644 libstdc++-v3/testsuite/std/ranges/adaptors/as_rvalue/1.cc
>
> diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
> index 959886a1a55..239b3b61d30 100644
> --- a/libstdc++-v3/include/std/ranges
> +++ b/libstdc++-v3/include/std/ranges
> @@ -8486,6 +8486,94 @@ namespace views::__adaptor
>
>  inline constexpr _CartesianProduct cartesian_product;
>}
> +
> +  template
> +requires view<_Vp>
> +  class as_rvalue_view : public view_interface>
> +  {
> +_Vp _M_base = _Vp();
> +
> +  public:
> +as_rvalue_view() requires default_initializable<_Vp> = default;
> +
> +constexpr explicit
> +as_rvalue_view(_Vp __base)
> +: _M_base(std::move(__base))
> +{ }
> +
> +constexpr _Vp
> +base() const& requires copy_constructible<_Vp> { return _M_base; }
> +
> +constexpr _Vp
> +base() && { return std::move(_M_base); }
> +
> +constexpr auto
> +begin() requires (!__detail::__simple_view<_Vp>)
> +{ return move_iterator(ranges::begin(_M_base)); }
> +
> +constexpr auto
> +begin() const requires range
> +{ return move_iterator(ranges::begin(_M_base)); }
> +
> +constexpr auto
> +end() requires (!__detail::__simple_view<_Vp>)
> +{
> +  if constexpr (common_range<_Vp>)
> +   return move_iterator(ranges::end(_M_base));
> +  else
> +   return move_sentinel(ranges::end(_M_base));
> +}
> +
> +constexpr auto
> +end() const requires range
> +{
> +  if constexpr (common_range)
> +   return move_iterator(ranges::end(_M_base));
> +  else
> +   return move_sentinel(ranges::end(_M_base));
> +}
> +
> +constexpr auto
> +size() requires sized_range<_Vp>
> +{ return ranges::size(_M_base); }
> +
> +constexpr auto
> +size() const requires sized_range
> +{ return ranges::size(_M_base); }
> +  };
> +
> +  template
> +as_rvalue_view(_Range&&) -> as_rvalue_view>;
> +
> +  template
> +inline constexpr bool enable_borrowed_range>
> +  = enable_borrowed_range<_Tp>;
> +
> +  namespace views
> +  {
> +namespace __detail
> +{
> +  template
> +   concept __can_as_rvalue_view = requires { 
> as_rvalue_view(std::declval<_Tp>()); };
> +}
> +
> +struct _AsRvalue : __adaptor::_RangeAdaptorClosure
> +{
> +  template
> +   requires __detail::__can_as_rvalue_view<_Range>
> +   constexpr auto
> +   operator() [[nodiscard]] (_Range&& __r) const
> +   {
> + if constexpr (same_as,
> +   range_reference_t<_Range>>)
> +   return views::all(std::forward<_Range>(__r));
> + else
> +   return as_rvalue_view(std::forward<_Range>(__r));
> +   }
> +};
> +
> +inline constexpr _AsRvalue as_rvalue;
> +  }
>  #endif // C++23
>  } // namespace ranges
>
> diff --git a/libstdc++-v3/testsuite/std/ranges/adaptors/as_rvalue/1.cc 
> b/libstdc++-v3/testsuite/std/ranges/adaptors/as_rvalue/1.cc
> new file mode 100644
> index 000..8ca4f50e9d2
> --- /dev/null
> +++ b/libstdc++-v3/testsuite/std/ranges/adaptors/as_rvalue/1.cc
> @@ -0,0 +1,47 @@
> +// { dg-options "-std=gnu++23" }
> +// { dg-do run { target c++23 } }
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +namespace ranges = std::ranges;
> +namespace views = std::views;
> +
> +constexpr bool
> +test01()
> +{
> +
> +  std::unique_ptr a[3] = { std::make_unique(1),
> +   std::make_unique(2),
> +   std::make_unique(3) };
> +  std::unique_ptr b[3];
> +  auto v = a | views::as_rvalue;
> +  ranges::copy(v, b);
> +  VERIFY( ranges::all_of(a, [](auto& p) { return p.get() == nullptr; }) );
> +  VERIFY( ranges::equal(b | views::transform([](auto& p) { return *p; }), 
> (int[]){1, 2, 3}) );
> +
> +  return true;
> +}
> +
> +void
> +test02()
> +{
> +  std::unique_ptr x = std::make_unique(42);
> +  std::unique_ptr y;
> +  __gnu_test::test_input_range rx(, +1);
> +  auto v = rx | views::as_rvalue;
> +  static_assert(!ranges::common_range);
> +  ranges::copy(v, );
> +  VERIFY( x.get() == nullptr );
> +  VERIFY( *y == 42 );
> +}
> +
> +int
> +main()
> +{
> +  static_assert(test01());
> +  test02();
> +}
> --
> 

Re: [PATCH] libstdc++: std::from_chars std::{,b}float16_t support

2022-11-01 Thread Jonathan Wakely via Gcc-patches
On Tue, 1 Nov 2022 at 09:36, Jakub Jelinek  wrote:
>
> Hi!
>
> On top of the
> https://gcc.gnu.org/pipermail/libstdc++/2022-October/054849.html
> https://gcc.gnu.org/pipermail/libstdc++/2022-October/054886.html
> the following patch adds std::from_chars support, similarly to the
> previous std::to_chars patch through APIs that use float instead of
> the 16-bit floating point formats as container.
> The patch uses the fast_float library and doesn't need any changes
> to it, like the previous patch it introduces wrapper classes around
> float that represent the float holding float16_t or bfloat16_t value,
> and specializes binary_format etc. from fast_float for these classes.
>
> The new test verifies exhaustively to_chars and from_chars afterward
> results in the original value (except for nans) in all the fmt cases.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK, thanks.

>
> 2022-11-01  Jakub Jelinek  
>
> * include/std/charconv (__from_chars_float16_t,
> __from_chars_bfloat16_t): Declare.
> (from_chars): Add _Float16 and __gnu_cxx::__bfloat16_t overloads.
> * config/abi/pre/gnu.ver (GLIBCXX_3.4.31): Export
> _ZSt22__from_chars_float16_tPKcS0_RfSt12chars_format and
> _ZSt23__from_chars_bfloat16_tPKcS0_RfSt12chars_format.
> * src/c++17/floating_from_chars.cc
> (fast_float::floating_type_float16_t,
> fast_float::floating_type_bfloat16_t): New classes.
> (fast_float::binary_format,
> fast_float::binary_format): New
> specializations.
> (fast_float::to_float,
> fast_float::to_float,
> fast_float::to_extended,
> fast_float::to_extended): Likewise.
> (fast_float::from_chars_16): New template function.
> (__floating_from_chars_hex): Allow instantiation with
> fast_float::floating_type_{,b}float16_t.
> (from_chars): Formatting fixes for float/double/long double overloads.
> (__from_chars_float16_t, __from_chars_bfloat16_t): New functions.
> * testsuite/20_util/to_chars/float16_c++23.cc: New test.
>
> --- libstdc++-v3/include/std/charconv.jj2022-10-28 11:15:40.113959052 
> +0200
> +++ libstdc++-v3/include/std/charconv   2022-10-28 11:28:04.172657801 +0200
> @@ -673,6 +673,32 @@ namespace __detail
>from_chars(const char* __first, const char* __last, long double& __value,
>  chars_format __fmt = chars_format::general) noexcept;
>
> +  // Library routines for 16-bit extended floating point formats
> +  // using float as interchange format.
> +  from_chars_result
> +  __from_chars_float16_t(const char* __first, const char* __last,
> +float& __value,
> +chars_format __fmt = chars_format::general) noexcept;
> +  from_chars_result
> +  __from_chars_bfloat16_t(const char* __first, const char* __last,
> + float& __value,
> + chars_format __fmt = chars_format::general) 
> noexcept;
> +
> +#if defined(__STDCPP_FLOAT16_T__) && 
> defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32) \
> +&& defined(__cpp_lib_to_chars)
> +  inline from_chars_result
> +  from_chars(const char* __first, const char* __last, _Float16& __value,
> +chars_format __fmt = chars_format::general) noexcept
> +  {
> +float __val;
> +from_chars_result __res
> +  = __from_chars_float16_t(__first, __last, __val, __fmt);
> +if (__res.ec == errc{})
> +  __value = __val;
> +return __res;
> +  }
> +#endif
> +
>  #if defined(__STDCPP_FLOAT32_T__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
>inline from_chars_result
>from_chars(const char* __first, const char* __last, _Float32& __value,
> @@ -709,6 +735,22 @@ namespace __detail
>  if (__res.ec == errc{})
>__value = __val;
>  return __res;
> +  }
> +#endif
> +
> +#if defined(__STDCPP_BFLOAT16_T__) && 
> defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32) \
> +&& defined(__cpp_lib_to_chars)
> +  inline from_chars_result
> +  from_chars(const char* __first, const char* __last,
> +__gnu_cxx::__bfloat16_t & __value,
> +chars_format __fmt = chars_format::general) noexcept
> +  {
> +float __val;
> +from_chars_result __res
> +  = __from_chars_bfloat16_t(__first, __last, __val, __fmt);
> +if (__res.ec == errc{})
> +  __value = __val;
> +return __res;
>}
>  #endif
>  #endif
> --- libstdc++-v3/config/abi/pre/gnu.ver.jj  2022-10-28 11:15:40.115959024 
> +0200
> +++ libstdc++-v3/config/abi/pre/gnu.ver 2022-10-28 16:55:55.274849390 +0200
> @@ -2448,6 +2448,8 @@ GLIBCXX_3.4.31 {
>  
> _ZNSt7__cxx1112basic_stringI[cw]St11char_traitsI[cw]ESaI[cw]EE15_M_replace_cold*;
>  _ZSt20__to_chars_float16_tPcS_fSt12chars_format;
>  _ZSt21__to_chars_bfloat16_tPcS_fSt12chars_format;
> +_ZSt22__from_chars_float16_tPKcS0_RfSt12chars_format;
> +_ZSt23__from_chars_bfloat16_tPKcS0_RfSt12chars_format;
>  } 

Re: [PATCH] libstdc++: Shortest denormal hex std::to_chars

2022-11-01 Thread Jonathan Wakely via Gcc-patches
On Tue, 1 Nov 2022 at 12:18, Jakub Jelinek  wrote:
>
> On Fri, Oct 28, 2022 at 12:52:44PM -0400, Patrick Palka wrote:
> > > The following patch on top of
> > > https://gcc.gnu.org/pipermail/libstdc++/2022-October/054849.html
> > > adds std::{,b}float16_t support for std::to_chars.
> > > When precision is specified (or for std::bfloat16_t for hex mode even if 
> > > not),
> > > I believe we can just use the std::to_chars float (when float is mode
> > > compatible with std::float32_t) overloads, both formats are proper subsets
> > > of std::float32_t.
> > > Unfortunately when precision is not specified and we are supposed to emit
> > > shortest string, the std::{,b}float16_t strings are usually much shorter.
> > > E.g. 1.e7p-14f16 shortest fixed representation is
> > > 0.0001161 and shortest scientific representation is
> > > 1.161e-04 while 1.e7p-14f32 (same number promoted to std::float32_t)
> > > 0.00011610985 and
> > > 1.1610985e-04.
> > > Similarly for 1.38p-112bf16,
> > > 0.0235
> > > 2.35e-34 vs. 1.38p-112f32
> > > 0.023472271
> > > 2.3472271e-34
> > > For std::float16_t there are differences even in the shortest hex, say:
> > > 0.01p-14 vs. 1p-22
> > > but only for denormal std::float16_t values (where all std::float16_t
> > > denormals converted to std::float32_t are normal), __FLT16_MIN__ and
> > > everything larger in absolute value than that is the same.  Unless
> > > that is a bug and we should try to discover shorter representations
> > > even for denormals...
> >
> > IIRC for hex formatting of denormals I opted to be consistent with how
> > glibc printf formats them, instead of outputting the truly shortest
> > form.
> >
> > I wouldn't be against using the float32 overloads even for shortest hex
> > formatting of float16.  The output is shorter but equivalent so it
> > shouldn't cause any problems.
>
> The following patch changes the behavior of the shortest hex denormals,
> such that they are printed like normals (so for has_implicit_leading_bit
> with 1p-149 instead of 0.02p-126 etc., otherwise (Intel extended)
> with the leading digit before dot being [89abcdef]).  I think for all the
> supported format it is never longer, it can be equal length e.g. for
> 0.fep-126 vs. 1.fcp-127 but fortunately no largest subnormal
> in any format has the unbiased exponent like -9, -99, -999, -9999 because
> then it would be longer and often it is shorter, sometimes much shorter.
>
> For the cases with precision it keeps the handling as is.
>
> While for !has_implicit_leading_bit we for normals or with this patch
> even denormals have really shortest representation, for other formats
> we sometimes do not, but this patch doesn't deal with that (we
> always use 1.NNN while we could use 1.NNN up to f.NNN and by that shortening
> by the last hexit if the last hexit doesn't have least significant bit set
> and unbiased exponent is not -9, -99, -999 or -9999.
>
> Tested on x86_64-linux (on top of the 3 to/from_chars {,b}float16_t
> patches).

This looks good to me. Please give Patrick a chance to comment, but
it's approved for trunk unless he objects. Thanks!


>
> 2022-11-01  Jakub Jelinek  
>
> * src/c++17/floating_to_chars.cc (__floating_to_chars_hex): Drop const
> from unbiased_exponent.  Canonicalize denormals such that they have
> the leading bit set by shifting effective mantissa up and decreasing
> unbiased_exponent.
> (__floating_to_chars_shortest): Don't instantiate
> __floating_to_chars_hex for float16_t either and use float instead.
> * testsuite/20_util/to_chars/float.cc (float_to_chars_test_cases):
> Adjust testcases for shortest hex denormals.
> * testsuite/20_util/to_chars/double.cc (double_to_chars_test_cases):
> Likewise.
>
> --- libstdc++-v3/src/c++17/floating_to_chars.cc.jj  2022-10-31 
> 22:20:35.881121902 +0100
> +++ libstdc++-v3/src/c++17/floating_to_chars.cc 2022-11-01 12:16:14.352652455 
> +0100
> @@ -844,9 +844,9 @@ template
>  const bool is_normal_number = (biased_exponent != 0);
>
>  // Calculate the unbiased exponent.
> -const int32_t unbiased_exponent = (is_normal_number
> -  ? biased_exponent - exponent_bias
> -  : 1 - exponent_bias);
> +int32_t unbiased_exponent = (is_normal_number
> +? biased_exponent - exponent_bias
> +: 1 - exponent_bias);
>
>  // Shift the mantissa so that its bitwidth is a multiple of 4.
>  constexpr unsigned rounded_mantissa_bits = (mantissa_bits + 3) / 4 * 4;
> @@ -863,6 +863,16 @@ template
>   __glibcxx_assert(effective_mantissa & (mantissa_t{1} << 
> (mantissa_bits
>- 1u)));
>}
> +else if (!precision.has_value() && effective_mantissa)
> +  {
> + 

Re: [PATCH] libstdc++: std::to_chars std::{,b}float16_t support

2022-11-01 Thread Jonathan Wakely via Gcc-patches
On Thu, 27 Oct 2022 at 09:00, Jakub Jelinek  wrote:
>
> Hi!
>
> The following patch on top of
> https://gcc.gnu.org/pipermail/libstdc++/2022-October/054849.html
> adds std::{,b}float16_t support for std::to_chars.
> When precision is specified (or for std::bfloat16_t for hex mode even if not),
> I believe we can just use the std::to_chars float (when float is mode
> compatible with std::float32_t) overloads, both formats are proper subsets
> of std::float32_t.
> Unfortunately when precision is not specified and we are supposed to emit
> shortest string, the std::{,b}float16_t strings are usually much shorter.
> E.g. 1.e7p-14f16 shortest fixed representation is
> 0.0001161 and shortest scientific representation is
> 1.161e-04 while 1.e7p-14f32 (same number promoted to std::float32_t)
> 0.00011610985 and
> 1.1610985e-04.
> Similarly for 1.38p-112bf16,
> 0.0235
> 2.35e-34 vs. 1.38p-112f32
> 0.023472271
> 2.3472271e-34
> For std::float16_t there are differences even in the shortest hex, say:
> 0.01p-14 vs. 1p-22
> but only for denormal std::float16_t values (where all std::float16_t
> denormals converted to std::float32_t are normal), __FLT16_MIN__ and
> everything larger in absolute value than that is the same.  Unless
> that is a bug and we should try to discover shorter representations
> even for denormals...
> std::bfloat16_t has the same exponent range as std::float32_t, so all
> std::bfloat16_t denormals are also std::float32_t denormals and thus
> the shortest hex representations are the same.
>
> As documented, ryu can handle arbitrary IEEE like floating point formats
> (probably not wider than IEEE quad) using the generic_128 handling, but
> ryu is hidden in libstdc++.so.  As only few architectures support
> std::float16_t right now and some of them have special ISA requirements
> for those (e.g. on i?86 one needs -msse2) and std::bfloat16_t is right
> now supported only on x86 (again with -msse2), perhaps with aarch64/arm
> coming next if ARM is interested, but I think it is possible that more
> will be added later, instead of exporting APIs from the library to handle
> directly the std::{,b}float16_t overloads this patch instead exports
> functions which take a float which is a superset of those and expects
> the inline overloads to promote the 16-bit formats to 32-bit, then inside
> of the library it ensures they are printed right.
> With the added [[gnu::cold]] attribute because I think most users
> will primarily use these formats as storage formats and perform arithmetics
> in the excess precision for them and print also as std::float32_t the
> added support doesn't seem to be too large, on x86_64:
> readelf -Ws libstdc++.so.6.0.31 | grep float16_t
>912: 000ae824   950 FUNCGLOBAL DEFAULT   13 
> _ZSt21__to_chars_bfloat16_tPcS_fSt12chars_format@@GLIBCXX_3.4.31
>   5767: 000ae4a1   899 FUNCGLOBAL DEFAULT   13 
> _ZSt20__to_chars_float16_tPcS_fSt12chars_format@@GLIBCXX_3.4.31
>842: 0016d430   106 FUNCLOCAL  DEFAULT   13 
> _ZN12_GLOBAL__N_113get_ieee_reprINS_23floating_type_float16_tEEENS_6ieee_tIT_EES3_
>865: 00170980  1613 FUNCLOCAL  DEFAULT   13 
> _ZSt23__floating_to_chars_hexIN12_GLOBAL__N_123floating_type_float16_tEESt15to_chars_resultPcS3_T_St8optionalIiE.constprop.0.isra.0
>   7205: 000ae824   950 FUNCGLOBAL DEFAULT   13 
> _ZSt21__to_chars_bfloat16_tPcS_fSt12chars_format
>   7985: 000ae4a1   899 FUNCGLOBAL DEFAULT   13 
> _ZSt20__to_chars_float16_tPcS_fSt12chars_format
> so 3568 code bytes together or so.
>
> Tested with the attached test (which doesn't prove the shortest
> representation, just prints std::{,b}float16_t and std::float32_t
> shortest strings side by side, then tries to verify it can be
> emitted even into the exact sized range and can't be into range
> one smaller than that and tries to read what is printed
> back using from_chars float32_t overload (so there could be
> double rounding, but apparently there is none for the shortest strings).
> The only differences printed are for NaNs, where sNaNs are canonicalized
> to canonical qNaNs and as to_chars doesn't print NaN mantissa, even qNaNs
> other than the canonical one are read back just as the canonical NaN.
>
> Also attaching what Patrick wrote to generate the pow10_adjustment_tab,
> for std::float16_t only 1.0, 10.0, 100.0, 1000.0 and 1.0 are powers
> of 10 in the range because __FLT16_MAX__ is 65504.0, and all of the above
> are exactly representable in std::float16_t, so we want to use 0 in
> pow10_adjustment_tab.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>

Unless I misunderstood something in Patrick's review, this is good and
can be incrementally improved.

OK for trunk, thanks.



Re: [PATCH] [X86_64]: Enable support for next generation AMD Zen4 CPU

2022-11-01 Thread Alexander Monakov

On Mon, 31 Oct 2022, Jan Hubička wrote:

> Hello,
> thanks for checking the performance.  The patch is OK.

Thanks, pushed the attached patch, and working on a corresponding change for
floating-point divisions.

Alexander

From 1962a8b22d3d3fb5b6bb5598295a4571daf8876f Mon Sep 17 00:00:00 2001
From: Alexander Monakov 
Date: Mon, 31 Oct 2022 17:35:57 +0300
Subject: [PATCH] i386: correct integer division modeling in znver.md

In znver.md, division instructions have descriptions like

(define_insn_reservation "znver1_idiv_DI" 41
(and (eq_attr "cpu" "znver1,znver2")
 (and (eq_attr "type" "idiv")
  (and (eq_attr "mode" "DI")
   (eq_attr "memory" "none"
"znver1-double,znver1-ieu2*41")

which says that DImode idiv has latency 41 (which is correct) and that
it occupies 2nd integer execution unit for 41 consecutive cycles, but
that is not correct:

1) the division instruction is partially pipelined, and has throughput
   1/14, not 1/41;

2) for the most part it occupies a separate division unit, not the
   general arithmetic unit.

Evidently, interaction of such 41-cycle paths with the rest of
reservations causes a combinatorial explosion in the automaton.

Fix this by modeling the integer division unit properly, and correcting
reservations to use the measured reciprocal throughput of those
instructions (available from uops.info). A similar correction for
floating-point divisions is left for a followup patch.

Top 5 znver table sizes, before:

68692 r znver1_ieu_check
68692 r znver1_ieu_transitions
99792 r znver1_ieu_min_issue_delay
428108 r znver1_fp_min_issue_delay
856216 r znver1_fp_transitions

After:

1454 r znver1_ieu_translate
1454 r znver1_translate
2304 r znver1_ieu_transitions
428108 r znver1_fp_min_issue_delay
856216 r znver1_fp_transitions

gcc/ChangeLog:

PR target/87832
* config/i386/znver.md (znver1_idiv): New automaton.
(znver1-idiv): New unit.
(znver1_idiv_DI): Correct unit and cycles in the reservation.
(znver1_idiv_SI): Ditto.
(znver1_idiv_HI): Ditto.
(znver1_idiv_QI): Ditto.
(znver1_idiv_mem_DI): Ditto.
(znver1_idiv_mem_SI): Ditto.
(znver1_idiv_mem_HI): Ditto.
(znver1_idiv_mem_QI): Ditto.
(znver3_idiv_DI): Ditto.
(znver3_idiv_SI): Ditto.
(znver3_idiv_HI): Ditto.
(znver3_idiv_QI): Ditto.
(znver3_idiv_mem_DI): Ditto.
(znver3_idiv_mem_SI): Ditto.
(znver3_idiv_mem_HI): Ditto.
(znver3_idiv_mem_QI): Ditto.
---
 gcc/config/i386/znver.md | 39 +--
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/gcc/config/i386/znver.md b/gcc/config/i386/znver.md
index 9c25b4e27..4aa098fd8 100644
--- a/gcc/config/i386/znver.md
+++ b/gcc/config/i386/znver.md
@@ -23,8 +23,8 @@ (define_attr "znver1_decode" "direct,vector,double"
 
 ;; AMD znver1, znver2 and znver3 Scheduling
 ;; Modeling automatons for zen decoders, integer execution pipes,
-;; AGU pipes and floating point execution units.
-(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu")
+;; SIMD/FP domain, AGU pipes, and dividers.
+(define_automaton "znver1, znver1_ieu, znver1_fp, znver1_agu, znver1_idiv")
 
 ;; Decoders unit has 4 decoders and all of them can decode fast path
 ;; and vector type instructions.
@@ -93,6 +93,9 @@ (define_reservation "znver2-fvector" "znver1-fp0+znver1-fp1
  +znver1-fp2+znver1-fp3
  +znver1-agu0+znver1-agu1+znver2-agu2")
 
+;; Dividers
+(define_cpu_unit "znver1-idiv" "znver1_idiv")
+
 ;; Call instruction
 (define_insn_reservation "znver1_call" 1
 (and (eq_attr "cpu" "znver1")
@@ -176,28 +179,28 @@ (define_insn_reservation "znver1_idiv_DI" 41
  (and (eq_attr "type" "idiv")
   (and (eq_attr "mode" "DI")
(eq_attr "memory" "none"
-"znver1-double,znver1-ieu2*41")
+"znver1-double,znver1-idiv*14")
 
 (define_insn_reservation "znver1_idiv_SI" 25
 (and (eq_attr "cpu" "znver1,znver2")
  (and (eq_attr "type" "idiv")
   (and (eq_attr "mode" "SI")
(eq_attr "memory" "none"
-"znver1-double,znver1-ieu2*25")
+"znver1-double,znver1-idiv*14")
 
 (define_insn_reservation "znver1_idiv_HI" 17
 (and (eq_attr "cpu" "znver1,znver2")
  (and (eq_attr "type" "idiv")
   (and (eq_attr "mode" "HI")
(eq_attr "memory" "none"
-

[PATCH] libstdc++: Shortest denormal hex std::to_chars

2022-11-01 Thread Jakub Jelinek via Gcc-patches
On Fri, Oct 28, 2022 at 12:52:44PM -0400, Patrick Palka wrote:
> > The following patch on top of
> > https://gcc.gnu.org/pipermail/libstdc++/2022-October/054849.html
> > adds std::{,b}float16_t support for std::to_chars.
> > When precision is specified (or for std::bfloat16_t for hex mode even if 
> > not),
> > I believe we can just use the std::to_chars float (when float is mode
> > compatible with std::float32_t) overloads, both formats are proper subsets
> > of std::float32_t.
> > Unfortunately when precision is not specified and we are supposed to emit
> > shortest string, the std::{,b}float16_t strings are usually much shorter.
> > E.g. 1.e7p-14f16 shortest fixed representation is
> > 0.0001161 and shortest scientific representation is
> > 1.161e-04 while 1.e7p-14f32 (same number promoted to std::float32_t)
> > 0.00011610985 and
> > 1.1610985e-04.
> > Similarly for 1.38p-112bf16,
> > 0.0235
> > 2.35e-34 vs. 1.38p-112f32
> > 0.023472271
> > 2.3472271e-34
> > For std::float16_t there are differences even in the shortest hex, say:
> > 0.01p-14 vs. 1p-22
> > but only for denormal std::float16_t values (where all std::float16_t
> > denormals converted to std::float32_t are normal), __FLT16_MIN__ and
> > everything larger in absolute value than that is the same.  Unless
> > that is a bug and we should try to discover shorter representations
> > even for denormals...
> 
> IIRC for hex formatting of denormals I opted to be consistent with how
> glibc printf formats them, instead of outputting the truly shortest
> form.
> 
> I wouldn't be against using the float32 overloads even for shortest hex
> formatting of float16.  The output is shorter but equivalent so it
> shouldn't cause any problems.

The following patch changes the behavior of the shortest hex denormals,
such that they are printed like normals (so for has_implicit_leading_bit
with 1p-149 instead of 0.02p-126 etc., otherwise (Intel extended)
with the leading digit before dot being [89abcdef]).  I think for all the
supported format it is never longer, it can be equal length e.g. for
0.fep-126 vs. 1.fcp-127 but fortunately no largest subnormal
in any format has the unbiased exponent like -9, -99, -999, -9999 because
then it would be longer and often it is shorter, sometimes much shorter.

For the cases with precision it keeps the handling as is.

While for !has_implicit_leading_bit we for normals or with this patch
even denormals have really shortest representation, for other formats
we sometimes do not, but this patch doesn't deal with that (we
always use 1.NNN while we could use 1.NNN up to f.NNN and by that shortening
by the last hexit if the last hexit doesn't have least significant bit set
and unbiased exponent is not -9, -99, -999 or -9999.

Tested on x86_64-linux (on top of the 3 to/from_chars {,b}float16_t
patches).

2022-11-01  Jakub Jelinek  

* src/c++17/floating_to_chars.cc (__floating_to_chars_hex): Drop const
from unbiased_exponent.  Canonicalize denormals such that they have
the leading bit set by shifting effective mantissa up and decreasing
unbiased_exponent.
(__floating_to_chars_shortest): Don't instantiate
__floating_to_chars_hex for float16_t either and use float instead.
* testsuite/20_util/to_chars/float.cc (float_to_chars_test_cases):
Adjust testcases for shortest hex denormals.
* testsuite/20_util/to_chars/double.cc (double_to_chars_test_cases):
Likewise.

--- libstdc++-v3/src/c++17/floating_to_chars.cc.jj  2022-10-31 
22:20:35.881121902 +0100
+++ libstdc++-v3/src/c++17/floating_to_chars.cc 2022-11-01 12:16:14.352652455 
+0100
@@ -844,9 +844,9 @@ template
 const bool is_normal_number = (biased_exponent != 0);
 
 // Calculate the unbiased exponent.
-const int32_t unbiased_exponent = (is_normal_number
-  ? biased_exponent - exponent_bias
-  : 1 - exponent_bias);
+int32_t unbiased_exponent = (is_normal_number
+? biased_exponent - exponent_bias
+: 1 - exponent_bias);
 
 // Shift the mantissa so that its bitwidth is a multiple of 4.
 constexpr unsigned rounded_mantissa_bits = (mantissa_bits + 3) / 4 * 4;
@@ -863,6 +863,16 @@ template
  __glibcxx_assert(effective_mantissa & (mantissa_t{1} << (mantissa_bits
   - 1u)));
   }
+else if (!precision.has_value() && effective_mantissa)
+  {
+   // 1.8p-23 is shorter than 0.00cp-14, so if precision is
+   // omitted, try to canonicalize denormals such that they
+   // have the leading bit set.
+   int width = __bit_width(effective_mantissa);
+   int shift = rounded_mantissa_bits - width + has_implicit_leading_bit;
+   unbiased_exponent -= shift;
+   effective_mantissa 

Re: [PATCH] libstdc++-v3: Some std::*float*_t charconv and i/ostream overloads

2022-11-01 Thread Jonathan Wakely via Gcc-patches
On Wed, 19 Oct 2022 at 13:59, Jakub Jelinek  wrote:
>
> Hi!
>
> The following patch adds the easy part of ,  and
>  changes for extended floats.
> In particular, for the first one only overloads where the _Float* has
> the same format as float/double/long double and for the latter two
> everything but the _GLIBCXX_HAVE_FLOAT128_MATH case.
> For charconv, I'm not really familiar with it, I'm pretty sure
> we need new libstdc++.so.6 side implementation of from_chars for
> {,b}float16_t and for to_chars not really sure but for unspecified precision
> if it should emit minimum characters that to_chars then can unambiguously
> parse, I think it is less than in the float case.  For float128_t
> {to,from}_chars I think we even have it on the library side already, just
> ifdefed for powerpc64le only.
> For i/o stream operator<>, not sure what is better, if not providing
> anything at all, or doing what we in the end do if user doesn't override
> the virtual functions, or use {to,from}_chars under the hood, something
> else?
> Besides this, the patch adds some further missed
> // { dg-options "-std=gnu++2b" }
> spots, I've also noticed I got the formatting wrong in some testcases
> by not using spaces around VERIFY conditions and elsewhere by having
> space before ( for calls.
> The testsuite coverage is limited, I've added test for from_chars because
> it was easy to port, but not really sure what to do about to_chars, it has
> for float/double huge testcases which would be excessive to repeat.
> And for i/ostream not really sure what exactly is worth testing.
>
> Tested on x86_64-linux with --target_board=unix/-std=gnu++23, ok for trunk?
>

OK, thanks!



[committed] wwwdocs: readings: Switch sourceforge.net sub-sites to https

2022-11-01 Thread Gerald Pfeifer
Pushed.

Gerald
---
 htdocs/readings.html | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/htdocs/readings.html b/htdocs/readings.html
index 5622bcc3..df89bc9c 100644
--- a/htdocs/readings.html
+++ b/htdocs/readings.html
@@ -34,7 +34,7 @@
   http://ftp.axis.se/pub/users/hp/pgccfd/;>Porting GCC for
   Dunces by Hans-Peter Nilsson mailto:hans-peter.nils...@axis.com;>hans-peter.nils...@axis.com.
 -->
-  http://cobolforgcc.sourceforge.net/cobol_toc.html;>Using,
+  https://cobolforgcc.sourceforge.net/cobol_toc.html;>Using,
   Maintaining and Enhancing COBOL for the GNU Compiler Collection (GCC)
   by Joachim Nadler and Tim Josling
   mailto:t...@melbpc.org.au;>t...@melbpc.org.au.
@@ -440,12 +440,12 @@ names.
 therefore makes it possible to stress the compiler error handling.
   
   
-http://flibs.sourceforge.net;>Checking
+https://flibs.sourceforge.net;>Checking
 properties of the compiler and the run-time environment by
 Arjen Markus (source provided).
   
   
-http://gdbf95.sourceforge.net/;>gdbf95 testsuite.
+https://gdbf95.sourceforge.net;>gdbf95 testsuite.
   
   
 Tests of run-time checking capabilities
-- 
2.38.0


[committed] wwwdocs: gcc-4.4: Switch www.open-std.org to https

2022-11-01 Thread Gerald Pfeifer
Pushed.

Gerald
---
 htdocs/gcc-4.4/changes.html  |   2 +-
 htdocs/gcc-4.4/cxx0x_status.html | 116 +++
 2 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/htdocs/gcc-4.4/changes.html b/htdocs/gcc-4.4/changes.html
index 748787e7..40a6f0c8 100644
--- a/htdocs/gcc-4.4/changes.html
+++ b/htdocs/gcc-4.4/changes.html
@@ -305,7 +305,7 @@
 is now supported for the C, C++, and Fortran compilers.
 
 New character data types, per http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1040.pdf;>
+href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n1040.pdf;>
TR 19769: New character types in C, are now supported for the C 
   compiler in -std=gnu99 mode, as __CHAR16_TYPE__
   and __CHAR32_TYPE__, and for the C++ compiler in 
diff --git a/htdocs/gcc-4.4/cxx0x_status.html b/htdocs/gcc-4.4/cxx0x_status.html
index 88184913..1fe57fc9 100644
--- a/htdocs/gcc-4.4/cxx0x_status.html
+++ b/htdocs/gcc-4.4/cxx0x_status.html
@@ -18,7 +18,7 @@
 GCC's C++0x mode tracks the C++0x working paper drafts produced by
 the ISO C++ committee, available on the ISO C++ committee's web site
 at http://www.open-std.org/jtc1/sc22/wg21/;>http://www.open-std.org/jtc1/sc22/wg21/.
 Since
+href="https://www.open-std.org/jtc1/sc22/wg21/;>https://www.open-std.org/jtc1/sc22/wg21/.
 Since
 this standard is still being extended and modified, the feature set
 provided by the experimental C++0x mode may vary greatly from one GCC
 version to another. No attempts will be made to preserve backward
@@ -40,217 +40,217 @@ page.
 
 
   Rvalue references
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2006/n2118.html;>N2118
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2006/n2118.html;>N2118
Yes
 
 
   Rvalue references for *this
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2439.htm;>N2439
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2439.htm;>N2439
   No
 
 
   Initialization of class objects by rvalues
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2004/n1610.html;>N1610
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2004/n1610.html;>N1610
   Yes
 
 
 
   Variadic templates
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2242.pdf;>N2242
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2242.pdf;>N2242
Yes
 
 
   Extending variadic template template 
parameters
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2555.pdf;>N2555
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2555.pdf;>N2555
Yes
 
 
   Initializer lists
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2672.htm;>N2672
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2672.htm;>N2672
Yes
 
 
   Static assertions
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2004/n1720.html;>N1720
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2004/n1720.html;>N1720
Yes
 
 
   auto-typed variables
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2006/n1984.pdf;>N1984
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2006/n1984.pdf;>N1984
Yes
 
 
   Multi-declarator auto
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2004/n1737.pdf;>N1737
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2004/n1737.pdf;>N1737
Yes
 
 
   Removal of auto as a storage-class 
specifier
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2546.htm;>N2546
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2546.htm;>N2546
Yes
 
 
   New function declarator syntax
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm;>N2541
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2541.htm;>N2541
Yes
 
 
   Lambda expressions and closures
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2550.pdf;>N2550
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2550.pdf;>N2550
   No
 
 
   Constness of lambda functions
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2658.pdf;>N2658
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2658.pdf;>N2658
   No
 
 
   Declared type of an expression
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2343.pdf;>N2343
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2007/n2343.pdf;>N2343
Yes
 
 
   Right angle brackets
-  http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1757.html;>N1757
+  https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2005/n1757.html;>N1757
Yes
 
 
   Default template arguments for function templates
-  

[PATCH v3] LoongArch: Optimize immediate load.

2022-11-01 Thread Lulu Cheng
v1 -> v2:
1. Change the code format.
2. Fix bugs in the code.

v2 -> v3:
Reworked a code path whose implementation relied on undefined behavior.

Both regression tests and spec2006 passed.

The problem mentioned in the link does not move the four immediate load
instructions out of the loop. It has been optimized. Now, as in the test case,
four immediate load instructions are generated outside the loop.
(https://sourceware.org/pipermail/libc-alpha/2022-September/142202.html)




Fixed an issue where the compiler would not take four 64-bit immediate
load instructions out of the loop.

gcc/ChangeLog:

* config/loongarch/constraints.md (x): New constraint.
* config/loongarch/loongarch.cc (struct loongarch_address_info):
Adds a method to load the immediate 32 to 64 bit field.
(struct loongarch_integer_op): Define a new member curr_value,
that records the value of the number stored in the destination
register immediately after the current instruction has run.
(LARCH_MAX_INTEGER_OPS): Define this macro as 3.
(LU32I_B): Move to the loongarch.h.
(LU52I_B): Likewise.
(loongarch_build_integer): Adds a method to load the immediate
32 to 63 bits.
(loongarch_move_integer): Likewise.
(loongarch_print_operand_reloc): Modifying comment information.
* config/loongarch/loongarch.h (LU32I_B): Move from loongarch.cc.
(LU52I_B): Likewise.
(HWIT_UC_0x): New macro.
(HI32_OPERAND): New macro.
* config/loongarch/loongarch.md (load_hi32): New template.
* config/loongarch/predicates.md (const_hi32_operand): Determines
whether the value is an immediate number that has a value of only
the higher 32 bits.
(hi32_mask_operand): Immediately counts the mask of 32 to 61 bits.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/imm-load.c: New test.
---
 gcc/config/loongarch/constraints.md   |   7 +-
 gcc/config/loongarch/loongarch.cc | 105 +++---
 gcc/config/loongarch/loongarch.h  |   9 ++
 gcc/config/loongarch/loongarch.md |  34 ++
 gcc/config/loongarch/predicates.md|   8 ++
 gcc/testsuite/gcc.target/loongarch/imm-load.c |  25 +
 6 files changed, 148 insertions(+), 40 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c

diff --git a/gcc/config/loongarch/constraints.md 
b/gcc/config/loongarch/constraints.md
index 43cb7b5f0f5..1dcf09ce5eb 100644
--- a/gcc/config/loongarch/constraints.md
+++ b/gcc/config/loongarch/constraints.md
@@ -46,7 +46,7 @@
 ;; "u" "A signed 52bit constant and low 32-bit is zero (for logic 
instructions)"
 ;; "v" "A signed 64-bit constant and low 44-bit is zero (for logic 
instructions)."
 ;; "w" "Matches any valid memory."
-;; "x" <-unused
+;; "x" "A signed 64-bit constant and low 32-bit is zero (for logic 
instructions)."
 ;; "y" <-unused
 ;; "z" FCC_REGS
 ;; "A" <-unused
@@ -139,6 +139,11 @@ (define_constraint "v"
   (and (match_code "const_int")
(match_test "LU52I_OPERAND (ival)")))
 
+(define_constraint "x"
+  "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
+  (and (match_code "const_int")
+   (match_test "HI32_OPERAND (ival)")))
+
 (define_register_constraint "z" "FCC_REGS"
   "A floating-point condition code register.")
 
diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index f54c233f90c..28c05c2a193 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -139,6 +139,9 @@ struct loongarch_address_info
METHOD_LU52I:
  Load 52-63 bit of the immediate number.
 
+   METHOD_LD_HI32:
+ Load 32-63 bit of the immediate number.
+
METHOD_INSV:
  immediate like 0xfff0fxxx
*/
@@ -147,20 +150,26 @@ enum loongarch_load_imm_method
   METHOD_NORMAL,
   METHOD_LU32I,
   METHOD_LU52I,
+  METHOD_LD_HI32,
   METHOD_INSV
 };
 
 struct loongarch_integer_op
 {
   enum rtx_code code;
+  /* Current Immediate Count The immediate count of the load instruction.  */
   HOST_WIDE_INT value;
+  /* Represent the result of the immediate count of the load instruction at
+ each step.  */
+  HOST_WIDE_INT curr_value;
   enum loongarch_load_imm_method method;
 };
 
 /* The largest number of operations needed to load an integer constant.
-   The worst accepted case for 64-bit constants is LU12I.W,LU32I.D,LU52I.D,ORI
-   or LU12I.W,LU32I.D,LU52I.D,ADDI.D DECL_ASSEMBLER_NAME.  */
-#define LARCH_MAX_INTEGER_OPS 4
+   The worst accepted case for 64-bit constants is LU12I.W,
+   LOAD_HI32(LU32I.D,LU52I.D),ORI or LU12I.W,LOAD_HI32(LU32I.D,LU52I.D),
+   ADDI.D DECL_ASSEMBLER_NAME.  */
+#define LARCH_MAX_INTEGER_OPS 3
 
 /* Arrays that map GCC register numbers to debugger register numbers.  */
 int loongarch_dwarf_regno[FIRST_PSEUDO_REGISTER];
@@ -1454,9 +1463,6 

[committed] wwwdocs: codingconventions: Properly link to flake8

2022-11-01 Thread Gerald Pfeifer
Nearly all hrefs= on our site are https:// or http://, and that's the case 
pretty much across the web. Still the protocol needs to be provided for 
links to work.

Pushed.

Gerald


Web links need to be prefixed by https:// or http://.
---
 htdocs/codingconventions.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htdocs/codingconventions.html b/htdocs/codingconventions.html
index f5e22983..348f1e1d 100644
--- a/htdocs/codingconventions.html
+++ b/htdocs/codingconventions.html
@@ -1487,7 +1487,7 @@ Definitions within the body of a namespace are not 
indented.
 Python Language Conventions
 
Python scripts should follow https://peps.python.org/pep-0008/;>PEP 8 — Style Guide for Python 
Code
-which can be verified by the flake8 tool.
+which can be verified by the https://flake8.pycqa.org;>flake8 
tool.
 We recommend using the following flake8 plug-ins:
 
 
-- 
2.38.0


[committed] wwwdocs: *: Remove extraneous whitespaces around headings

2022-11-01 Thread Gerald Pfeifer
Pushed.

Gerald
---
 htdocs/faq.html| 2 +-
 htdocs/gcc-2.95/regress.html   | 2 +-
 htdocs/gcc-3.1/gcj-status.html | 6 +++---
 htdocs/gcc-3.3/gcj-status.html | 2 +-
 htdocs/gcc-8/changes.html  | 2 +-
 htdocs/gcc-9/changes.html  | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/htdocs/faq.html b/htdocs/faq.html
index 183e7bec..b09e3920 100644
--- a/htdocs/faq.html
+++ b/htdocs/faq.html
@@ -272,7 +272,7 @@ the testsuite?
 
 
 
- How can I run the testsuite with multiple options? 

+How can I run the testsuite with multiple options?
 
 If you invoke runtest directly, you can use the
 --target_board option, e.g:
diff --git a/htdocs/gcc-2.95/regress.html b/htdocs/gcc-2.95/regress.html
index 5a02b1f7..bbe47396 100644
--- a/htdocs/gcc-2.95/regress.html
+++ b/htdocs/gcc-2.95/regress.html
@@ -334,7 +334,7 @@ N/A
 
 
 
- Package/Feature Testing 
+Package/Feature Testing
 
 
 
diff --git a/htdocs/gcc-3.1/gcj-status.html b/htdocs/gcc-3.1/gcj-status.html
index 49962e04..2ac14b89 100644
--- a/htdocs/gcc-3.1/gcj-status.html
+++ b/htdocs/gcc-3.1/gcj-status.html
@@ -11,12 +11,12 @@
 
 GCJ 3.1 status
 
- Bugs 
+Bugs
 
 We are tracking all bugs in GNATS.  Any bugs that we want to fix
 for the 3.1 release will be marked high priority.
 
- Platforms 
+Platforms
 
 
 
@@ -107,7 +107,7 @@ for the 3.1 release will be marked high 
priority.
 
 
 
- Packages 
+Packages
 
 
 
diff --git a/htdocs/gcc-3.3/gcj-status.html b/htdocs/gcc-3.3/gcj-status.html
index 47e94260..7ef0bc25 100644
--- a/htdocs/gcc-3.3/gcj-status.html
+++ b/htdocs/gcc-3.3/gcj-status.html
@@ -18,7 +18,7 @@ for the 3.3 release will be marked high priority (https://gcc.gnu.org/bugzilla/buglist.cgi?short_desc_type=allwordssubstcomponent=javalong_desc_type=substringlong_desc=bug_file_loc_type=allwordssubstrbug_file_loc=gcchost_type=allwordssubstrgcchost=gcctarget_type=allwordssubstrgcctarget=gccbuild_type=allwordssubstrgccbuild=keywords_type=allwordskeywords=bug_status=UNCONFIRMEDbug_status=NEWbug_status=ASSIGNEDbug_status=SUSPENDEDbug_status=WAITINGbug_status=REOPENEDbug_severity=criticalemailassigned_to1=1emailtype1=substringemail1=emailassigned_to2=1emailreporter2=1emailcc2=1emailtype2=substringbugidtype=includecmdtype=doitorder=Bug+Number;>see
 list).
 
- Packages 
+Packages
 
 
 
diff --git a/htdocs/gcc-8/changes.html b/htdocs/gcc-8/changes.html
index c5521f72..73ccd07d 100644
--- a/htdocs/gcc-8/changes.html
+++ b/htdocs/gcc-8/changes.html
@@ -1403,7 +1403,7 @@ known to be fixed in the 8.5 release. This list might not 
be
 complete (that is, it is possible that some PRs that have been fixed
 are not listed here).
 
- Target Specific Changes
+Target Specific Changes
 
 AArch64
   
diff --git a/htdocs/gcc-9/changes.html b/htdocs/gcc-9/changes.html
index 27ac675b..ffaf4824 100644
--- a/htdocs/gcc-9/changes.html
+++ b/htdocs/gcc-9/changes.html
@@ -1140,7 +1140,7 @@ known to be fixed in the 9.4 release. This list might not 
be
 complete (that is, it is possible that some PRs that have been fixed
 are not listed here).
 
- Target Specific Changes
+Target Specific Changes
 
 AArch64
   
-- 
2.38.0


c++: Reorganize per-scope lambda discriminators

2022-11-01 Thread Nathan Sidwell via Gcc-patches


We currently use a per-extra-scope counter to discriminate multiple
lambdas in a particular such scope.  This is not ABI compliant.  This
patch merely refactors the existing code to make it easier to drop in
a conformant mangling -- there's no functional change here.  I rename
the LAMBDA_EXPR_DISCIMINATOR to LAMBDA_EXPR_SCOPE_ONLY_DISCRIMINATOR,
foreshadowing that there'll be a new discriminator.  To provide ABI
warnings we'll need to calculate both, and that requires some
repacking of the lambda_expr's fields.  Finally, although we end up
calling the discriminator setter and the scope recorder (nearly)
always consecutively, it's clearer to handle it as two separate
operations.  That also allows us to remove the instantiation
special-case for a null extra-scope.

nathan


--
Nathan Sidwell

From 0122faae30fe1ad1dfa8c69f3d3f0428b996b600 Mon Sep 17 00:00:00 2001
From: Nathan Sidwell 
Date: Mon, 31 Oct 2022 06:11:28 -0400
Subject: [PATCH] c++: Reorganize per-scope lambda discriminators

We currently use a per-extra-scope counter to discriminate multiple
lambdas in a particular such scope.  This is not ABI compliant.  This
patch merely refactors the existing code to make it easier to drop in
a conformant mangling -- there's no functional change here.  I rename
the LAMBDA_EXPR_DISCIMINATOR to LAMBDA_EXPR_SCOPE_ONLY_DISCRIMINATOR,
foreshadowing that there'll be a new discriminator.  To provide ABI
warnings we'll need to calculate both, and that requires some
repacking of the lambda_expr's fields.  Finally, although we end up
calling the discriminator setter and the scope recorder (nearly)
always consecutively, it's clearer to handle it as two separate
operations.  That also allows us to remove the instantiation
special-case for a null extra-scope.

	gcc/cp/
	* cp-tree.h (LAMBDA_EXPR_DISCRIMINATOR): Rename to ...
	(LAMBDA_EXPR_SCOPE_ONLY_DISCRIMINATOR): ... here.
	(struct tree_lambda_expr): Make default_capture_mode &
	discriminator_scope bitfields.
	(record_null_lambda_scope) Delete.
	(record_lambda_scope_discriminator): Declare.
	* lambda.cc (struct lambda_discriminator): New struct.
	(lambda_scope, lambda_scope_stack): Adjust types.
	(lambda_count): Delete.
	(struct tree_int): Delete.
	(start_lambda_scope, finish_lambda_scope): Adjust.
	(record_lambda_scope): Only record the scope.
	(record_lambda_scope_discriminator): New.
	* mangle.cc (write_closure_type_name): Adjust.
	* module.cc (trees_out::core_vals): Likewise,
	(trees_in::core_vals): Likewise.
	* parser.cc (cp_parser_lambda_expression): Call
	record_lambda_scope_discriminator.
	* pt.cc (tsubst_lambda_expr): Adjust record_lambda_scope caling.  Call
	record_lambda_scope_discriminator. Commonize control flow on tsubsting
	the operator function.
	libcc1/
	* libcp1plugin.cc (plugin_start_closure): Adjust.
	gcc/testsuite/
	* g++.dg/abi/lambda-sig1-17.C: New.
	* g++.dg/abi/lambda-sig1.h: New.
	* g++.dg/cpp1y/lambda-mangle-1.C: Extracted to ...
	* g++.dg/cpp1y/lambda-mangle-1.h: ... here.
	* g++.dg/cpp1y/lambda-mangle-1-11.C: New
	* g++.dg/cpp1y/lambda-mangle-1-17.C
---
 gcc/cp/cp-tree.h  |  17 +--
 gcc/cp/lambda.cc  | 114 +-
 gcc/cp/mangle.cc  |   2 +-
 gcc/cp/module.cc  |   4 +-
 gcc/cp/parser.cc  |   5 +-
 gcc/cp/pt.cc  |  50 +++-
 gcc/testsuite/g++.dg/abi/lambda-sig1-17.C |  26 
 gcc/testsuite/g++.dg/abi/lambda-sig1.h|  42 +++
 .../g++.dg/cpp1y/lambda-mangle-1-11.C |  25 
 .../g++.dg/cpp1y/lambda-mangle-1-17.C |  25 
 .../{lambda-mangle-1.C => lambda-mangle-1.h}  |   3 +-
 libcc1/libcp1plugin.cc|   2 +-
 12 files changed, 208 insertions(+), 107 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/abi/lambda-sig1-17.C
 create mode 100644 gcc/testsuite/g++.dg/abi/lambda-sig1.h
 create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-mangle-1-11.C
 create mode 100644 gcc/testsuite/g++.dg/cpp1y/lambda-mangle-1-17.C
 rename gcc/testsuite/g++.dg/cpp1y/{lambda-mangle-1.C => lambda-mangle-1.h} (98%)

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 6d84514e4c0..4c0bacb91da 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -1500,9 +1500,10 @@ enum cp_lambda_default_capture_mode_type {
 #define LAMBDA_EXPR_EXTRA_SCOPE(NODE) \
   (((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->extra_scope)
 
-/* If EXTRA_SCOPE, this is the number of the lambda within that scope.  */
-#define LAMBDA_EXPR_DISCRIMINATOR(NODE) \
-  (((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->discriminator)
+/* Lambdas in the same extra scope might need a discriminating count.
+   This is a single per-scope count.  */
+#define LAMBDA_EXPR_SCOPE_ONLY_DISCRIMINATOR(NODE) \
+  (((struct tree_lambda_expr *)LAMBDA_EXPR_CHECK (NODE))->discriminator_scope)
 
 /* During parsing of the lambda, a vector of capture proxies which 

[committed] wwwdocs: codingconventions: Move two links to https

2022-11-01 Thread Gerald Pfeifer
Pushed.
Gerald

---
 htdocs/codingconventions.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/htdocs/codingconventions.html b/htdocs/codingconventions.html
index f88ef019..f5e22983 100644
--- a/htdocs/codingconventions.html
+++ b/htdocs/codingconventions.html
@@ -737,7 +737,7 @@ the latest version is at
 before going into GCC.
 
 fastjar: The master sources were at http://fastjar.sourceforge.net/;>fastjar.sourceforge.net.
+href="https://fastjar.sourceforge.net/;>fastjar.sourceforge.net.
 However, the upstream source seems to be dead, so fastjar is
 essentially maintained in the GCC source tree.
 
@@ -764,7 +764,7 @@ FSF website, or are autogenerated.  These files should not 
be changed
 without prior permission, if at all.
 
 libgcc/config/libbid: The master sources come from Intel BID library
-http://www.netlib.org/misc/intel/;>Intel BID library. 
+https://www.netlib.org/misc/intel/;>Intel BID library. 
 Bugs should be reported to
 mailto:marius.cor...@intel.com;>marius.cor...@intel.com
 and
-- 
2.38.0


[committed] wwwdocs: readings: Remove

2022-11-01 Thread Gerald Pfeifer
Google has not been using that forever and there are indications
search engines even use it as one indication for spam sites.

Pushed.
Gerald
---
 htdocs/readings.html | 1 -
 1 file changed, 1 deletion(-)

diff --git a/htdocs/readings.html b/htdocs/readings.html
index 01ccd55d..5622bcc3 100644
--- a/htdocs/readings.html
+++ b/htdocs/readings.html
@@ -8,7 +8,6 @@
 
 
 
-
 Links and Selected Readings
 https://gcc.gnu.org/gcc.css;>
 
-- 
2.38.0


[committed] wwwdocs: projects/tree-ssa: Adjust mark up

2022-11-01 Thread Gerald Pfeifer
Remove extraneous whitespace around heading and adjust level.

Pushed.
Gerald
---
 htdocs/projects/tree-ssa/tree-browser.html | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/htdocs/projects/tree-ssa/tree-browser.html 
b/htdocs/projects/tree-ssa/tree-browser.html
index 23f46e92..a6ffc46b 100644
--- a/htdocs/projects/tree-ssa/tree-browser.html
+++ b/htdocs/projects/tree-ssa/tree-browser.html
@@ -298,7 +298,8 @@ else
 TB  
 
 
- Extensions 
+
+Extensions
 
 Some extensions could be added in the near future, such as including
 commands that allows you to modify tree structure, apply optimizations
-- 
2.38.0


[PATCH] libstdc++: std::from_chars std::{,b}float16_t support

2022-11-01 Thread Jakub Jelinek via Gcc-patches
Hi!

On top of the
https://gcc.gnu.org/pipermail/libstdc++/2022-October/054849.html
https://gcc.gnu.org/pipermail/libstdc++/2022-October/054886.html
the following patch adds std::from_chars support, similarly to the
previous std::to_chars patch through APIs that use float instead of
the 16-bit floating point formats as container.
The patch uses the fast_float library and doesn't need any changes
to it, like the previous patch it introduces wrapper classes around
float that represent the float holding float16_t or bfloat16_t value,
and specializes binary_format etc. from fast_float for these classes.

The new test verifies exhaustively to_chars and from_chars afterward
results in the original value (except for nans) in all the fmt cases.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-11-01  Jakub Jelinek  

* include/std/charconv (__from_chars_float16_t,
__from_chars_bfloat16_t): Declare.
(from_chars): Add _Float16 and __gnu_cxx::__bfloat16_t overloads.
* config/abi/pre/gnu.ver (GLIBCXX_3.4.31): Export
_ZSt22__from_chars_float16_tPKcS0_RfSt12chars_format and
_ZSt23__from_chars_bfloat16_tPKcS0_RfSt12chars_format.
* src/c++17/floating_from_chars.cc
(fast_float::floating_type_float16_t,
fast_float::floating_type_bfloat16_t): New classes.
(fast_float::binary_format,
fast_float::binary_format): New
specializations.
(fast_float::to_float,
fast_float::to_float,
fast_float::to_extended,
fast_float::to_extended): Likewise.
(fast_float::from_chars_16): New template function.
(__floating_from_chars_hex): Allow instantiation with
fast_float::floating_type_{,b}float16_t.
(from_chars): Formatting fixes for float/double/long double overloads.
(__from_chars_float16_t, __from_chars_bfloat16_t): New functions.
* testsuite/20_util/to_chars/float16_c++23.cc: New test.

--- libstdc++-v3/include/std/charconv.jj2022-10-28 11:15:40.113959052 
+0200
+++ libstdc++-v3/include/std/charconv   2022-10-28 11:28:04.172657801 +0200
@@ -673,6 +673,32 @@ namespace __detail
   from_chars(const char* __first, const char* __last, long double& __value,
 chars_format __fmt = chars_format::general) noexcept;
 
+  // Library routines for 16-bit extended floating point formats
+  // using float as interchange format.
+  from_chars_result
+  __from_chars_float16_t(const char* __first, const char* __last,
+float& __value,
+chars_format __fmt = chars_format::general) noexcept;
+  from_chars_result
+  __from_chars_bfloat16_t(const char* __first, const char* __last,
+ float& __value,
+ chars_format __fmt = chars_format::general) noexcept;
+
+#if defined(__STDCPP_FLOAT16_T__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32) \
+&& defined(__cpp_lib_to_chars)
+  inline from_chars_result
+  from_chars(const char* __first, const char* __last, _Float16& __value,
+chars_format __fmt = chars_format::general) noexcept
+  {
+float __val;
+from_chars_result __res
+  = __from_chars_float16_t(__first, __last, __val, __fmt);
+if (__res.ec == errc{})
+  __value = __val;
+return __res;
+  }
+#endif
+
 #if defined(__STDCPP_FLOAT32_T__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32)
   inline from_chars_result
   from_chars(const char* __first, const char* __last, _Float32& __value,
@@ -709,6 +735,22 @@ namespace __detail
 if (__res.ec == errc{})
   __value = __val;
 return __res;
+  }
+#endif
+
+#if defined(__STDCPP_BFLOAT16_T__) && defined(_GLIBCXX_FLOAT_IS_IEEE_BINARY32) 
\
+&& defined(__cpp_lib_to_chars)
+  inline from_chars_result
+  from_chars(const char* __first, const char* __last,
+__gnu_cxx::__bfloat16_t & __value,
+chars_format __fmt = chars_format::general) noexcept
+  {
+float __val;
+from_chars_result __res
+  = __from_chars_bfloat16_t(__first, __last, __val, __fmt);
+if (__res.ec == errc{})
+  __value = __val;
+return __res;
   }
 #endif
 #endif
--- libstdc++-v3/config/abi/pre/gnu.ver.jj  2022-10-28 11:15:40.115959024 
+0200
+++ libstdc++-v3/config/abi/pre/gnu.ver 2022-10-28 16:55:55.274849390 +0200
@@ -2448,6 +2448,8 @@ GLIBCXX_3.4.31 {
 
_ZNSt7__cxx1112basic_stringI[cw]St11char_traitsI[cw]ESaI[cw]EE15_M_replace_cold*;
 _ZSt20__to_chars_float16_tPcS_fSt12chars_format;
 _ZSt21__to_chars_bfloat16_tPcS_fSt12chars_format;
+_ZSt22__from_chars_float16_tPKcS0_RfSt12chars_format;
+_ZSt23__from_chars_bfloat16_tPKcS0_RfSt12chars_format;
 } GLIBCXX_3.4.30;
 
 # Symbols in the support library (libsupc++) have their own tag.
--- libstdc++-v3/src/c++17/floating_from_chars.cc.jj2022-05-23 
21:44:49.107846783 +0200
+++ libstdc++-v3/src/c++17/floating_from_chars.cc   2022-10-31 
15:06:30.338480517 +0100
@@ -75,6 +75,272 @@ extern 

Re: Adding a new thread model to GCC

2022-11-01 Thread Eric Botcazou via Gcc-patches
> I have faced with "#error Timed lock primitives are not supported on
> Windows targets" and I'm not sure I understood the reason correctly.
> 
> as far as I understand, the definition for
> `_GTHREAD_USE_MUTEX_TIMEDLOCK` comes from libstdc++/configure as a
> result of some test.
> 
> why did I faced with this error? what should I do to avoid this?

Run autoheader + autoconf in the libstdc++-v3 source repository.

-- 
Eric Botcazou




Re: [PATCH v2] LoongArch: Optimize immediate load.

2022-11-01 Thread Lulu Cheng



在 2022/11/1 下午4:02, Xi Ruoyao 写道:

On Tue, 2022-11-01 at 14:19 +0800, Lulu Cheng wrote:

+;; Load immediate to the 32-63 bits of the source register.
+(define_insn_and_split "load_hi32"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (ior:DI
+ (and:DI (match_operand:DI 1 "register_operand" "0")
+ (match_operand 2 "hi32_mask_operand"))
+   (match_operand 3 "const_hi32_operand" "x")))]
+  "TARGET_64BIT"
+  "#"
+  ""
+  [(set (match_dup 0)
+   (ior:DI
+ (zero_extend:DI
+   (subreg:SI (match_dup 1) 0))
+ (match_dup 4)))
+   (set (match_dup 0)
+   (ior:DI
+ (and:DI (match_dup 0)
+ (match_dup 6))
+ (match_dup 5)))]
+{
+  operands[4] = GEN_INT (INTVAL (operands[3]) << 12 >> 12);

It's an undefined behavior if INTVAL (operands[3]) is negative.

I'm sorry I made the same mistake twice:-(



+  operands[5] = GEN_INT (INTVAL (operands[3]) & 0xfff0);
+  operands[6] = GEN_INT (0xf);
+}
+  [(set_attr "insn_count" "2")])




Re: [PATCH V2] [x86] Fix incorrect digit constraint

2022-11-01 Thread Hongtao Liu via Gcc-patches
On Mon, Oct 31, 2022 at 5:22 PM Uros Bizjak  wrote:
>
> On Mon, Oct 31, 2022 at 2:10 AM liuhongt  wrote:
> >
> > >You have a couple of other patterns where operand 1 is matched to
> > >produce vmovddup insn. These are *avx512f_unpcklpd512 and
> > >avx_unpcklpd256. You can also remove expander in both
> > >cases.
> >
> > Yes, changed in V2 patch.
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
> >
> >
> > Matching constraints are used in these circumstances. More precisely,
> > the two operands that match must include one input-only operand and
> > one output-only operand. Moreover, the digit must be a smaller number
> > than the number of the operand that uses it in the constraint.
> >
> > In pr107057, the 2 operands in the pattern are both input operands.
> >
> > gcc/ChangeLog:
> >
> > PR target/107057
> > * config/i386/sse.md (*vec_interleave_highv2df): Remove
> > constraint 1.
> > (*vec_interleave_lowv2df): Ditto.
> > (vec_concatv2df): Ditto.
> > (*avx512f_unpcklpd512): Ditto and renamed to ..
> > (avx512f_unpcklpd512): .. this.
> > (avx512f_movddup512): Change to define_insn.
> > (avx_movddup256): Ditto.
> > (*avx_unpcklpd256): Remove constraint 1 and renamed
> > to ..
> > (avx_unpcklpd256): .. this.
> > * config/i386/i386.cc (ix86_vec_interleave_v2df_operator_ok):
> > Disallow MEM_P (op1) && MEM_P (op2).
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/i386/pr107057.c: New test.



> OK with two small adjustments.
Changed and committed.
>


> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386.cc  |   2 +-
> >  gcc/config/i386/sse.md   | 140 +--
> >  gcc/testsuite/gcc.target/i386/pr107057.c |  19 +++
> >  3 files changed, 77 insertions(+), 84 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107057.c
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index aeea26ef4be..e3b7bea0d68 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -15652,7 +15652,7 @@ ix86_vec_interleave_v2df_operator_ok (rtx 
> > operands[3], bool high)
> >if (MEM_P (operands[0]))
> >  return rtx_equal_p (operands[0], operands[1 + high]);
> >if (MEM_P (operands[1]) && MEM_P (operands[2]))
> > -return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
> > +return false;
> >return true;
> >  }
> >
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index f4b5506703f..b7922521734 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -12170,107 +12170,88 @@ (define_expand "vec_interleave_highv2df"
> >  })
> >
> >  (define_insn "*vec_interleave_highv2df"
> > -  [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,v,v,x,v,m")
> > +  [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,v,x,v,m")
> > (vec_select:V2DF
> >   (vec_concat:V4DF
> > -   (match_operand:V2DF 1 "nonimmediate_operand" " 0,v,o,o,o,v")
> > -   (match_operand:V2DF 2 "nonimmediate_operand" " x,v,1,0,v,0"))
> > +   (match_operand:V2DF 1 "nonimmediate_operand" " 0,v,o,o,v")
> > +   (match_operand:V2DF 2 "nonimmediate_operand" " x,v,0,v,0"))
> >   (parallel [(const_int 1)
> >  (const_int 3)])))]
> >"TARGET_SSE2 && ix86_vec_interleave_v2df_operator_ok (operands, 1)"
> >"@
> > unpckhpd\t{%2, %0|%0, %2}
> > vunpckhpd\t{%2, %1, %0|%0, %1, %2}
> > -   %vmovddup\t{%H1, %0|%0, %H1}
> > movlpd\t{%H1, %0|%0, %H1}
> > vmovlpd\t{%H1, %2, %0|%0, %2, %H1}
> > %vmovhpd\t{%1, %0|%q0, %1}"
> > -  [(set_attr "isa" "noavx,avx,sse3,noavx,avx,*")
> > -   (set_attr "type" "sselog,sselog,sselog,ssemov,ssemov,ssemov")
> > +  [(set_attr "isa" "noavx,avx,noavx,avx,*")
> > +   (set_attr "type" "sselog,sselog,ssemov,ssemov,ssemov")
> > (set (attr "prefix_data16")
> > - (if_then_else (eq_attr "alternative" "3,5")
> > + (if_then_else (eq_attr "alternative" "2,4")
> >(const_string "1")
> >(const_string "*")))
> > -   (set_attr "prefix" 
> > "orig,maybe_evex,maybe_vex,orig,maybe_evex,maybe_vex")
> > -   (set_attr "mode" "V2DF,V2DF,DF,V1DF,V1DF,V1DF")])
> > +   (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex")
> > +   (set_attr "mode" "V2DF,V2DF,V1DF,V1DF,V1DF")])
> >
> > -(define_expand "avx512f_movddup512"
> > -  [(set (match_operand:V8DF 0 "register_operand")
> > +(define_insn "avx512f_movddup512"
> > +  [(set (match_operand:V8DF 0 "register_operand" "=v")
> > (vec_select:V8DF
> >   (vec_concat:V16DF
> > -   (match_operand:V8DF 1 "nonimmediate_operand")
> > +   (match_operand:V8DF 1 "memory_operand" "m")
>
> I think you should leave nonimmediate_operand here with "m" predicate.
> Reload is able to move the register to the memory, and it is
> 

Re: [PATCH v2] LoongArch: Optimize immediate load.

2022-11-01 Thread Xi Ruoyao via Gcc-patches
On Tue, 2022-11-01 at 14:19 +0800, Lulu Cheng wrote:
> +;; Load immediate to the 32-63 bits of the source register.
> +(define_insn_and_split "load_hi32"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +   (ior:DI
> + (and:DI (match_operand:DI 1 "register_operand" "0")
> + (match_operand 2 "hi32_mask_operand"))
> +   (match_operand 3 "const_hi32_operand" "x")))]
> +  "TARGET_64BIT"
> +  "#"
> +  ""
> +  [(set (match_dup 0)
> +   (ior:DI
> + (zero_extend:DI
> +   (subreg:SI (match_dup 1) 0))
> + (match_dup 4)))
> +   (set (match_dup 0)
> +   (ior:DI
> + (and:DI (match_dup 0)
> + (match_dup 6))
> + (match_dup 5)))]
> +{
> +  operands[4] = GEN_INT (INTVAL (operands[3]) << 12 >> 12);

It's an undefined behavior if INTVAL (operands[3]) is negative.

> +  operands[5] = GEN_INT (INTVAL (operands[3]) & 0xfff0);
> +  operands[6] = GEN_INT (0xf);
> +}
> +  [(set_attr "insn_count" "2")])

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


[PATCH v2] LoongArch: Optimize immediate load.

2022-11-01 Thread Lulu Cheng
v1 -> v2:
1. Change the code format.
2. Fix bugs in the code.

Both regression tests and spec2006 passed.

The problem mentioned in the link does not move the four immediate load
instructions out of the loop. It has been optimized. Now, as in the test case,
four immediate load instructions are generated outside the loop.
(https://sourceware.org/pipermail/libc-alpha/2022-September/142202.html)




Fixed an issue where the compiler would not take four 64-bit immediate
load instructions out of the loop.

gcc/ChangeLog:

* config/loongarch/constraints.md (x): New constraint.
* config/loongarch/loongarch.cc (struct loongarch_integer_op):
Define a new member curr_value, that records the value of
the number stored in the destination register immediately
after the current instruction has run.
(loongarch_build_integer): Adds a method to load the immediate
32-bit to 63-bit field.
(loongarch_move_integer): Same as above.
	* config/loongarch/loongarch.h (HWIT_UC_0x): New macro.
	(HI32_OPERAND): New macro.
	* config/loongarch/loongarch.md (load_hi32): New template.
* config/loongarch/predicates.md (const_hi32_operand): Determines
whether the value is an immediate number that has a value of only
the higher 32 bits.
(hi32_mask_operand): Immediately counts the mask of 32 to 61 bits.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/imm-load.c: New test.
---
 gcc/config/loongarch/constraints.md   |   7 +-
 gcc/config/loongarch/loongarch.cc | 102 +++---
 gcc/config/loongarch/loongarch.h  |   6 ++
 gcc/config/loongarch/loongarch.md |  31 ++
 gcc/config/loongarch/predicates.md|   8 ++
 gcc/testsuite/gcc.target/loongarch/imm-load.c |  25 +
 6 files changed, 142 insertions(+), 37 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/imm-load.c

diff --git a/gcc/config/loongarch/constraints.md 
b/gcc/config/loongarch/constraints.md
index d3addd02c0a..43b55d74c5a 100644
--- a/gcc/config/loongarch/constraints.md
+++ b/gcc/config/loongarch/constraints.md
@@ -46,7 +46,7 @@
 ;; "u" "A signed 52bit constant and low 32-bit is zero (for logic 
instructions)"
 ;; "v" "A signed 64-bit constant and low 44-bit is zero (for logic 
instructions)."
 ;; "w" "Matches any valid memory."
-;; "x" <-unused
+;; "x" "A signed 64-bit constant and low 32-bit is zero (for logic 
instructions)."
 ;; "y" <-unused
 ;; "z" FCC_REGS
 ;; "A" <-unused
@@ -139,6 +139,11 @@ (define_constraint "v"
   (and (match_code "const_int")
(match_test "LU52I_OPERAND (ival)")))
 
+(define_constraint "x"
+  "A signed 64-bit constant and low 32-bit is zero (for logic instructions)."
+  (and (match_code "const_int")
+   (match_test "HI32_OPERAND (ival)")))
+
 (define_register_constraint "z" "FCC_REGS"
   "A floating-point condition code register.")
 
diff --git a/gcc/config/loongarch/loongarch.cc 
b/gcc/config/loongarch/loongarch.cc
index 1a637431503..acde9efce40 100644
--- a/gcc/config/loongarch/loongarch.cc
+++ b/gcc/config/loongarch/loongarch.cc
@@ -140,6 +140,9 @@ struct loongarch_address_info
METHOD_LU52I:
  Load 52-63 bit of the immediate number.
 
+   METHOD_LD_HI32:
+ Load 32-63 bit of the immediate number.
+
METHOD_INSV:
  immediate like 0xfff0fxxx
*/
@@ -148,20 +151,26 @@ enum loongarch_load_imm_method
   METHOD_NORMAL,
   METHOD_LU32I,
   METHOD_LU52I,
+  METHOD_LD_HI32,
   METHOD_INSV
 };
 
 struct loongarch_integer_op
 {
   enum rtx_code code;
+  /* Current Immediate Count The immediate count of the load instruction.  */
   HOST_WIDE_INT value;
+  /* Represent the result of the immediate count of the load instruction at
+ each step.  */
+  HOST_WIDE_INT curr_value;
   enum loongarch_load_imm_method method;
 };
 
 /* The largest number of operations needed to load an integer constant.
-   The worst accepted case for 64-bit constants is LU12I.W,LU32I.D,LU52I.D,ORI
-   or LU12I.W,LU32I.D,LU52I.D,ADDI.D DECL_ASSEMBLER_NAME.  */
-#define LARCH_MAX_INTEGER_OPS 4
+   The worst accepted case for 64-bit constants is LU12I.W,
+   LOAD_HI32(LU32I.D,LU52I.D),ORI or LU12I.W,LOAD_HI32(LU32I.D,LU52I.D),
+   ADDI.D DECL_ASSEMBLER_NAME.  */
+#define LARCH_MAX_INTEGER_OPS 3
 
 /* Arrays that map GCC register numbers to debugger register numbers.  */
 int loongarch_dwarf_regno[FIRST_PSEUDO_REGISTER];
@@ -1475,24 +1484,27 @@ loongarch_build_integer (struct loongarch_integer_op 
*codes,
 {
   /* The value of the lower 32 bit be loaded with one instruction.
 lu12i.w.  */
-  codes[0].code = UNKNOWN;
-  codes[0].method = METHOD_NORMAL;
-  codes[0].value = low_part;
+  codes[cost].code = UNKNOWN;
+  codes[cost].method = METHOD_NORMAL;
+  codes[cost].value = low_part;
+  codes[cost].curr_value = low_part;