Re: [PATCH]middle-end Fix trapping access in test PR101750

2021-08-03 Thread Richard Biener
On Tue, 3 Aug 2021, Tamar Christina wrote:

> Hi All,
> 
> I believe PR101750 to be a testism. The reduced case accesses h[0] but h is
> uninitialized and so the changes added in r12-2523 make the compiler realize
> this and replaces the code with a trap.
> 
> This fixes the case by just making the variable static.
> 
> regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?

I don't think that's a correct fix - the variable 'n' is global and thus
n.h could be initialized elsewhere, no?  As said in the PR, the issue
also appears when 'main' is renamed to 'foo'.

Richard.

> Thanks,
> Tamar
> 
> gcc/testsuite/ChangeLog:
> 
>   PR tree-optimization/101750
>   * g++.dg/vect/pr99149.cc: Fix access of h.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/testsuite/g++.dg/vect/pr99149.cc 
> b/gcc/testsuite/g++.dg/vect/pr99149.cc
> index 
> 00ebe9d9cdf600ada8e66b4b854f0e18ad0b6a7d..4b885a5d432130d5eff3e96c833ec6c97de3e95d
>  100755
> --- a/gcc/testsuite/g++.dg/vect/pr99149.cc
> +++ b/gcc/testsuite/g++.dg/vect/pr99149.cc
> @@ -11,8 +11,8 @@ public:
>a operator*(a d) { return a(b * b - c * c, b * c + c * d.b); }
>  };
>  int f, g;
> -class {
> -  a *h;
> +class mp {
> +  static a *h;
>a *i;
>  
>  public:
> 
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)


[PATCH] Add dg-require-effective-target for testcases.

2021-08-03 Thread liuhongt via Gcc-patches
Hi:
  Pushed to trunk as an obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_addsubmul_d-2.c: Add
dg-require-effective-target for avx512.
* gcc.target/i386/cond_op_addsubmul_q-2.c: Ditto.
* gcc.target/i386/cond_op_addsubmul_w-2.c: Ditto.
* gcc.target/i386/cond_op_addsubmuldiv_double-2.c: Ditto.
* gcc.target/i386/cond_op_addsubmuldiv_float-2.c: Ditto.
* gcc.target/i386/cond_op_fma_double-2.c: Ditto.
* gcc.target/i386/cond_op_fma_float-2.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c | 1 +
 gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c  | 1 +
 gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c  | 2 ++
 gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c   | 1 +
 7 files changed, 11 insertions(+)

diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
index 490f4afbf18..046804bacbd 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512vl } */
+
 #define AVX512VL
 #ifndef CHECK
 #define CHECK "avx512f-helper.h"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
index 09a87deb529..56245b143fa 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
@@ -1,4 +1,6 @@
 /* { dg-do run { target { ! ia32 } } } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512dq 
-DTYPE=long" } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512dq } */
 
 #include "cond_op_addsubmul_d-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c
index fdcdb34346c..bdcd2ef3db7 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -mavx512bw 
-DTYPE=short" } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
 
 #define AVX512BW
 #include "cond_op_addsubmul_d-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c
index 360891f3d21..5ec38df5933 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_double-2.c
@@ -1,5 +1,6 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512vl } */
 
 #define AVX512VL
 #ifndef CHECK
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c
index 20ed737cbf3..c99c04c0b41 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_addsubmuldiv_float-2.c
@@ -1,4 +1,5 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float" } */
+/* { dg-require-effective-target avx512vl } */
 
 #include "cond_op_addsubmuldiv_double-2.c"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
index d8180de7491..4c6514e756c 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
@@ -1,5 +1,7 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512vl } */
+
 #define AVX512VL
 #ifndef CHECK
 #define CHECK "avx512f-helper.h"
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c 
b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c
index 0097735dddb..e13d37720fe 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c
@@ -1,4 +1,5 @@
 /* { dg-do run } */
 /* { dg-options "-O2 -mavx512vl -mprefer-vector-width=256 -DTYPE=float 
-D__BUILTIN_FMA=__builtin_fmaf" } */
+/* { dg-require-effective-target avx512vl } */
 
 #include "cond_op_fma_double-2.c"
-- 
2.18.1



[PATCH] [i386] Support cond_{fma, fms, fnma, fnms} for vector float/double under AVX512.

2021-08-03 Thread liuhongt via Gcc-patches
Hi:
  This patch adds expanders cond_{fma,fms,fnma,fnms}
for vector float/double modes.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Pushed to trunk.

gcc/ChangeLog:

* config/i386/sse.md (cond_fma): New expander.
(cond_fms): Ditto.
(cond_fnma): Ditto.
(cond_fnms): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_fma_double-1.c: New test.
* gcc.target/i386/cond_op_fma_double-2.c: New test.
* gcc.target/i386/cond_op_fma_float-1.c: New test.
* gcc.target/i386/cond_op_fma_float-2.c: New test.
---
 gcc/config/i386/sse.md|  96 
 .../gcc.target/i386/cond_op_fma_double-1.c|  87 
 .../gcc.target/i386/cond_op_fma_double-2.c| 206 ++
 .../gcc.target/i386/cond_op_fma_float-1.c |  20 ++
 .../gcc.target/i386/cond_op_fma_float-2.c |   4 +
 5 files changed, 413 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_double-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_double-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_float-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_fma_float-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 52b2b4214d7..f5968e04669 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4438,6 +4438,29 @@ (define_insn 
"fma_fmadd_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cond_fma"
+  [(set (match_operand:VF_AVX512VL 0 "register_operand")
+   (vec_merge:VF_AVX512VL
+ (fma:VF_AVX512VL
+   (match_operand:VF_AVX512VL 2 "vector_operand")
+   (match_operand:VF_AVX512VL 3 "vector_operand")
+   (match_operand:VF_AVX512VL 4 "vector_operand"))
+ (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  emit_insn (gen_fma4 (tmp,
+operands[2],
+operands[3],
+operands[4]));
+  emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode,
+ tmp,
+ operands[5],
+ operands[1]));
+  DONE;
+})
+
 (define_insn "_fmadd__mask"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VF_AVX512VL
@@ -4515,6 +4538,30 @@ (define_insn 
"fma_fmsub_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cond_fms"
+  [(set (match_operand:VF_AVX512VL 0 "register_operand")
+   (vec_merge:VF_AVX512VL
+ (fma:VF_AVX512VL
+   (match_operand:VF_AVX512VL 2 "vector_operand")
+   (match_operand:VF_AVX512VL 3 "vector_operand")
+   (neg:VF_AVX512VL
+ (match_operand:VF_AVX512VL 4 "vector_operand")))
+ (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  emit_insn (gen_fms4 (tmp,
+operands[2],
+operands[3],
+operands[4]));
+  emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode,
+ tmp,
+ operands[5],
+ operands[1]));
+  DONE;
+})
+
 (define_insn "_fmsub__mask"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VF_AVX512VL
@@ -4594,6 +4641,30 @@ (define_insn 
"fma_fnmadd_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 
+(define_expand "cond_fnma"
+  [(set (match_operand:VF_AVX512VL 0 "register_operand")
+   (vec_merge:VF_AVX512VL
+ (fma:VF_AVX512VL
+   (neg:VF_AVX512VL
+ (match_operand:VF_AVX512VL 2 "vector_operand"))
+   (match_operand:VF_AVX512VL 3 "vector_operand")
+   (match_operand:VF_AVX512VL 4 "vector_operand"))
+ (match_operand:VF_AVX512VL 5 "nonimm_or_0_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  rtx tmp = gen_reg_rtx (mode);
+  emit_insn (gen_fnma4 (tmp,
+ operands[2],
+ operands[3],
+ operands[4]));
+  emit_move_insn (operands[0], gen_rtx_VEC_MERGE (mode,
+ tmp,
+ operands[5],
+ operands[1]));
+  DONE;
+})
+
 (define_insn "_fnmadd__mask"
   [(set (match_operand:VF_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VF_AVX512VL
@@ -4675,6 +4746,31 @@ (define_insn 
"fma_fnmsub_"
   [(set_attr "type" "ssemuladd")
(set_attr "mode" "")])
 

[PATCH] [i386] Refine predicate of peephole2 to general_reg_operand. [PR target/101743]

2021-08-03 Thread liuhongt via Gcc-patches
Hi:
  The define_peephole2 which is added by r12-2640-gf7bf03cf69ccb7dc
should only work on general registers, considering that x86 also
supports mov instructions between gpr, sse reg, mask reg, limiting the
peephole2 predicate to general_reg_operand.
  I failed to construct a testcase, but I believe that the PR problem
should be solved by this patch.

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

PR target/101743
* config/i386/i386.md (peephole2): Refine predicate from
register_operand to general_reg_operand.
---
 gcc/config/i386/i386.md | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 0c23ddb8d1f..51e8b475bca 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -19423,11 +19423,11 @@ (define_peephole2
 ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#1).
 ;; mov r0,r1; dec r0; mov r2,r3; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1
 (define_peephole2
- [(set (match_operand:SWI248 0 "register_operand")
-   (match_operand:SWI248 1 "register_operand"))
+ [(set (match_operand:SWI248 0 "general_reg_operand")
+   (match_operand:SWI248 1 "general_reg_operand"))
   (parallel [(set (reg FLAGS_REG) (match_operand 5))
 (set (match_dup 0) (match_operand:SWI248 6))])
-  (set (match_operand:SWI248 2 "register_operand")
+  (set (match_operand:SWI248 2 "general_reg_operand")
(match_operand:SWI248 3))
   (set (match_dup 0)
(if_then_else:SWI248 (match_operator 4 "ix86_comparison_operator"
@@ -19455,10 +19455,10 @@ (define_peephole2
 ;; Eliminate a reg-reg mov by inverting the condition of a cmov (#2).
 ;; mov r2,r3; mov r0,r1; dec r0; cmov r0,r2 -> dec r1; mov r0,r3; cmov r0, r1
 (define_peephole2
- [(set (match_operand:SWI248 2 "register_operand")
+ [(set (match_operand:SWI248 2 "general_reg_operand")
(match_operand:SWI248 3))
-  (set (match_operand:SWI248 0 "register_operand")
-   (match_operand:SWI248 1 "register_operand"))
+  (set (match_operand:SWI248 0 "general_reg_operand")
+   (match_operand:SWI248 1 "general_reg_operand"))
   (parallel [(set (reg FLAGS_REG) (match_operand 5))
 (set (match_dup 0) (match_operand:SWI248 6))])
   (set (match_dup 0)
-- 
2.27.0



Ping: [PATCH v2] Analyze niter for until-wrap condition [PR101145]

2021-08-03 Thread guojiufu via Gcc-patches

Hi,

I would like to have a ping on this.

https://gcc.gnu.org/pipermail/gcc-patches/2021-July/574596.html

BR,
Jiufu

On 2021-07-15 08:17, guojiufu via Gcc-patches wrote:

Hi,

I would like to have an early ping on this with more mail addresses.

BR,
Jiufu.

On 2021-07-07 20:47, Jiufu Guo wrote:

Changes since v1:
* Update assumptions for niter, add more test cases check
* Use widest_int/wide_int instead mpz to do +-/
* Move some early check for quick return

For code like:
unsigned foo(unsigned val, unsigned start)
{
  unsigned cnt = 0;
  for (unsigned i = start; i > val; ++i)
cnt++;
  return cnt;
}

The number of iterations should be about UINT_MAX - start.

There is function adjust_cond_for_loop_until_wrap which
handles similar work for const bases.
Like adjust_cond_for_loop_until_wrap, this patch enhances
function number_of_iterations_cond/number_of_iterations_lt
to analyze number of iterations for this kind of loop.

Bootstrap and regtest pass on powerpc64le, x86_64 and aarch64.
Is this ok for trunk?

gcc/ChangeLog:

2021-07-07  Jiufu Guo  

PR tree-optimization/101145
* tree-ssa-loop-niter.c (number_of_iterations_until_wrap):
New function.
(number_of_iterations_lt): Invoke above function.
(adjust_cond_for_loop_until_wrap):
Merge to number_of_iterations_until_wrap.
(number_of_iterations_cond): Update invokes for
adjust_cond_for_loop_until_wrap and number_of_iterations_lt.

gcc/testsuite/ChangeLog:

2021-07-07  Jiufu Guo  

PR tree-optimization/101145
* gcc.dg/vect/pr101145.c: New test.
* gcc.dg/vect/pr101145.inc: New test.
* gcc.dg/vect/pr101145_1.c: New test.
* gcc.dg/vect/pr101145_2.c: New test.
* gcc.dg/vect/pr101145_3.c: New test.
* gcc.dg/vect/pr101145inf.c: New test.
* gcc.dg/vect/pr101145inf.inc: New test.
* gcc.dg/vect/pr101145inf_1.c: New test.
---
 gcc/testsuite/gcc.dg/vect/pr101145.c  | 187 
++

 gcc/testsuite/gcc.dg/vect/pr101145.inc|  63 
 gcc/testsuite/gcc.dg/vect/pr101145_1.c|  15 ++
 gcc/testsuite/gcc.dg/vect/pr101145_2.c|  15 ++
 gcc/testsuite/gcc.dg/vect/pr101145_3.c|  15 ++
 gcc/testsuite/gcc.dg/vect/pr101145inf.c   |  25 +++
 gcc/testsuite/gcc.dg/vect/pr101145inf.inc |  28 
 gcc/testsuite/gcc.dg/vect/pr101145inf_1.c |  23 +++
 gcc/tree-ssa-loop-niter.c | 157 ++
 9 files changed, 463 insertions(+), 65 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145.inc
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145_3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145inf.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145inf.inc
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr101145inf_1.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr101145.c
b/gcc/testsuite/gcc.dg/vect/pr101145.c
new file mode 100644
index 000..74031b031cf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr101145.c
@@ -0,0 +1,187 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O3 -fdump-tree-vect-details" } */
+#include 
+
+unsigned __attribute__ ((noinline))
+foo (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned 
n)

+{
+  while (n < ++l)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+foo_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, 
unsigned)

+{
+  while (UINT_MAX - 64 < ++l)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+foo_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned 
n)

+{
+  l = UINT_MAX - 32;
+  while (n < ++l)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+foo_3 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned 
n)

+{
+  while (n <= ++l)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+foo_4 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned 
n)

+{  // infininate
+  while (0 <= ++l)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+foo_5 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned 
n)

+{
+  //no loop
+  l = UINT_MAX;
+  while (n < ++l)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+bar (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned 
n)

+{
+  while (--l < n)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+bar_1 (int *__restrict__ a, int *__restrict__ b, unsigned l, 
unsigned)

+{
+  while (--l < 64)
+*a++ = *b++ + 1;
+  return l;
+}
+
+unsigned __attribute__ ((noinline))
+bar_2 (int *__restrict__ a, int *__restrict__ b, unsigned l, unsigned 
n)

+{
+  l = 32;
+  while (--l < n)
+*a++ = *b++ + 1;
+  return l;
+}
+
+
+int a[3200], b[3200];

Re: [PATCH] Fix loop split incorrect count and probability

2021-08-03 Thread Xionghu Luo via Gcc-patches

I' like to split this patch:

https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576488.html

to two patches:

0001-Fix-loop-split-incorrect-count-and-probability.patch
0002-Don-t-move-cold-code-out-of-loop-by-checking-bb-coun.patch

since they are solving two different things, please help to review
the attached series.  They show obvious performance improvement on
both P8 and P9 for CPU2017, and I am not sure how it will affect other
platforms like X86 and AArch64, it will be grateful if someone could
try it.  Thanks.


Xionghu
From 4e1ef5b1f423484a6789750e7cc0cf2e94517f20 Mon Sep 17 00:00:00 2001
From: Xionghu Luo 
Date: Tue, 3 Aug 2021 03:44:14 -0500
Subject: [PATCH 1/2] Fix loop split incorrect count and probability

loop split condition is moved between loop1 and loop2, the split bb's
count and probability should also be duplicated instead of (100% vs INV),
secondly, the original loop1 and loop2 count need to be proportional to the
original loop.

Regression tested pass, OK for master?

diff base/loop-cond-split-1.c.151t.lsplit  
patched/loop-cond-split-1.c.151t.lsplit:
...
   int prephitmp_16;
   int prephitmp_25;

[local count: 118111600]:
   if (n_7(D) > 0)
 goto ; [89.00%]
   else
 goto ; [11.00%]

[local count: 118111600]:
   return;

[local count: 105119324]:
   pretmp_3 = ga;

-   [local count: 955630225]:
+   [local count: 315357973]:
   # i_13 = PHI 
   # prephitmp_12 = PHI 
   if (prephitmp_12 != 0)
 goto ; [33.00%]
   else
 goto ; [67.00%]

-   [local count: 315357972]:
+   [local count: 104068130]:
   _2 = do_something ();
   ga = _2;

-   [local count: 955630225]:
+   [local count: 315357973]:
   # prephitmp_5 = PHI 
   i_10 = inc (i_13);
   if (n_7(D) > i_10)
 goto ; [89.00%]
   else
 goto ; [11.00%]

[local count: 105119324]:
   goto ; [100.00%]

-   [local count: 850510901]:
+   [local count: 280668596]:
   if (prephitmp_12 != 0)
-goto ; [100.00%]
+goto ; [33.00%]
   else
-goto ; [INV]
+goto ; [67.00%]

-   [local count: 850510901]:
+   [local count: 280668596]:
   goto ; [100.00%]

-   [count: 0]:
+   [local count: 70429947]:
   # i_23 = PHI 
   # prephitmp_25 = PHI 

-   [local count: 955630225]:
+   [local count: 640272252]:
   # i_15 = PHI 
   # prephitmp_16 = PHI 
   i_22 = inc (i_15);
   if (n_7(D) > i_22)
 goto ; [89.00%]
   else
 goto ; [11.00%]

-   [local count: 850510901]:
+   [local count: 569842305]:
   goto ; [100.00%]

 }

gcc/ChangeLog:

* tree-ssa-loop-split.c (split_loop): Fix incorrect probability.
(do_split_loop_on_cond): Likewise.
---
 gcc/tree-ssa-loop-split.c | 25 -
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3f6ad046623..d30782888f3 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -575,7 +575,11 @@ split_loop (class loop *loop1)
stmts2);
tree cond = build2 (guard_code, boolean_type_node, guard_init, border);
if (!initial_true)
- cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond); 
+ cond = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node, cond);
+
+   edge true_edge = EDGE_SUCC (bbs[i], 0)->flags & EDGE_TRUE_VALUE
+  ? EDGE_SUCC (bbs[i], 0)
+  : EDGE_SUCC (bbs[i], 1);
 
/* Now version the loop, placing loop2 after loop1 connecting
   them, and fix up SSA form for that.  */
@@ -583,10 +587,10 @@ split_loop (class loop *loop1)
basic_block cond_bb;
 
class loop *loop2 = loop_version (loop1, cond, _bb,
-  profile_probability::always (),
-  profile_probability::always (),
-  profile_probability::always (),
-  profile_probability::always (),
+  true_edge->probability,
+  true_edge->probability.invert (),
+  true_edge->probability,
+  true_edge->probability.invert (),
   true);
gcc_assert (loop2);
 
@@ -1486,10 +1490,10 @@ do_split_loop_on_cond (struct loop *loop1, edge 
invar_branch)
   initialize_original_copy_tables ();
 
   struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL,
-profile_probability::always (),
-profile_probability::never (),
-profile_probability::always (),
-profile_probability::always (),
+invar_branch->probability.invert (),
+invar_branch->probability,
+

Re: [PATCH 2/6] [i386] Enable _Float16 type for TARGET_SSE2 and above.

2021-08-03 Thread Hongtao Liu via Gcc-patches
On Mon, Aug 2, 2021 at 2:31 PM liuhongt  wrote:
>
> gcc/ChangeLog:
>
> * config/i386/i386-modes.def (FLOAT_MODE): Define ieee HFmode.
> * config/i386/i386.c (enum x86_64_reg_class): Add
> X86_64_SSEHF_CLASS.
> (merge_classes): Handle X86_64_SSEHF_CLASS.
> (examine_argument): Ditto.
> (construct_container): Ditto.
> (classify_argument): Ditto, and set HFmode/HCmode to
> X86_64_SSEHF_CLASS.
> (function_value_32): Return _FLoat16/Complex Float16 by
> %xmm0.
> (function_value_64): Return _Float16/Complex Float16 by SSE
> register.
> (ix86_print_operand): Handle CONST_DOUBLE HFmode.
> (ix86_secondary_reload): Require gpr as intermediate register
> to store _Float16 from sse register when sse4 is not
> available.
> (ix86_libgcc_floating_mode_supported_p): Enable _FLoat16 under
> sse2.
> (ix86_scalar_mode_supported_p): Ditto.
> (TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P): Defined.
> * config/i386/i386.h (VALID_SSE2_REG_MODE): Add HFmode.
> (VALID_INT_MODE_P): Add HFmode and HCmode.
> * config/i386/i386.md (*pushhf_rex64): New define_insn.
> (*pushhf): Ditto.
> (*movhf_internal): Ditto.
> * doc/extend.texi (Half-Precision Floating Point): Documemt
> _Float16 for x86.
> * emit-rtl.c (validate_subreg): Allow (subreg:SI (reg:HF) 0)
> which is used by extract_bit_field but not backends.
>
> gcc/lto/ChangeLog:
>
> * lto-lang.c (lto_type_for_mode): Return float16_type_node
> when mode == TYPE_MODE (float16_type_node).
>
> gcc/testsuite/ChangeLog
>
> * gcc.target/i386/sse2-float16-1.c: New test.
> * gcc.target/i386/sse2-float16-2.c: Ditto.
> * gcc.target/i386/sse2-float16-3.c: Ditto.
> * gcc.target/i386/float16-5.c: Ditto.
> ---
>  gcc/config/i386/i386-modes.def|   1 +
>  gcc/config/i386/i386.c|  91 +-
>  gcc/config/i386/i386.h|   3 +-
>  gcc/config/i386/i386.md   | 118 +-
>  gcc/doc/extend.texi   |  13 ++
>  gcc/emit-rtl.c|   5 +
>  gcc/lto/lto-lang.c|   3 +
>  gcc/testsuite/gcc.target/i386/float16-5.c |  12 ++
>  .../gcc.target/i386/sse2-float16-1.c  |   8 ++
>  .../gcc.target/i386/sse2-float16-2.c  |  16 +++
>  .../gcc.target/i386/sse2-float16-3.c  |  12 ++
>  11 files changed, 274 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/float16-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-3.c
>
> diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def
> index 4e7014be034..9232f59a925 100644
> --- a/gcc/config/i386/i386-modes.def
> +++ b/gcc/config/i386/i386-modes.def
> @@ -23,6 +23,7 @@ along with GCC; see the file COPYING3.  If not see
>
>  FRACTIONAL_FLOAT_MODE (XF, 80, 12, ieee_extended_intel_96_format);
>  FLOAT_MODE (TF, 16, ieee_quad_format);
> +FLOAT_MODE (HF, 2, ieee_half_format);
>
>  /* In ILP32 mode, XFmode has size 12 and alignment 4.
> In LP64 mode, XFmode has size and alignment 16.  */
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index ff96134fb37..7979e240426 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -387,6 +387,7 @@ enum x86_64_reg_class
>  X86_64_INTEGER_CLASS,
>  X86_64_INTEGERSI_CLASS,
>  X86_64_SSE_CLASS,
> +X86_64_SSEHF_CLASS,
>  X86_64_SSESF_CLASS,
>  X86_64_SSEDF_CLASS,
>  X86_64_SSEUP_CLASS,
> @@ -2023,8 +2024,10 @@ merge_classes (enum x86_64_reg_class class1, enum 
> x86_64_reg_class class2)
>  return X86_64_MEMORY_CLASS;
>
>/* Rule #4: If one of the classes is INTEGER, the result is INTEGER.  */
> -  if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
> -  || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
> +  if ((class1 == X86_64_INTEGERSI_CLASS
> +   && (class2 == X86_64_SSESF_CLASS || class2 == X86_64_SSEHF_CLASS))
> +  || (class2 == X86_64_INTEGERSI_CLASS
> + && (class1 == X86_64_SSESF_CLASS || class1 == X86_64_SSEHF_CLASS)))
>  return X86_64_INTEGERSI_CLASS;
>if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
>|| class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
> @@ -2178,6 +2181,8 @@ classify_argument (machine_mode mode, const_tree type,
> /* The partial classes are now full classes.  */
> if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
>   subclasses[0] = X86_64_SSE_CLASS;
> +   if (subclasses[0] == X86_64_SSEHF_CLASS && bytes != 2)

[PATCH v3] Make loops_list support an optional loop_p root

2021-08-03 Thread Kewen.Lin via Gcc-patches
on 2021/8/3 下午8:08, Richard Biener wrote:
> On Fri, Jul 30, 2021 at 7:20 AM Kewen.Lin  wrote:
>>
>> on 2021/7/29 下午4:01, Richard Biener wrote:
>>> On Fri, Jul 23, 2021 at 10:41 AM Kewen.Lin  wrote:

 on 2021/7/22 下午8:56, Richard Biener wrote:
> On Tue, Jul 20, 2021 at 4:37
> PM Kewen.Lin  wrote:
>>
>> Hi,
>>
>> This v2 has addressed some review comments/suggestions:
>>
>>   - Use "!=" instead of "<" in function operator!= (const Iter )
>>   - Add new CTOR loops_list (struct loops *loops, unsigned flags)
>> to support loop hierarchy tree rather than just a function,
>> and adjust to use loops* accordingly.
>
> I actually meant struct loop *, not struct loops * ;)  At the point
> we pondered to make loop invariant motion work on single
> loop nests we gave up not only but also because it iterates
> over the loop nest but all the iterators only ever can process
> all loops, not say, all loops inside a specific 'loop' (and
> including that 'loop' if LI_INCLUDE_ROOT).  So the
> CTOR would take the 'root' of the loop tree as argument.
>
> I see that doesn't trivially fit how loops_list works, at least
> not for LI_ONLY_INNERMOST.  But I guess FROM_INNERMOST
> could be adjusted to do ONLY_INNERMOST as well?
>


 Thanks for the clarification!  I just realized that the previous
 version with struct loops* is problematic, all traversal is
 still bounded with outer_loop == NULL.  I think what you expect
 is to respect the given loop_p root boundary.  Since we just
 record the loops' nums, I think we still need the function* fn?
>>>
>>> Would it simplify things if we recorded the actual loop *?
>>>
>>
>> I'm afraid it's unsafe to record the loop*.  I had the same
>> question why the loop iterator uses index rather than loop* when
>> I read this at the first time.  I guess the design of processing
>> loops allows its user to update or even delete the folllowing
>> loops to be visited.  For example, when the user does some tricks
>> on one loop, then it duplicates the loop and its children to
>> somewhere and then removes the loop and its children, when
>> iterating onto its children later, the "index" way will check its
>> validity by get_loop at that point, but the "loop *" way will
>> have some recorded pointers to become dangling, can't do the
>> validity check on itself, seems to need a side linear search to
>> ensure the validity.
>>
>>> There's still the to_visit reserve which needs a bound on
>>> the number of loops for efficiency reasons.
>>>
>>
>> Yes, I still keep the fn in the updated version.
>>
 So I add one optional argument loop_p root and update the
 visiting codes accordingly.  Before this change, the previous
 visiting uses the outer_loop == NULL as the termination condition,
 it perfectly includes the root itself, but with this given root,
 we have to use it as the termination condition to avoid to iterate
 onto its possible existing next.

 For LI_ONLY_INNERMOST, I was thinking whether we can use the
 code like:

 struct loops *fn_loops = loops_for_fn (fn)->larray;
 for (i = 0; vec_safe_iterate (fn_loops, i, ); i++)
 if (aloop != NULL
 && aloop->inner == NULL
 && flow_loop_nested_p (tree_root, aloop))
  this->to_visit.quick_push (aloop->num);

 it has the stable bound, but if the given root only has several
 child loops, it can be much worse if there are many loops in fn.
 It seems impossible to predict the given root loop hierarchy size,
 maybe we can still use the original linear searching for the case
 loops_for_fn (fn) == root?  But since this visiting seems not so
 performance critical, I chose to share the code originally used
 for FROM_INNERMOST, hope it can have better readability and
 maintainability.
>>>
>>> I was indeed looking for something that has execution/storage
>>> bound on the subtree we're interested in.  If we pull the CTOR
>>> out-of-line we can probably keep the linear search for
>>> LI_ONLY_INNERMOST when looking at the whole loop tree.
>>>
>>
>> OK, I've moved the suggested single loop tree walker out-of-line
>> to cfgloop.c, and brought the linear search back for
>> LI_ONLY_INNERMOST when looking at the whole loop tree.
>>
>>> It just seemed to me that we can eventually re-use a
>>> single loop tree walker for all orders, just adjusting the
>>> places we push.
>>>
>>
>> Wow, good point!  Indeed, I have further unified all orders
>> handlings into a single function walk_loop_tree.
>>

 Bootstrapped and regtested on powerpc64le-linux-gnu P9,
 x86_64-redhat-linux and aarch64-linux-gnu, also
 bootstrapped on ppc64le P9 with bootstrap-O3 config.

 Does the attached patch meet what you expect?
>>>
>>> So yeah, it's probably close to what is sensible.  Not sure
>>> 

Re: [PATCH 5/6] AVX512FP16: Initial support for AVX512FP16 feature and scalar _Float16 instructions.

2021-08-03 Thread Hongtao Liu via Gcc-patches
On Mon, Aug 2, 2021 at 2:44 PM liuhongt  wrote:
>
> From: "Guo, Xuepeng" 
>
> gcc/ChangeLog:
>
> * common/config/i386/cpuinfo.h (get_available_features):
> Detect FEATURE_AVX512FP16.
> * common/config/i386/i386-common.c
> (OPTION_MASK_ISA_AVX512FP16_SET,
> OPTION_MASK_ISA_AVX512FP16_UNSET,
> OPTION_MASK_ISA2_AVX512FP16_SET,
> OPTION_MASK_ISA2_AVX512FP16_UNSET): New.
> (OPTION_MASK_ISA2_AVX512BW_UNSET,
> OPTION_MASK_ISA2_AVX512BF16_UNSET): Add AVX512FP16.
> (ix86_handle_option): Handle -mavx512fp16.
> * common/config/i386/i386-cpuinfo.h (enum processor_features):
> Add FEATURE_AVX512FP16.
> * common/config/i386/i386-isas.h: Add entry for AVX512FP16.
> * config.gcc: Add avx512fp16intrin.h.
> * config/i386/avx512fp16intrin.h: New intrinsic header.
> * config/i386/cpuid.h: Add bit_AVX512FP16.
> * config/i386/i386-builtin-types.def: (FLOAT16): New primitive type.
> * config/i386/i386-builtins.c: Support _Float16 type for i386
> backend.
> (ix86_init_float16_builtins): New function.
> (ix86_float16_type_node): New.
> * config/i386/i386-c.c (ix86_target_macros_internal): Define
> __AVX512FP16__.
> * config/i386/i386-expand.c (ix86_expand_branch): Support
> HFmode.
> (ix86_prepare_fp_compare_args): Adjust TARGET_SSE_MATH &&
> SSE_FLOAT_MODE_P to SSE_FLOAT_MODE_SSEMATH_OR_HF_P.
> (ix86_expand_fp_movcc): Ditto.
> * config/i386/i386-isa.def: Add PTA define for AVX512FP16.
> * config/i386/i386-options.c (isa2_opts): Add -mavx512fp16.
> (ix86_valid_target_attribute_inner_p): Add avx512fp16 attribute.
> * config/i386/i386.c (ix86_get_ssemov): Use
> vmovdqu16/vmovw/vmovsh for HFmode/HImode scalar or vector.
> (ix86_get_excess_precision): Use
> FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 when TARGET_AVX512FP16
> existed.
> (sse_store_index): Use SFmode cost for HFmode cost.
> (inline_memory_move_cost): Add HFmode, and perfer SSE cost over
> GPR cost for HFmode.
> (ix86_hard_regno_mode_ok): Allow HImode in sse register.
> (ix86_mangle_type): Add manlging for _Float16 type.
> (inline_secondary_memory_needed): No memory is needed for
> 16bit movement between gpr and sse reg under
> TARGET_AVX512FP16.
> (ix86_multiplication_cost): Adjust TARGET_SSE_MATH &&
> SSE_FLOAT_MODE_P to SSE_FLOAT_MODE_SSEMATH_OR_HF_P.
> (ix86_division_cost): Ditto.
> (ix86_rtx_costs): Ditto.
> (ix86_add_stmt_cost): Ditto.
> (ix86_optab_supported_p): Ditto.
> * config/i386/i386.h (VALID_AVX512F_SCALAR_MODE): Add HFmode.
> (SSE_FLOAT_MODE_SSEMATH_OR_HF_P): Add HFmode.
> (PTA_SAPPHIRERAPIDS): Add PTA_AVX512FP16.
> * config/i386/i386.md (mode): Add HFmode.
> (MODE_SIZE): Add HFmode.
> (isa): Add avx512fp16.
> (enabled): Handle avx512fp16.
> (ssemodesuffix): Add sh suffix for HFmode.
> (comm): Add mult, div.
> (plusminusmultdiv): New code iterator.
> (insn): Add mult, div.
> (*movhf_internal): Adjust for avx512fp16 instruction.
> (*movhi_internal): Ditto.
> (*cmpihf): New define_insn for HFmode.
> (*ieee_shf3): Likewise.
> (extendhf2): Likewise.
> (trunchf2): Likewise.
> (floathf2): Likewise.
> (*hf): Likewise.
> (cbranchhf4): New expander.
> (movhfcc): Likewise.
> (hf3): Likewise.
> (mulhf3): Likewise.
> (divhf3): Likewise.
> * config/i386/i386.opt: Add mavx512fp16.
> * config/i386/immintrin.h: Include avx512fp16intrin.h.
> * doc/invoke.texi: Add mavx512fp16.
> * doc/extend.texi: Add avx512fp16 Usage Notes.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx-1.c: Add -mavx512fp16 in dg-options.
> * gcc.target/i386/avx-2.c: Ditto.
> * gcc.target/i386/avx512-check.h: Check cpuid for AVX512FP16.
> * gcc.target/i386/funcspec-56.inc: Add new target attribute check.
> * gcc.target/i386/sse-13.c: Add -mavx512fp16.
> * gcc.target/i386/sse-14.c: Ditto.
> * gcc.target/i386/sse-22.c: Ditto.
> * gcc.target/i386/sse-23.c: Ditto.
> * lib/target-supports.exp: (check_effective_target_avx512fp16): New.
> * g++.target/i386/float16-1.C: New test.
> * g++.target/i386/float16-2.C: Ditto.
> * g++.target/i386/float16-3.C: Ditto.
> * gcc.target/i386/avx512fp16-12a.c: Ditto.
> * gcc.target/i386/avx512fp16-12b.c: Ditto.
> * gcc.target/i386/float16-3a.c: Ditto.
> * gcc.target/i386/float16-3b.c: Ditto.
> * gcc.target/i386/float16-4a.c: Ditto.
> * gcc.target/i386/float16-4b.c: Ditto.
> * gcc.target/i386/pr54855-12.c: Ditto.
>   

Go patch committed: Support new language constructs in escape analysis

2021-08-03 Thread Ian Lance Taylor via Gcc-patches
This Go frontend patch by Cherry Mui supports the new language
constructs in escape analysis.  Previous patches added new language
constructs in Go 1.17, specifically, unsafe.Add, unsafe.Slice, and
conversion from a slice to a pointer
to an array. This patch handles them in the escape analysis.

At the point of the escape analysis, unsafe.Add and unsafe.Slice are
still builtin calls, so just handle them in data flow.  Conversion
from a slice to a pointer to an array has already been lowered to a
combination of compound expression, conditional expression and slice
info expressions, so handle them in the escape analysis.

Bootstrapped and ran Go testsuite on x86_64-pc-linux-gnu.  Committed
to mainline.

Ian
98b3c98f4acd7d6f74d0f6ad592a7623ebafecd4
diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index 5a097ffee85..be1a90f7aa1 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-2031f0be9c0b5fda6421d290a0261eb6bd1c8205
+616ee658a6238e7de53592ebda5997f6de6a00de
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/gcc/go/gofrontend/escape.cc b/gcc/go/gofrontend/escape.cc
index cf688740443..347ac2534c9 100644
--- a/gcc/go/gofrontend/escape.cc
+++ b/gcc/go/gofrontend/escape.cc
@@ -2325,19 +2325,55 @@ Escape_analysis_assign::assign(Node* dst, Node* src)
  }
  break;
 
+case Expression::EXPRESSION_SLICE_INFO:
+  {
+Slice_info_expression* sie = e->slice_info_expression();
+if (sie->info() == Expression::SLICE_INFO_VALUE_POINTER)
+  {
+Node* slice = Node::make_node(sie->slice());
+this->assign(dst, slice);
+  }
+  }
+  break;
+
case Expression::EXPRESSION_CALL:
  {
Call_expression* call = e->call_expression();
 if (call->is_builtin())
   {
 Builtin_call_expression* bce = call->builtin_call_expression();
-if (bce->code() == Builtin_call_expression::BUILTIN_APPEND)
+switch (bce->code())
   {
-// Append returns the first argument.
-// The subsequent arguments are already leaked because
-// they are operands to append.
-Node* appendee = Node::make_node(call->args()->front());
-this->assign(dst, appendee);
+  case Builtin_call_expression::BUILTIN_APPEND:
+{
+  // Append returns the first argument.
+  // The subsequent arguments are already leaked because
+  // they are operands to append.
+  Node* appendee = Node::make_node(call->args()->front());
+  this->assign(dst, appendee);
+}
+break;
+
+  case Builtin_call_expression::BUILTIN_ADD:
+{
+  // unsafe.Add(p, off).
+  // Flow p to result.
+  Node* arg = Node::make_node(call->args()->front());
+  this->assign(dst, arg);
+}
+break;
+
+  case Builtin_call_expression::BUILTIN_SLICE:
+{
+  // unsafe.Slice(p, len).
+  // The resulting slice has the same backing store as p. 
Flow p to result.
+  Node* arg = Node::make_node(call->args()->front());
+  this->assign(dst, arg);
+}
+break;
+
+  default:
+break;
   }
 break;
   }
@@ -2592,6 +2628,21 @@ Escape_analysis_assign::assign(Node* dst, Node* src)
  }
  break;
 
+case Expression::EXPRESSION_CONDITIONAL:
+  {
+Conditional_expression* ce = e->conditional_expression();
+this->assign(dst, Node::make_node(ce->then_expr()));
+this->assign(dst, Node::make_node(ce->else_expr()));
+  }
+  break;
+
+case Expression::EXPRESSION_COMPOUND:
+  {
+Compound_expression* ce = e->compound_expression();
+this->assign(dst, Node::make_node(ce->expr()));
+  }
+  break;
+
default:
  // TODO(cmang): Add debug info here; this should not be reachable.
  // For now, just to be conservative, we'll just say dst flows to src.
diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc
index 51a8b7e4322..3e433d6c20d 100644
--- a/gcc/go/gofrontend/expressions.cc
+++ b/gcc/go/gofrontend/expressions.cc
@@ -18009,49 +18009,7 @@ Expression::make_type_info(Type* type, Type_info 
type_info)
   return new Type_info_expression(type, type_info);
 }
 
-// An expression that 

Re: [PATCH 53/55] rs6000: Update altivec.h for automated interfaces

2021-08-03 Thread Segher Boessenkool
On Wed, Jul 28, 2021 at 03:58:02PM -0500, Bill Schmidt wrote:
> On 7/27/21 4:07 PM, will schmidt wrote:
> >On Thu, 2021-06-17 at 10:19 -0500, Bill Schmidt via Gcc-patches wrote:
> >>+#ifdef _ARCH_PWR8
> >>+#define vec_vclz vec_cntlz
> >>+#define vec_vgbbd vec_gb
> >>+#define vec_vmrgew vec_mergee
> >>+#define vec_vmrgow vec_mergeo
> >>+#define vec_vpopcntu vec_popcnt
> >>+#define vec_vrld vec_rl
> >>+#define vec_vsld vec_sl
> >>+#define vec_vsrd vec_sr
> >>+#define vec_vsrad vec_sra
> >>+#endif
> >
> >Does anything bad happen if these are simply defined, without the
> >#ifdef/#endif protection?
> >I'm wondering if there is some scenario with
> >pragma GCC target "cpu=powerX" where we may want them defined
> >anyway.
> 
> Yes, you're right about that.  We could run into such problems, I 
> think.  I think it's best to always define these.  If the builtin isn't 
> supported for the specific target configuration, it'll be flagged during 
> the lookup process.

I don't think it will ever cause problems.  But yes, it certainly is not
the best place to do it here, and it is done elsewhere already, as you
say.

It does make a user-visible difference though: without the #if user code
using the name "vec_vgbbd" for some other purpose will stop working
(when building for older machines, it already will break on power8).
These aren't reserved names.  We don't really care though I guess ;-)


Segher


Re: [PATCH 52/55] rs6000: Debug support

2021-08-03 Thread Segher Boessenkool
On Tue, Jul 27, 2021 at 04:07:22PM -0500, will schmidt wrote:
> On Thu, 2021-06-17 at 10:19 -0500, Bill Schmidt via Gcc-patches wrote:
> > +  else if (type == bool_V16QI_type_node)
> > +return "vbc";
> > +  else if (type == bool_V2DI_type_node)
> > +return "vbll";
> > +  else if (type == bool_V4SI_type_node)
> > +return "vbi";
> > +  else if (type == bool_V8HI_type_node)
> > +return "vbs";
> 
> I'd be strongly tempted to rearrange the order and put V16 after V8 in
> the list.  Similar to the order you previously used in
> rs6000_expand_new_builtin(). Same comment elsewhere.

These are ordered on return value.  It is hard to make some order of all
these disparate things based on the actual type, but the strings is a
neat way out ;-)

(A comment "ordered by return value" would be good to have).

> > +  /*
> >if (TARGET_DEBUG_BUILTIN)
> >  fprintf (stderr, "rs6000_builtin, code = %4d, %s%s\n",
> >  (int)code, name, attr_string);
> > +  */
> 
> Could probably just drop that chunk, instead of commenting it out. 

Or fix up its spacing :-P

> > +  for (int i = 1; i < (int) RS6000_BIF_MAX; i++)

That is a good reason to *not* have the max as enum value, btw: you
need a cast to use it.  Make the max a macro, and then it can include
all casting you need right in there :-)


Segher


Re: [PATCH 50/55] rs6000: Update rs6000_builtin_decl

2021-08-03 Thread Segher Boessenkool
On Tue, Jul 27, 2021 at 04:08:15PM -0500, will schmidt wrote:
> On Thu, 2021-06-17 at 10:19 -0500, Bill Schmidt via Gcc-patches wrote:
> > 2021-03-05  Bill Schmidt  
> > 
> 
> Hi,
>   Description could be a bit longer. :-)  (Even just a duplicate of the
> mail subject to fill the space would prob be fine.) 

Well, this should completely go away later in the series... Anything
named "new" should :-)

But some short comment wrt that wouldn't hurt, sure.  And there always
are a few words too say about any patch, if only to help reviewers :-)


Segher


Re: [PATCH 47/55] rs6000: Builtin expansion, part 4

2021-08-03 Thread Segher Boessenkool
Whoops, I forgot some stuff:

On Tue, Jul 27, 2021 at 04:06:49PM -0500, will schmidt wrote:
> On Thu, 2021-06-17 at 10:19 -0500, Bill Schmidt via Gcc-patches wrote:
> >  static rtx
> >  ldv_expand_builtin (rtx target, insn_code icode, rtx *op, machine_mode 
> > tmode)
> >  {
> > +  rtx pat, addr;
> > +  bool blk = (icode == CODE_FOR_altivec_lvlx
> > + || icode == CODE_FOR_altivec_lvlxl
> > + || icode == CODE_FOR_altivec_lvrx
> > + || icode == CODE_FOR_altivec_lvrxl);
> > +
> > +  if (target == 0
> > +  || GET_MODE (target) != tmode
> > +  || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
> 
> No space after "!" ?  (here and later on 'pat'.).

It can be written as just
  || !insn_data[icode].operand[0].predicate (target, tmode))
even.  The * is completely optional, and you don't need the extra parens
without it.


Segher


Re: [PATCH 47/55] rs6000: Builtin expansion, part 4

2021-08-03 Thread Segher Boessenkool
On Tue, Jul 27, 2021 at 04:06:49PM -0500, will schmidt wrote:
> On Thu, 2021-06-17 at 10:19 -0500, Bill Schmidt via Gcc-patches wrote:
> > +case RS6000_BIF_ST_ELEMREV_V1TI:
> > +  return (BYTES_BIG_ENDIAN ? CODE_FOR_vsx_store_v1ti
> > + : CODE_FOR_vsx_st_elemrev_v1ti);
> 
> Hmm, would it be worthy to rename one of the pair so they both match "_st_" 
> or "_store_" ?  
> 
> CODE_FOR_vsx_store_v1ti
> CODE_FOR_vsx_st_elemrev_v1ti

"st" is not a good name.  "store" would be better, and it is not like
three extra characters will kill you here.

But it has to be changed in the machine description of course, these
are existing pattern names.


Segher


Re: [PATCH 46/55] rs6000: Builtin expansion, part 3

2021-08-03 Thread Segher Boessenkool
On Tue, Jul 27, 2021 at 04:06:39PM -0500, will schmidt wrote:
> On Thu, 2021-06-17 at 10:19 -0500, Bill Schmidt via Gcc-patches wrote:
> > +#else
> > +  warning (0, "builtin %qs needs GLIBC (2.23 and newer) that exports 
> > hardware "
> > +  "capability bits", rs6000_builtin_info_x[(size_t) fcode].bifname);
> > +
> 
> This seems OK. 
> It appears to comply with the documentation at least  :-)
>   "If GCC was configured to use a GLIBC before 2.23, the built-in
>   function __builtin_cpu_is always returns a 0 and the compiler
>   issues a warning."

Yup.  And we still (have to) support older glibc versions, since various
distros ship with something older (2.23 is only 5 years old).


Segher


Go patch committed: Return two values from selectnbrecv

2021-08-03 Thread Ian Lance Taylor via Gcc-patches
This patch to the Go frontend and libgo changes selectnbrecv to return
two values.  The only difference between selectnbrecv and
selectnbrecv2 is that the latter uses a pointer argument as the second
return value from chanrecv.  This patch changes selectnbrecv to return
the two values from chanrecv and gets
rid of selectnbrecv2.  The compiler now calls only selectnbrecv, which
is simpler.

This is the gofrontend version of https://golang.org/cl/292890.  I'm
committing it now as part of updating to the Go 1.17 release.

Bootstrapped and ran Go testsuite on x86_64-pc-linux-gnu.  Committed
to mainline.

Ian
e435e72ad713cadd661072427588ec1c777c04e3
diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index 801e039a155..5a097ffee85 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-54361805bd611d896042b879ee7f6d2d4d088537
+2031f0be9c0b5fda6421d290a0261eb6bd1c8205
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/gcc/go/gofrontend/runtime.def b/gcc/go/gofrontend/runtime.def
index fad8cebc012..87a27085d60 100644
--- a/gcc/go/gofrontend/runtime.def
+++ b/gcc/go/gofrontend/runtime.def
@@ -204,12 +204,8 @@ DEF_GO_RUNTIME(SELECTNBSEND, "runtime.selectnbsend", 
P2(CHAN, POINTER), R1(BOOL)
 
 // Non-blocking receive a value from a channel, used for two-case select
 // statement with a default case.
-DEF_GO_RUNTIME(SELECTNBRECV, "runtime.selectnbrecv", P2(POINTER, CHAN), 
R1(BOOL))
-
-// Non-blocking tuple receive from a channel, used for two-case select
-// statement with a default case.
-DEF_GO_RUNTIME(SELECTNBRECV2, "runtime.selectnbrecv2", P3(POINTER, POINTER, 
CHAN),
-   R1(BOOL))
+DEF_GO_RUNTIME(SELECTNBRECV, "runtime.selectnbrecv", P2(POINTER, CHAN),
+  R2(BOOL, BOOL))
 
 // Block execution.  Used for zero-case select.
 DEF_GO_RUNTIME(BLOCK, "runtime.block", P0(), R0())
diff --git a/gcc/go/gofrontend/statements.cc b/gcc/go/gofrontend/statements.cc
index 9643d1b42b3..95fa3c48709 100644
--- a/gcc/go/gofrontend/statements.cc
+++ b/gcc/go/gofrontend/statements.cc
@@ -6051,7 +6051,7 @@ Select_statement::lower_two_case(Block* b)
   Expression* chanref = Expression::make_temporary_reference(chantmp, loc);
 
   Block* bchan;
-  Expression* call;
+  Expression* cond;
   if (chancase.is_send())
 {
  // if selectnbsend(chan, &ts) { body } else { default body }
@@ -6065,7 +6065,7 @@ Select_statement::lower_two_case(Block* b)
 
   Expression* ref = Expression::make_temporary_reference(ts, loc);
   Expression* addr = Expression::make_unary(OPERATOR_AND, ref, loc);
-  call = Runtime::make_call(Runtime::SELECTNBSEND, loc, 2, chanref, addr);
+  cond = Runtime::make_call(Runtime::SELECTNBSEND, loc, 2, chanref, addr);
   bchan = chancase.statements();
 }
   else
@@ -6075,34 +6075,31 @@ Select_statement::lower_two_case(Block* b)
 
   Expression* ref = Expression::make_temporary_reference(ts, loc);
   Expression* addr = Expression::make_unary(OPERATOR_AND, ref, loc);
-  Expression* okref = NULL;
-  if (chancase.closed() == NULL && chancase.closedvar() == NULL)
-{
-  // Simple receive.
-  // if selectnbrecv(, chan) { body } else { default body }
-  call = Runtime::make_call(Runtime::SELECTNBRECV, loc, 2, addr, 
chanref);
-}
-  else
-{
-  // Tuple receive.
-  // if selectnbrecv2(&ts, &ok, chan) { body } else { default body }
-
-  Type* booltype = Type::make_boolean_type();
-  Temporary_statement* okts = Statement::make_temporary(booltype, NULL,
-loc);
-  b->add_statement(okts);
-
-  okref = Expression::make_temporary_reference(okts, loc);
-  Expression* okaddr = Expression::make_unary(OPERATOR_AND, okref, 
loc);
-  call = Runtime::make_call(Runtime::SELECTNBRECV2, loc, 3, addr, 
okaddr,
-chanref);
-}
+
+  // selected, ok = selectnbrecv(&ts, chan)
+  Call_expression* call = Runtime::make_call(Runtime::SELECTNBRECV, loc, 2,
+addr, chanref);
+
+  Temporary_statement* selected_temp =
+   Statement::make_temporary(Type::make_boolean_type(),
+ Expression::make_call_result(call, 0),
+ loc);
+  b->add_statement(selected_temp);
+
+  Temporary_statement* ok_temp =
+   Statement::make_temporary(Type::make_boolean_type(),
+ Expression::make_call_result(call, 1),
+ loc);
+  b->add_statement(ok_temp);
+
+  cond = Expression::make_temporary_reference(selected_temp, loc);
 
   Location cloc = chancase.location();
   bchan = new Block(b, loc);
   if (chancase.val() != NULL && !chancase.val()->is_sink_expression())
 {
-  

Re: Go patch committed: Allow converting from slice to pointer-to-array

2021-08-03 Thread Ian Lance Taylor via Gcc-patches
On Mon, Aug 2, 2021 at 3:53 PM Ian Lance Taylor  wrote:
>
> The upcoming Go 1.17 release has a new language feature: it permits
> conversions from slice types to pointer-to-array types.  If the slice
> is too short, the conversion panics.  This patch implements this new
> feature in gccgo.  Bootstrapped and ran Go testsuite on
> x86_64-pc-linux-gnu.  Committed to mainline.

I didn't get the type checking right: I forgot to check that the
element types of the slice and array are identical.  Fixed with this
patches.  Bootstrapped and tested on x86_64-pc-linux-gnu.  Committed
to mainline.

Ian
7ff2742eacee93c7e7d9262d07c2496f87d801a7
diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index 95b9340b42d..801e039a155 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-0a4d612e6b211780b294717503fc739bbd1f509c
+54361805bd611d896042b879ee7f6d2d4d088537
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/gcc/go/gofrontend/expressions.cc b/gcc/go/gofrontend/expressions.cc
index 15c9eabc6bf..51a8b7e4322 100644
--- a/gcc/go/gofrontend/expressions.cc
+++ b/gcc/go/gofrontend/expressions.cc
@@ -3962,7 +3962,10 @@ Type_conversion_expression::do_lower(Gogo*, 
Named_object*,
   if (type->points_to() != NULL
   && type->points_to()->array_type() != NULL
   && !type->points_to()->is_slice_type()
-  && val->type()->is_slice_type())
+  && val->type()->is_slice_type()
+  && Type::are_identical(type->points_to()->array_type()->element_type(),
+val->type()->array_type()->element_type(),
+0, NULL))
 {
   Temporary_statement* val_temp = NULL;
   if (!val->is_multi_eval_safe())
diff --git a/gcc/go/gofrontend/types.cc b/gcc/go/gofrontend/types.cc
index 7c7b2eb8271..0c44186f507 100644
--- a/gcc/go/gofrontend/types.cc
+++ b/gcc/go/gofrontend/types.cc
@@ -846,7 +846,9 @@ Type::are_convertible(const Type* lhs, const Type* rhs, 
std::string* reason)
   if (rhs->is_slice_type()
   && lhs->points_to() != NULL
   && lhs->points_to()->array_type() != NULL
-  && !lhs->points_to()->is_slice_type())
+  && !lhs->points_to()->is_slice_type()
+  && Type::are_identical(lhs->points_to()->array_type()->element_type(),
+rhs->array_type()->element_type(), 0, reason))
 return true;
 
   // An unsafe.Pointer type may be converted to any pointer type or to


[PATCH] rs6000: Replace & by &&

2021-08-03 Thread Segher Boessenkool
No functional changes.  Tested on powerpc64-linux.  Committed.


Segher


2021-08-03  Segher Boessenkool  

* config/rs6000/vsx.md (*vsx_le_perm_store_<mode>): Use && instead of &.
---
 gcc/config/rs6000/vsx.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 6f6fc0bd8350..441735df9c3b 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1014,7 +1014,7 @@ (define_insn "*vsx_le_perm_store_<mode>"
  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q")
 (match_operand:VSX_LE_128 1 "vsx_register_operand" "+wa,r"))]
  "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR
-   & !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)"
+   && !altivec_indexed_or_indirect_operand (operands[0], <MODE>mode)"
   "@
#
#"
-- 
1.8.3.1



[PATCH] rs6000: "e" is not a free constraint letter

2021-08-03 Thread Segher Boessenkool
It is the prefix of the "es" and "eI" constraints.

Committing to trunk.


Segher


2021-08-03  Segher Boessenkool  

* config/rs6000/constraints.md: Remove "e" from the list of available
constraint characters.
---
 gcc/config/rs6000/constraints.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/rs6000/constraints.md b/gcc/config/rs6000/constraints.md
index 561ce9797af5..c8cff1a3038f 100644
--- a/gcc/config/rs6000/constraints.md
+++ b/gcc/config/rs6000/constraints.md
@@ -17,7 +17,7 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; .
 
-;; Available constraint letters: e k q t u A B C D S T
+;; Available constraint letters: k q t u A B C D S T
 
 ;; Register constraints
 
-- 
1.8.3.1



RE: [EXTERNAL] Re: [PATCH] gcov-profile/71672 Fix indirect call inlining with AutoFDO

2021-08-03 Thread Eugene Rozenfeld via Gcc-patches
Thank you for the reviews, Andy and Richard.

I split up the patch into 4 commits and pushed to trunk.

Eugene

-Original Message-
From: Richard Biener  
Sent: Monday, August 2, 2021 2:57 AM
To: Eugene Rozenfeld 
Cc: gcc-patches@gcc.gnu.org; mli...@suse.cz; Andi Kleen 
Subject: [EXTERNAL] Re: [PATCH] gcov-profile/71672 Fix indirect call inlining 
with AutoFDO

On Fri, Jul 30, 2021 at 9:09 AM Eugene Rozenfeld via Gcc-patches 
 wrote:
>
> This patch has the following changes:
>
> 1. The main fix is in auto-profile.c: the histogram value for
>indirect calls was incorrectly set up. That is fixed now.
>
> 2. Several tests now have -fdump-ipa-afdo-optimized instead of -fdump-ipa-afdo
>in dg-options so that the expected output can be found.
>
> 3. I increased the number of iterations in several tests so that perf can have
>enough sampling events.
>
> 4. indir-call-prof-2.c has -fno-early-inlining but AutoFDO can't work without
>early inlining (it needs to match the inlining of the profiled binary).
>I changed profopt.exp to always pass -fearly-inlining for AutoFDO.
>With that the indirect call inlining in indir-call-prof-2.c happens in the 
> early inliner
>so I changed the dg-final-use-autofdo.
>
> 5. create_gcov tool doesn't currently support dwarf 5 so I made a change in 
> profopt.exp
>to pass -gdwarf-4 when compiling the binary to profile.
>
> 6. I updated the invocation of create_gcov in profopt.exp to pass 
> -gcov_version=2.
>I recently made a change to create_gcov to support version 2:
>
> https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgith
> ub.com%2Fgoogle%2Fautofdo%2Fpull%2F117data=04%7C01%7CEugene.Rozen
> feld%40microsoft.com%7C92927d4029754d0d6b4708d9559be06d%7C72f988bf86f1
> 41af91ab2d7cd011db47%7C1%7C0%7C637634950245832767%7CUnknown%7CTWFpbGZs
> b3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D
> %7C1000sdata=Ex1OpS0gt9dpsBVIK71k7hvjJbfIkN%2BlRr%2BYD86%2FqEs%3D
> reserved=0
>
> 7. I removed useless -o perf.data from the invocation of gcc-auto-profile in
>target-supports.exp.
>
> With these changes the tests checking indirect call inlining in gcc.dg 
> and g++.dg are passing.

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> PR gcov-profile/71672
> * auto-profile.c (afdo_indirect_call): Fix the setup of the 
> histogram value for indirect calls.
>
> gcc/testsuite/ChangeLog:
> PR gcov-profile/71672
> * g++.dg/tree-prof/indir-call-prof.C: Fix options, increase the 
> number of iterations.
> * g++.dg/tree-prof/morefunc.C: Fix options, increase the number of 
> iterations.
> * g++.dg/tree-prof/reorder.C: Fix options, increase the number of 
> iterations.
> * gcc.dg/tree-prof/indir-call-prof-2.c: Fix options, fix 
> dg-final-use-autofdo, increase the number of iterations.
> * gcc.dg/tree-prof/indir-call-prof.c: Fix options.
> * lib/profopt.exp: Pass gdwarf-4 when compiling binary to profile; 
> pass -fearly-inlining when compiling with AutoFDO; pass -gcov_version=2 to 
> create_gcov.
> * lib/target-supports.exp: Remove unnecessary -o perf.data passed to 
> gcc-auto-profile.
> ---
>  gcc/auto-profile.c | 13 +
>  gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C   |  4 ++--
>  gcc/testsuite/g++.dg/tree-prof/morefunc.C  |  7 ---
>  gcc/testsuite/g++.dg/tree-prof/reorder.C   |  6 +++---
>  gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-2.c |  8 
>  gcc/testsuite/gcc.dg/tree-prof/indir-call-prof.c   |  2 +-
>  gcc/testsuite/lib/profopt.exp  |  6 +++---
>  gcc/testsuite/lib/target-supports.exp  |  2 +-
>  8 files changed, 27 insertions(+), 21 deletions(-)
>
> diff --git a/gcc/auto-profile.c b/gcc/auto-profile.c index 
> b23b82b2df4..4c1fc6b536b 100644
> --- a/gcc/auto-profile.c
> +++ b/gcc/auto-profile.c
> @@ -1009,13 +1009,18 @@ afdo_indirect_call (gimple_stmt_iterator *gsi, 
> const icall_target_map &map,
>
>histogram_value hist = gimple_alloc_histogram_value (
>cfun, HIST_TYPE_INDIR_CALL, stmt, callee);
> -  hist->n_counters = 3;
> +  hist->n_counters = 4;
>hist->hvalue.counters = XNEWVEC (gcov_type, hist->n_counters);
>gimple_add_histogram_value (cfun, stmt, hist);
>
> -  hist->hvalue.counters[0] = direct_call->profile_id;
> -  hist->hvalue.counters[1] = max_iter->second;
> -  hist->hvalue.counters[2] = total;
> +  // Total counter
> +  hist->hvalue.counters[0] = total;
> +  // Number of value/counter pairs
> +  hist->hvalue.counters[1] = 1;
> +  // Value
> +  hist->hvalue.counters[2] = direct_call->profile_id;  // Counter  
> + hist->hvalue.counters[3] = max_iter->second;
>
>if (!transform)
>  return;
> diff --git a/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C 
> b/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C
> index 3374744613e..b45417106d0 100644
> --- a/gcc/testsuite/g++.dg/tree-prof/indir-call-prof.C
> 

Re: [PATCH] by_pieces: Properly set m_max_size in op_by_pieces

2021-08-03 Thread H.J. Lu via Gcc-patches
On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu  wrote:
>
> 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> move is enabled since x86 uses vec_duplicate, which is enabled only when
> inter-unit move is enabled, to implement store_by_pieces.
> 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> compare_by_pieces.
>
> gcc/
>
> PR target/101742
> * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
> for compare_by_pieces.
> * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
> only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
>
> gcc/testsuite/
>
> PR target/101742
> * gcc.target/i386/pr101742a.c: New test.
> * gcc.target/i386/pr101742b.c: Likewise.
> ---
>  gcc/config/i386/i386.h| 20 +++-
>  gcc/expr.c|  6 +-
>  gcc/testsuite/gcc.target/i386/pr101742a.c | 16 
>  gcc/testsuite/gcc.target/i386/pr101742b.c |  4 
>  4 files changed, 36 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index bed9cd9da18..9b416abd5f4 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
>  /* STORE_MAX_PIECES is the number of bytes at a time that we can
> store efficiently.  */
>  #define STORE_MAX_PIECES \
> -  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> -   ? 64 \
> -   : ((TARGET_AVX \
> -   && !TARGET_PREFER_AVX128 \
> -   && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> -  ? 32 \
> -  : ((TARGET_SSE2 \
> - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> -? 16 : UNITS_PER_WORD)))
> +  (TARGET_INTER_UNIT_MOVES_TO_VEC \
> +   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> +  ? 64 \
> +  : ((TARGET_AVX \
> + && !TARGET_PREFER_AVX128 \
> + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> + ? 32 \
> + : ((TARGET_SSE2 \
> + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> + ? 16 : UNITS_PER_WORD))) \
> +   : UNITS_PER_WORD)
>
>  /* If a memory-to-memory move would take MOVE_RATIO or more simple
> move-instruction pairs, we will do a cpymem or libcall instead.
> diff --git a/gcc/expr.c b/gcc/expr.c
> index b65cfcfdcd1..2964b38b9a5 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> bool qi_vector_mode)
>: m_to (to, to_load, NULL, NULL),
>  m_from (from, from_load, from_cfn, from_cfn_data),
> -m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
> +m_len (len),
> +m_max_size (((!to_load && from == nullptr)
> +? STORE_MAX_PIECES
> +: (from_cfn != nullptr
> +   ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1),
>  m_push (push), m_qi_vector_mode (qi_vector_mode)
>  {
>int toi = m_to.get_addr_inc ();

This larger expr.c patch passes the proper MAX_PIECES directly.

> diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c 
> b/gcc/testsuite/gcc.target/i386/pr101742a.c
> new file mode 100644
> index 000..67ea40587dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2" } */
> +
> +int n2;
> +
> +__attribute__ ((simd)) char
> +w7 (void)
> +{
> +  short int xb = n2;
> +  int qp;
> +
> +  for (qp = 0; qp < 2; ++qp)
> +xb = xb < 1;
> +
> +  return xb;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c 
> b/gcc/testsuite/gcc.target/i386/pr101742b.c
> new file mode 100644
> index 000..ba19064077b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
> @@ -0,0 +1,4 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2 
> -mtune-ctrl=sse_unaligned_store_optimal" } */
> +
> +#include "pr101742a.c"
> --
> 2.31.1
>


-- 
H.J.
diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..66ac1986f02 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
   }
 
  public:
-  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool,
+  op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+		  void *, unsigned HOST_WIDE_INT, unsigned int, bool,
 		  bool = false);
   void run ();
 };
@@ -1122,8 +1122,8 @@ class op_by_pieces_d
and its associated FROM_CFN_DATA can be used to replace loads with
constant values.  LEN describes the length of the operation.  */
 
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
-rtx from, bool 

Re: [PATCH] PR fortran/100950 - ICE in output_constructor_regular_field, at varasm.c:5514

2021-08-03 Thread Harald Anlauf via Gcc-patches
Here's now my third attempt to fix this PR, taking into account
the comments by Tobias and Bernhard.

> > On 10.06.21 20:52, Harald Anlauf via Fortran wrote:
> > > +static bool
> > > +substring_has_constant_len (gfc_expr *e)
> > > +{
> > > +  ptrdiff_t istart, iend;
> > > +  size_t length;
> > > +  bool equal_length = false;
> > > +
> > > +  if (e->ts.type != BT_CHARACTER
> > > +  || !e->ref
> > > +  || e->ref->type != REF_SUBSTRING
> > 
> > Is there a reason why you do not handle:
> > 
> > type t
> >character(len=5) :: str1
> >character(len=:), allocatable :: str2
> > end type
> > type(t) :: x
> > 
> > allocate(x%str2, source="abd")
> > if (len (x%str)) /= 1) ...
> > if (len (x%str2(1:2) /= 2) ...
> > etc.
> > 
> > Namely: Search the last_ref = expr->ref->next->next ...?
> > and then check that lastref?

The mentioned search is now implemented.

Note, however, that gfc_simplify_len still won't handle neither
deferred strings nor their substrings.

I think there is nothing to simplify at compile time here.  Otherwise
there would be a conflict/inconsistency with type parameter inquiry,
see F2018:9.4.5(2):

"A deferred type parameter of a pointer that is not associated or
of an unallocated allocatable variable shall not be inquired about."

> >* * *
> > 
> > Slightly unrelated: I think the following does not violate
> > F2018's R916 / C923 – but is rejected, namely:
> >R916  type-param-inquiry  is  designator % type-param-name
> > the latter is 'len' or 'kind' for intrinsic types. And:
> >R901  designator is ...
> > or substring
> > But
> > 
> > character(len=5) :: str
> > print *, str(1:3)%len
> > end
> > 
> > fails with
> > 
> >  2 | print *, str(1:3)%len
> >|  1
> > Error: Syntax error in PRINT statement at (1)
> > 
> > 
> > Assuming you don't want to handle it, can you open a new PR?
> > Thanks!

I tried to look into this, but there appear to be several unrelated
issues requiring a separate treatment.  I therefore opened:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101735

> > > +  istart = gfc_mpz_get_hwi (e->ref->u.ss.start->value.integer);
> > > +  iend = gfc_mpz_get_hwi (e->ref->u.ss.end->value.integer);
> > > +  length = gfc_mpz_get_hwi (e->ref->u.ss.length->length->value.integer);
> > > +
> > > +  if (istart <= iend)
> > > +{
> > > +  if (istart < 1)
> > > + {
> > > +   gfc_error ("Substring start index (%ld) at %L below 1",
> > > +  (long) istart, &e->ref->u.ss.start->where);
> > 
> > As mentioned by Bernhard, you could use HOST_WIDE_INT_PRINT_DEC.
> > 
> > (It probably only matters on Windows which uses long == int = 32bit for
> > strings longer than INT_MAX.)

Done.

The updated patch regtests fine.  OK?

Thanks,
Harald


Fortran - simplify length of substring with constant bounds

gcc/fortran/ChangeLog:

PR fortran/100950
* simplify.c (substring_has_constant_len): New.
(gfc_simplify_len): Handle case of substrings with constant
bounds.

gcc/testsuite/ChangeLog:

PR fortran/100950
* gfortran.dg/pr100950.f90: New test.

diff --git a/gcc/fortran/simplify.c b/gcc/fortran/simplify.c
index c27b47aa98f..8f7fcec94c8 100644
--- a/gcc/fortran/simplify.c
+++ b/gcc/fortran/simplify.c
@@ -4512,6 +4512,69 @@ gfc_simplify_leadz (gfc_expr *e)
 }


+/* Check for constant length of a substring.  */
+
+static bool
+substring_has_constant_len (gfc_expr *e)
+{
+  gfc_ref *ref;
+  HOST_WIDE_INT istart, iend, length;
+  bool equal_length = false;
+
+  if (e->ts.type != BT_CHARACTER || e->ts.deferred)
+return false;
+
+  for (ref = e->ref; ref; ref = ref->next)
+if (ref->type != REF_COMPONENT)
+  break;
+
+  if (!ref
+  || ref->type != REF_SUBSTRING
+  || !ref->u.ss.start
+  || ref->u.ss.start->expr_type != EXPR_CONSTANT
+  || !ref->u.ss.end
+  || ref->u.ss.end->expr_type != EXPR_CONSTANT
+  || !ref->u.ss.length
+  || !ref->u.ss.length->length
+  || ref->u.ss.length->length->expr_type != EXPR_CONSTANT)
+return false;
+
+  /* Basic checks on substring starting and ending indices.  */
+  if (!gfc_resolve_substring (ref, &equal_length))
+return false;
+
+  istart = gfc_mpz_get_hwi (ref->u.ss.start->value.integer);
+  iend = gfc_mpz_get_hwi (ref->u.ss.end->value.integer);
+  length = gfc_mpz_get_hwi (ref->u.ss.length->length->value.integer);
+
+  if (istart <= iend)
+{
+  if (istart < 1)
+	{
+	  gfc_error ("Substring start index (" HOST_WIDE_INT_PRINT_DEC
+		 ") at %L below 1",
+		 istart, &ref->u.ss.start->where);
+	  return false;
+	}
+  if (iend > length)
+	{
+	  gfc_error ("Substring end index (" HOST_WIDE_INT_PRINT_DEC
+		 ") at %L exceeds string length",
+		 iend, &ref->u.ss.end->where);
+	  return false;
+	}
+  length = iend - istart + 1;
+}
+  else
+length = 0;
+
+  /* Fix substring length.  */
+  e->value.character.length = length;
+
+  return true;
+}
+
+
 gfc_expr *
 

Re: [PATCH] libstdc++: Skip atomic instructions in _Sp_counted_base::_M_release when both counts are 1

2021-08-03 Thread Jonathan Wakely via Gcc-patches
On Mon, 2 Aug 2021 at 14:29, Maged Michael wrote:
>
> This is the right patch. The previous one is missing noexcept. Sorry.
>
>
> On Mon, Aug 2, 2021 at 9:23 AM Maged Michael  wrote:
>>
>> Please find attached an updated patch after incorporating Jonathan's 
>> suggestions.
>>
>> Changes from the last patch include:
>> - Add a TSAN macro to bits/c++config.
>> - Use separate constexpr bool-s for the conditions for lock-freedom, 
>> double-width and alignment.
>> - Move the code in the optimized path to a separate function 
>> _M_release_double_width_cas.

Thanks for the updated patch. At a quick glance it looks great. I'll
apply it locally and test it tomorrow.


[PATCH] c++: parameterized requires-expr as default argument [PR101725]

2021-08-03 Thread Patrick Palka via Gcc-patches
Here we're rejecting the default template argument

  requires (T t) { x(t); }

because we consider the 't' in the requirement to be a local variable
(according to local_variable_p), and we generally forbid local variables
from appearing inside template arguments.  We can perhaps fix this by
giving special treatment to parameters introduced by requires-expressions,
but DR 2082 relaxed the restriction about local variables appearing inside
default arguments to permit them inside unevaluated operands thereof.
So this patch just implements DR 2082 which also fixes this PR since a
requires-expression is an unevaluated context.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk and perhaps 11?

PR c++/101725
DR 2082

gcc/cp/ChangeLog:

* cp-tree.h (unevaluated_p): Return true for REQUIRES_EXPR.
* decl.c (local_variable_p_walkfn): Don't walk into unevaluated
operands.
* parser.c (cp_parser_primary_expression) : Never
reject uses of local variables in unevaluated contexts.
* tree.c (cp_walk_subtrees) : Increment
cp_unevaluated_operand.  Use cp_walk_tree directly instead of
WALK_SUBTREE to avoid the goto.  Use REQUIRES_EXPR_REQS instead
of TREE_OPERAND directly.

gcc/testsuite/ChangeLog:

* g++.dg/DRs/dr2082.C: New test.
* g++.dg/cpp2a/concepts-uneval4.C: New test.
* g++.dg/cpp2a/concepts-uneval5.C: New test.
---
 gcc/cp/cp-tree.h  |  3 ++-
 gcc/cp/decl.c |  8 
 gcc/cp/parser.c   |  5 -
 gcc/cp/tree.c |  4 +++-
 gcc/testsuite/g++.dg/DRs/dr2082.C | 12 
 gcc/testsuite/g++.dg/cpp2a/concepts-uneval4.C | 12 
 6 files changed, 41 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/DRs/dr2082.C
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-uneval4.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 9a47a8787d6..6a8264b0c61 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -8494,7 +8494,8 @@ unevaluated_p (tree_code code)
   return (code == DECLTYPE_TYPE
  || code == ALIGNOF_EXPR
  || code == SIZEOF_EXPR
- || code == NOEXCEPT_EXPR);
+ || code == NOEXCEPT_EXPR
+ || code == REQUIRES_EXPR);
 }
 
 /* RAII class to push/pop the access scope for T.  */
diff --git a/gcc/cp/decl.c b/gcc/cp/decl.c
index 6fa6b9adc87..b0b492360af 100644
--- a/gcc/cp/decl.c
+++ b/gcc/cp/decl.c
@@ -14270,6 +14270,14 @@ static tree
 local_variable_p_walkfn (tree *tp, int *walk_subtrees,
 void * /*data*/)
 {
+  if (unevaluated_p (TREE_CODE (*tp)))
+{
+  /* DR 2082 permits local variables in unevaluated contexts
+within a default argument.  */
+  *walk_subtrees = 0;
+  return NULL_TREE;
+}
+
   if (local_variable_p (*tp)
   && (!DECL_ARTIFICIAL (*tp) || DECL_NAME (*tp) == this_identifier))
 return *tp;
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index 47bf7d9ad1f..8b551db2c8a 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -5971,7 +5971,10 @@ cp_parser_primary_expression (cp_parser *parser,
/* Check to see if DECL is a local variable in a context
   where that is forbidden.  */
if ((parser->local_variables_forbidden_p & LOCAL_VARS_FORBIDDEN)
-   && local_variable_p (decl))
+   && local_variable_p (decl)
+   /* DR 2082 permits local variables in unevaluated contexts
+  within a default argument.  */
+   && !cp_unevaluated_operand)
  {
const char *msg
  = (TREE_CODE (decl) == PARM_DECL
diff --git a/gcc/cp/tree.c b/gcc/cp/tree.c
index 8345396ec33..e8831b21802 100644
--- a/gcc/cp/tree.c
+++ b/gcc/cp/tree.c
@@ -5386,7 +5386,9 @@ cp_walk_subtrees (tree *tp, int *walk_subtrees_p, 
walk_tree_fn func,
   // walk the parameter list. Doing so causes false
   // positives in the pack expansion checker since the
   // requires parameters are introduced as pack expansions.
-  WALK_SUBTREE (TREE_OPERAND (*tp, 1));
+  ++cp_unevaluated_operand;
+  result = cp_walk_tree (&REQUIRES_EXPR_REQS (*tp), func, data, pset);
+  --cp_unevaluated_operand;
   *walk_subtrees_p = 0;
   break;
 
diff --git a/gcc/testsuite/g++.dg/DRs/dr2082.C 
b/gcc/testsuite/g++.dg/DRs/dr2082.C
new file mode 100644
index 000..84bb23f63f2
--- /dev/null
+++ b/gcc/testsuite/g++.dg/DRs/dr2082.C
@@ -0,0 +1,12 @@
+// DR 2082
+
+void f() {
+  int i;
+  extern void h(int x = sizeof(i));
+}
+
+class A {
+  void f(A* p = this) { } // { dg-error "this" }
+};
+
+int h(int a, int b = sizeof(a));
diff --git a/gcc/testsuite/g++.dg/cpp2a/concepts-uneval4.C 
b/gcc/testsuite/g++.dg/cpp2a/concepts-uneval4.C
new file mode 100644
index 000..1be27d1ab28
--- /dev/null
+++ 

[PATCH] c++: constexpr std::construct_at on empty field [PR101663]

2021-08-03 Thread Patrick Palka via Gcc-patches
Here during constexpr evaluation of

  std::construct_at(_M_value)

we find ourselves in cxx_eval_store_expression where the target object
is 'a._M_value' and the initializer is {}.  Since _M_value is an empty
[[no_unique_address]] member we don't create a sub-CONSTRUCTOR for it,
so we end up in the early exit code path for empty stores with mismatched
types and we trip over the assert therein

  gcc_assert (is_empty_class (TREE_TYPE (init)) && !lval);

because lval is true.  The reason it's true is because the INIT_EXPR in
question is the LHS of a COMPOUND_EXPR, and evaluation of the LHS is
always performed with lval=true for some reason.  This is the case ever
since r5-5900, before which we used to do the evaluation with
lval=false.

I'm not sure why we evaluate the LHS of a COMPOUND_EXPR with lval=true
(changing it to false survives bootstrap+regtest and is sufficient to
fix the PR), but regardless it's also straightforward enough to make the
relevant code path in cxx_eval_store_expression handle lval=true, which
is the approach this patch takes.

This patch also consolidates the duplicate implementations of
std::construct_at/destroy_at from some of the C++20 constexpr tests into
a common header file.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk/11?

PR c++/101663

gcc/cp/ChangeLog:

* constexpr.c (cxx_eval_store_expression): In the early exit
code path for mismatched types,
pass false instead of true for lval when evaluating the LHS.

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/construct_at.h: New convenience header that
defines minimal implementations of std::construct_at/destroy_at,
split out from ...
* g++.dg/cpp2a/constexpr-new5.C: ... here.
* g++.dg/cpp2a/constexpr-new6.C: Use the header.
* g++.dg/cpp2a/constexpr-new14.C: Likewise.
* g++.dg/cpp2a/constexpr-new20.C: New test.
---
 gcc/cp/constexpr.c   |  4 +-
 gcc/testsuite/g++.dg/cpp2a/constexpr-new14.C | 60 +-
 gcc/testsuite/g++.dg/cpp2a/constexpr-new20.C | 18 ++
 gcc/testsuite/g++.dg/cpp2a/constexpr-new5.C  | 60 +-
 gcc/testsuite/g++.dg/cpp2a/constexpr-new6.C  | 64 +---
 gcc/testsuite/g++.dg/cpp2a/construct_at.h| 62 +++
 6 files changed, 85 insertions(+), 183 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/constexpr-new20.C
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/construct_at.h

diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index 1af365d47b9..25d84a377d8 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -5588,8 +5588,8 @@ cxx_eval_store_expression (const constexpr_ctx *ctx, tree 
t,
   argument, which has the derived type rather than the base type.  In
   this situation, just evaluate the initializer and return, since
   there's no actual data to store.  */
- gcc_assert (is_empty_class (TREE_TYPE (init)) && !lval);
- return init;
+ gcc_assert (is_empty_class (TREE_TYPE (init)));
+ return lval ? target : init;
}
   CONSTRUCTOR_ELTS (*valp) = CONSTRUCTOR_ELTS (init);
   TREE_CONSTANT (*valp) = TREE_CONSTANT (init);
diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-new14.C 
b/gcc/testsuite/g++.dg/cpp2a/constexpr-new14.C
index fd6f6075ef0..26037397b1d 100644
--- a/gcc/testsuite/g++.dg/cpp2a/constexpr-new14.C
+++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-new14.C
@@ -1,65 +1,7 @@
 // PR c++/97195
 // { dg-do compile { target c++20 } }
 
-namespace std
-{
-  typedef __SIZE_TYPE__ size_t;
-
-  template 
-  struct allocator
-  {
-constexpr allocator () noexcept {}
-
-constexpr T *allocate (size_t n)
-{ return static_cast (::operator new (n * sizeof(T))); }
-
-constexpr void
-deallocate (T *p, size_t n)
-{ ::operator delete (p); }
-  };
-
-  template 
-  U __declval (int);
-  template 
-  T __declval (long);
-  template 
-  auto declval () noexcept -> decltype (__declval (0));
-
-  template 
-  struct remove_reference
-  { typedef T type; };
-  template 
-  struct remove_reference
-  { typedef T type; };
-  template 
-  struct remove_reference
-  { typedef T type; };
-
-  template 
-  constexpr T &&
-  forward (typename std::remove_reference::type ) noexcept
-  { return static_cast (t); }
-
-  template
-  constexpr T &&
-  forward (typename std::remove_reference::type &) noexcept
-  { return static_cast (t); }
-
-  template 
-  constexpr auto
-  construct_at (T *l, A &&... a)
-  noexcept (noexcept (::new ((void *) 0) T (std::declval ()...)))
-  -> decltype (::new ((void *) 0) T (std::declval ()...))
-  { return ::new ((void *) l) T (std::forward (a)...); }
-
-  template 
-  constexpr inline void
-  destroy_at (T *l)
-  { l->~T (); }
-}
-
-inline void *operator new (std::size_t, void *p) noexcept
-{ return p; }
+#include "construct_at.h"
 
 constexpr bool
 foo ()
diff --git 

Re: Sanity check that 'Init' doesn't appear without 'Var' in '*.opt' files

2021-08-03 Thread Joseph Myers
On Tue, 3 Aug 2021, Thomas Schwinge wrote:

> Hi!
> 
> Is the attached OK to push?
> 
> No violations found per:
> 
> $ find -type f -name \*.opt | xargs grep -F 'Init(' | grep -v -F 'Var('
> 
> ..., and manually verified the error condition.

OK.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: Re: [PATCH] Fix ICE when mixing VLAs and statement expressions [PR91038]

2021-08-03 Thread Martin Uecker
Am Dienstag, den 03.08.2021, 11:26 +0200 schrieb Richard Biener:
> On Tue, Aug 3, 2021 at 10:28 AM Martin Uecker  wrote:


> 
> Does the same issue arise with writing the testcases as
> 
>  ({ ... }) + i;
> 
> ?  How can we fix it then if you also need to support
> 
>  i + ({ ...});
> 
> ?

Here, the FE always moves the pointer to the right and
then produces something like:

*((int *) TARGET_EXPR  + ((sizetype) SAVE_EXPR  + 1) * 20);


So these are the cases which already work without
the path, although maybe it is wrong to have
the n in the SAVE_EXPR?

It gets gimplified to something like this,
which works:

  int[0:D.1959][0:D.1955] * D.1952;
  int n.0;
  sizetype D.1955;
  sizetype D.1959;
 
  {
int n;
int[0:D.1959][0:D.1955] * x;

n = 10;
n.0 = n;

...

_32 = (sizetype) n.0;
_33 = (sizetype) n.1;
_34 = _32 * _33;
_35 = _34 * 4;
x = __builtin_malloc (_35);
D.1952 = x;
  }
  _36 = (sizetype) n.0;
  _37 = _36 + 1;
  _38 = _37 * 20;
  _39 = D.1952 + _38;
 

For the array ref, the FE produces:


  (*TARGET_EXPR )[5][5];


With the patch, we get something like
the following in GIMPLE, which seems correct:

  int[0:D.1958][0:D.1954] * D.1951;
  int n.0;
  sizetype D.1954;

  {
int n;
int[0:D.1958][0:D.1954] * x;

n = 10;
n.0 = n;
 
_7 = (sizetype) n.0;
_8 = _7 * 4;
D.1956 = _8;

n.1 = n
 
_22 = (sizetype) n.0;
_23 = (sizetype) n.1;
_24 = _22 * _23;
_25 = _24 * 4;
x = __builtin_malloc (_25);
D.1951 = x;
  }
  _26 = D.1956 /[ex] 4;
  c = (*D.1951)[5]{lb: 0 sz: _26 * 4}[5];
 

MArtin



Re: [PATCH] Objective-C: don't require redundant -fno-objc-sjlj-exceptions for the NeXT v2 ABI

2021-08-03 Thread Iain Sandoe via Gcc-patches



> On 2 Aug 2021, at 22:37, Matt Jacobson via Gcc-patches 
>  wrote:
> 
>> On Aug 2, 2021, at 5:09 PM, Eric Gallager  wrote:
>> 
>> On Wed, Jul 28, 2021 at 11:36 PM Matt Jacobson via Gcc-patches
>>  wrote:
>>> 
>>> As is, an invocation of GCC with -fnext-runtime -fobjc-abi-version=2 
>>> crashes,
>>> unless target-specific code adds an implicit -fno-objc-sjlj-exceptions 
>>> (which
>>> Darwin does).
>>> 
>>> This patch makes the general case not crash.
>>> 
>>> I don't have commit access, so if this patch is suitable, I'd need someone 
>>> else
>>> to commit it for me.  Thanks.
>> 
>> Is there a bug open for the issue that this fixes? Just wondering for
>> cross-referencing purposes...
> 
> No, I didn’t file a bug for this one, just sent the patch directly.  Hope 
> that’s OK.  If not, happy to file one.

I have this on my TODO (and in my “to apply” patch queue - IMO it’s OK as an 
interim
solution - but I think in the longer term it would be better to make 
fobjc-sjlj-exceptions
into a NOP, since the exception models are fixed for NeXT runtime (unless you 
have
some intent to update the 32bit one to use DWARF unwinding ;-) ).

thanks
Iain



Re: [PATCH 0/3] arm: fix problems when targetting extended FPUs [PR101723]

2021-08-03 Thread Richard Earnshaw via Gcc-patches




On 03/08/2021 16:04, Christophe Lyon via Gcc-patches wrote:

On Mon, Aug 2, 2021 at 4:57 PM Richard Earnshaw  wrote:


This patch series addresses an issue that has come to light due to a
change in the way GAS handles .fpu directives in the assembler.  A fix
to the assembler made in binutils 2.34 to clear out all features
realated to the FPU when .fpu is emitted has started causing problems
for GCC because of the order in which we emit .fpu and .arch_extension
directives.  To fully address this we need to re-organize the way in
which the compiler does this.

I'll hold of pushing the patches for a couple of days.  Although I've
gone through the testsuite quite carefully and run this through
several configurations, it's possible that this may have some impact
on the testsuite that I've missed.  Christophe, is the any chance you
can run this through your test environment before I commit this?



Sorry for the delay, still unpacking emails after holidays.

Yes I can run the validation for these patches. I think you mean with all 3
patches combined, not 3 validations (patch 1, patches 1+2, patches 1-3) ?


Yes, the first two are trivial changes that just support the interesting 
one, which is the final patch.


R.


Thanks,

Christophe



R.

Richard Earnshaw (3):
   arm: ensure the arch_name is always set for the build target
   arm: Don't reconfigure globals in arm_configure_build_target
   arm: reorder assembler architecture directives [PR101723]

  gcc/config/arm/arm-c.c|   1 +
  gcc/config/arm/arm-cpus.in|   1 +
  gcc/config/arm/arm.c  | 190 --
  gcc/testsuite/gcc.target/arm/attr-neon.c  |   9 +-
  gcc/testsuite/gcc.target/arm/attr-neon2.c |  35 +++-
  gcc/testsuite/gcc.target/arm/attr-neon3.c |  43 +++-
  .../arm/cortex-m55-nofp-flag-hard.c   |   2 +-
  .../arm/cortex-m55-nofp-flag-softfp.c |   2 +-
  .../arm/cortex-m55-nofp-nomve-flag-softfp.c   |   2 +-
  .../gcc.target/arm/mve/intrinsics/mve_fpu1.c  |   5 +-
  .../gcc.target/arm/mve/intrinsics/mve_fpu2.c  |   5 +-
  gcc/testsuite/gcc.target/arm/pr98636.c|   3 +-
  12 files changed, 153 insertions(+), 145 deletions(-)

--
2.25.1




[PATCH 7/7] fortran: Ignore unused args in scalarization [PR97896]

2021-08-03 Thread Mikael Morin via Gcc-patches

The KIND argument of the INDEX intrinsic is a compile time constant
that is used at compile time only to resolve to a kind-specific library
method.  It is otherwise completely ignored at runtime, and there is
no code generated for it as the library procedure has no kind argument.
This confuses the scalarizer which expects to see every argument
of elemental functions to be used when calling a procedure.
This change removes the argument from the scalarization lists
at the beginning of the scalarization process, so that the argument
is completely ignored.

gcc/fortran/
PR fortran/97896
* gfortran.h (gfc_dummy_arg::get_name): New method.
(gfc_formal_arglist::get_name, gfc_intrinsic_arg::get_name):
Declare new methods.
* symbol.c (gfc_formal_arglist::get_name): Implement new method.
* intrinsic.c (gfc_intrinsic_arg::get_name): Same.
* trans-array.h (gfc_get_intrinsic_for_expr,
gfc_get_proc_ifc_for_expr): New.
* trans-array.c (gfc_get_intrinsic_for_expr,
arg_evaluated_for_scalarization): New.
(gfc_walk_elemental_function_args): Add intrinsic procedure
as argument.  Check arg_evaluated_for_scalarization.
* trans-intrinsic.c (gfc_walk_intrinsic_function): Update call.
* trans-stmt.c (get_intrinsic_for_code): New.
(gfc_trans_call): Update call.

gcc/testsuite/
PR fortran/97896
* gfortran.dg/index_5.f90: New.
---
 gcc/fortran/gfortran.h|  3 ++
 gcc/fortran/intrinsic.c   |  6 +++
 gcc/fortran/symbol.c  |  6 +++
 gcc/fortran/trans-array.c | 53 ++-
 gcc/fortran/trans-array.h |  3 ++
 gcc/fortran/trans-intrinsic.c |  1 +
 gcc/fortran/trans-stmt.c  | 20 ++
 gcc/testsuite/gfortran.dg/index_5.f90 | 23 
 8 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gfortran.dg/index_5.f90

diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 627a3480ef1..6d9af76c9fc 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -1136,6 +1136,7 @@ gfc_component;
 class gfc_dummy_arg
 {
 public:
+  virtual const char *get_name () const = 0;
   virtual const gfc_typespec & get_typespec () const = 0;
   virtual bool is_optional () const = 0;
 };
@@ -1149,6 +1150,7 @@ struct gfc_formal_arglist : public gfc_dummy_arg
   /* Points to the next formal argument.  */
   struct gfc_formal_arglist *next;
 
+  virtual const char *get_name () const FINAL OVERRIDE;
   virtual const gfc_typespec & get_typespec () const FINAL OVERRIDE;
   virtual bool is_optional () const FINAL OVERRIDE;
 };
@@ -2183,6 +2185,7 @@ struct gfc_intrinsic_arg : public gfc_dummy_arg
 
   struct gfc_intrinsic_arg *next;
 
+  virtual const char *get_name () const FINAL OVERRIDE;
   virtual const gfc_typespec & get_typespec () const FINAL OVERRIDE;
   virtual bool is_optional () const FINAL OVERRIDE;
 };
diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index b3e907ba3b8..af4da7ea7d3 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -5472,6 +5472,12 @@ gfc_warn_intrinsic_shadow (const gfc_symbol* sym, bool in_module, bool func)
 }
 
 
+const char *
+gfc_intrinsic_arg::get_name () const
+{
+  return name;
+}
+
 const gfc_typespec &
 gfc_intrinsic_arg::get_typespec () const
 {
diff --git a/gcc/fortran/symbol.c b/gcc/fortran/symbol.c
index 59f0d0385a0..9d1e2f876dc 100644
--- a/gcc/fortran/symbol.c
+++ b/gcc/fortran/symbol.c
@@ -5261,6 +5261,12 @@ gfc_sym_get_dummy_args (gfc_symbol *sym)
 }
 
 
+const char *
+gfc_formal_arglist::get_name () const
+{
+  return sym->name;
+}
+
 const gfc_typespec &
 gfc_formal_arglist::get_typespec () const
 {
diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
index 7d85abb181f..1fe48c22b93 100644
--- a/gcc/fortran/trans-array.c
+++ b/gcc/fortran/trans-array.c
@@ -11200,6 +11200,51 @@ gfc_get_proc_ifc_for_expr (gfc_expr *procedure_ref)
 }
 
 
+/* Given an expression referring to an intrinsic function call,
+   return the intrinsic symbol.  */
+
+gfc_intrinsic_sym *
+gfc_get_intrinsic_for_expr (gfc_expr *call)
+{
+  if (call == NULL)
+return NULL;
+
+  /* Normal procedure case.  */
+  if (call->expr_type == EXPR_FUNCTION)
+return call->value.function.isym;
+  else
+return NULL;
+}
+
+
+/* Indicates whether an argument to an intrinsic function should be used in
+   scalarization.  It is usually the case, except for some intrinsics
+   requiring the value to be constant, and using the value at compile time only.
+   As the value is not used at runtime in those cases, we don’t produce code
+   for it, and it should not be visible to the scalarizer.  */
+
+static bool
+arg_evaluated_for_scalarization (gfc_intrinsic_sym *function,
+ gfc_dummy_arg *dummy_arg)
+{
+  if (function != NULL)
+{
+  switch (function->id)
+	{
+	  case GFC_ISYM_INDEX:
+	if (strcmp ("kind", 

[PATCH 6/7] Revert "Remove KIND argument from INDEX so it does not mess up scalarization."

2021-08-03 Thread Mikael Morin via Gcc-patches

This reverts commit d09847357b965a2c2cda063827ce362d4c9c86f2 except for
its testcase.

gcc/fortran/
* intrinsic.c (add_sym_4ind): Remove.
(add_functions): Use add_sym4 instead of add_sym4ind.
Don’t special case the index intrinsic.
* iresolve.c (gfc_resolve_index_func): Use the individual arguments
directly instead of the full argument list.
* intrinsic.h (gfc_resolve_index_func): Update the declaration
accordingly.
* trans-decl.c (gfc_get_extern_function_decl): Don’t modify the
list of arguments in the case of the index intrinsic.
---
 gcc/fortran/intrinsic.c  | 48 ++--
 gcc/fortran/intrinsic.h  |  3 ++-
 gcc/fortran/iresolve.c   | 21 --
 gcc/fortran/trans-decl.c | 24 +---
 4 files changed, 14 insertions(+), 82 deletions(-)

diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index 8d5546ce19f..b3e907ba3b8 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -893,39 +893,6 @@ add_sym_4 (const char *name, gfc_isym_id id, enum klass cl, int actual_ok, bt ty
 	   (void *) 0);
 }
 
-/* Add a symbol to the function list where the function takes 4
-   arguments and resolution may need to change the number or
-   arrangement of arguments. This is the case for INDEX, which needs
-   its KIND argument removed.  */
-
-static void
-add_sym_4ind (const char *name, gfc_isym_id id, enum klass cl, int actual_ok,
-	  bt type, int kind, int standard,
-	  bool (*check) (gfc_expr *, gfc_expr *, gfc_expr *, gfc_expr *),
-	  gfc_expr *(*simplify) (gfc_expr *, gfc_expr *, gfc_expr *,
- gfc_expr *),
-	  void (*resolve) (gfc_expr *, gfc_actual_arglist *),
-	  const char *a1, bt type1, int kind1, int optional1,
-	  const char *a2, bt type2, int kind2, int optional2,
-	  const char *a3, bt type3, int kind3, int optional3,
-	  const char *a4, bt type4, int kind4, int optional4 )
-{
-  gfc_check_f cf;
-  gfc_simplify_f sf;
-  gfc_resolve_f rf;
-
-  cf.f4 = check;
-  sf.f4 = simplify;
-  rf.f1m = resolve;
-
-  add_sym (name, id, cl, actual_ok, type, kind, standard, cf, sf, rf,
-	   a1, type1, kind1, optional1, INTENT_IN,
-	   a2, type2, kind2, optional2, INTENT_IN,
-	   a3, type3, kind3, optional3, INTENT_IN,
-	   a4, type4, kind4, optional4, INTENT_IN,
-	   (void *) 0);
-}
-
 
 /* Add a symbol to the subroutine list where the subroutine takes
4 arguments.  */
@@ -2229,11 +2196,11 @@ add_functions (void)
 
   /* The resolution function for INDEX is called gfc_resolve_index_func
  because the name gfc_resolve_index is already used in resolve.c.  */
-  add_sym_4ind ("index", GFC_ISYM_INDEX, CLASS_ELEMENTAL, ACTUAL_YES,
-		BT_INTEGER, di, GFC_STD_F77,
-		gfc_check_index, gfc_simplify_index, gfc_resolve_index_func,
-		stg, BT_CHARACTER, dc, REQUIRED, ssg, BT_CHARACTER, dc, REQUIRED,
-		bck, BT_LOGICAL, dl, OPTIONAL, kind, BT_INTEGER, di, OPTIONAL);
+  add_sym_4 ("index", GFC_ISYM_INDEX, CLASS_ELEMENTAL, ACTUAL_YES,
+	 BT_INTEGER, di, GFC_STD_F77,
+	 gfc_check_index, gfc_simplify_index, gfc_resolve_index_func,
+	 stg, BT_CHARACTER, dc, REQUIRED, ssg, BT_CHARACTER, dc, REQUIRED,
+	 bck, BT_LOGICAL, dl, OPTIONAL, kind, BT_INTEGER, di, OPTIONAL);
 
   make_generic ("index", GFC_ISYM_INDEX, GFC_STD_F77);
 
@@ -4539,10 +4506,9 @@ resolve_intrinsic (gfc_intrinsic_sym *specific, gfc_expr *e)
 
   arg = e->value.function.actual;
 
-  /* Special case hacks for MIN, MAX and INDEX.  */
+  /* Special case hacks for MIN and MAX.  */
   if (specific->resolve.f1m == gfc_resolve_max
-  || specific->resolve.f1m == gfc_resolve_min
-  || specific->resolve.f1m == gfc_resolve_index_func)
+  || specific->resolve.f1m == gfc_resolve_min)
 {
   (*specific->resolve.f1m) (e, arg);
   return;
diff --git a/gcc/fortran/intrinsic.h b/gcc/fortran/intrinsic.h
index 2148f89e194..b195e0b271a 100644
--- a/gcc/fortran/intrinsic.h
+++ b/gcc/fortran/intrinsic.h
@@ -521,7 +521,8 @@ void gfc_resolve_ibits (gfc_expr *, gfc_expr *, gfc_expr *, gfc_expr *);
 void gfc_resolve_ibset (gfc_expr *, gfc_expr *, gfc_expr *);
 void gfc_resolve_image_index (gfc_expr *, gfc_expr *, gfc_expr *);
 void gfc_resolve_image_status (gfc_expr *, gfc_expr *, gfc_expr *);
-void gfc_resolve_index_func (gfc_expr *, gfc_actual_arglist *);
+void gfc_resolve_index_func (gfc_expr *, gfc_expr *, gfc_expr *, gfc_expr *,
+			 gfc_expr *);
 void gfc_resolve_ierrno (gfc_expr *);
 void gfc_resolve_ieor (gfc_expr *, gfc_expr *, gfc_expr *);
 void gfc_resolve_ichar (gfc_expr *, gfc_expr *, gfc_expr *);
diff --git a/gcc/fortran/iresolve.c b/gcc/fortran/iresolve.c
index e17fe45f080..598c0409b66 100644
--- a/gcc/fortran/iresolve.c
+++ b/gcc/fortran/iresolve.c
@@ -1276,27 +1276,16 @@ gfc_resolve_ior (gfc_expr *f, gfc_expr *i, gfc_expr *j)
 
 
 void
-gfc_resolve_index_func (gfc_expr *f, gfc_actual_arglist *a)
+gfc_resolve_index_func (gfc_expr *f, gfc_expr *str,
+	

[PATCH 5/7] fortran: Delete redundant missing_arg_type field

2021-08-03 Thread Mikael Morin via Gcc-patches

Now that we can get information about an actual arg's associated
dummy using the associated_dummy attribute, the field missing_arg_type
contains redundant information.
This removes it.

gcc/fortran/
* gfortran.h (gfc_actual_arglist::missing_arg_type): Remove.
* interface.c (gfc_compare_actual_formal): Remove
missing_arg_type initialization.
* intrinsic.c (sort_actual): Ditto.
* trans-expr.c (gfc_conv_procedure_call): Use associated_dummy
and get_typespec to get the dummy argument type.
---
 gcc/fortran/gfortran.h   | 5 -
 gcc/fortran/interface.c  | 5 -
 gcc/fortran/intrinsic.c  | 5 +
 gcc/fortran/trans-expr.c | 7 +--
 4 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index edad3d9e98c..627a3480ef1 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -1166,11 +1166,6 @@ typedef struct gfc_actual_arglist
   /* Alternate return label when the expr member is null.  */
   struct gfc_st_label *label;
 
-  /* This is set to the type of an eventual omitted optional
- argument. This is used to determine if a hidden string length
- argument has to be added to a function call.  */
-  bt missing_arg_type;
-
   gfc_param_spec_type spec_type;
 
   struct gfc_expr *expr;
diff --git a/gcc/fortran/interface.c b/gcc/fortran/interface.c
index b763f87e8bd..c51ec4c124e 100644
--- a/gcc/fortran/interface.c
+++ b/gcc/fortran/interface.c
@@ -3569,11 +3569,6 @@ gfc_compare_actual_formal (gfc_actual_arglist **ap, gfc_formal_arglist *formal,
   if (*ap == NULL && n > 0)
 *ap = new_arg[0];
 
-  /* Note the types of omitted optional arguments.  */
-  for (a = *ap, f = formal; a; a = a->next, f = f->next)
-if (a->expr == NULL && a->label == NULL)
-  a->missing_arg_type = f->sym->ts.type;
-
   return true;
 }
 
diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index 007cac053cb..8d5546ce19f 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -4430,10 +4430,7 @@ do_sort:
 	}
 
   if (a == NULL)
-	{
-	  a = gfc_get_actual_arglist ();
-	  a->missing_arg_type = f->ts.type;
-	}
+	a = gfc_get_actual_arglist ();
 
   a->associated_dummy = f;
 
diff --git a/gcc/fortran/trans-expr.c b/gcc/fortran/trans-expr.c
index b18a9ec9799..4806ebac56e 100644
--- a/gcc/fortran/trans-expr.c
+++ b/gcc/fortran/trans-expr.c
@@ -5831,7 +5831,9 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
 		{
 		  /* Pass a NULL pointer for an absent arg.  */
 		  parmse.expr = null_pointer_node;
-		  if (arg->missing_arg_type == BT_CHARACTER)
+		  if (arg->associated_dummy
+		  && arg->associated_dummy->get_typespec ().type
+			 == BT_CHARACTER)
 		parmse.string_length = build_int_cst (gfc_charlen_type_node,
 			  0);
 		}
@@ -5848,7 +5850,8 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
 			  || !CLASS_DATA (fsym)->attr.allocatable));
 	  gfc_init_se (&parmse, NULL);
 	  parmse.expr = null_pointer_node;
-	  if (arg->missing_arg_type == BT_CHARACTER)
+	  if (arg->associated_dummy
+	  && arg->associated_dummy->get_typespec ().type == BT_CHARACTER)
 	parmse.string_length = build_int_cst (gfc_charlen_type_node, 0);
 	}
   else if (fsym && fsym->ts.type == BT_CLASS


[PATCH 4/7] fortran: simplify elemental arguments walking

2021-08-03 Thread Mikael Morin via Gcc-patches

This adds two methods to the abstract gfc_dummy_arg and makes
usage of them to simplify a bit the walking of elemental procedure
arguments for scalarization.  As information about dummy arguments
can be obtained from the actual argument through the just-introduced
associated_dummy field, there is no need to carry around the procedure
interface and walk dummy arguments manually together with actual arguments.

gcc/fortran/
* gfortran.h (gfc_dummy_arg::get_typespec,
gfc_dummy_arg::is_optional): Declare new methods.
(gfc_formal_arglist::get_typespec,
gfc_formal_arglist::is_optional): Same.
(gfc_intrinsic_arg::get_typespec,
gfc_intrinsic_arg::is_optional): Same.
* symbol.c (gfc_formal_arglist::get_typespec,
gfc_formal_arglist::is_optional): Implement new methods.
* intrinsic.c (gfc_intrinsic_arg::get_typespec,
gfc_intrinsic_arg::is_optional): Same.
* trans.h (gfc_ss_info::dummy_arg): Use the more general
interface as declaration type.
* trans-array.c (gfc_scalar_elemental_arg_saved_as_reference):
use get_typespec_method to get the type.
(gfc_walk_elemental_function_args): Remove proc_ifc argument.
Get info about the dummy arg using the associated_dummy field.
* trans-array.h (gfc_walk_elemental_function_args): Update declaration.
* trans-intrinsic.c (gfc_walk_intrinsic_function):
Update call to gfc_walk_elemental_function_args.
* trans-stmt.c (gfc_trans_call): Ditto.
(get_proc_ifc_for_call): Remove.
---
 gcc/fortran/gfortran.h|  9 +
 gcc/fortran/intrinsic.c   | 13 +
 gcc/fortran/symbol.c  | 13 +
 gcc/fortran/trans-array.c | 22 ++
 gcc/fortran/trans-array.h |  2 +-
 gcc/fortran/trans-intrinsic.c |  2 +-
 gcc/fortran/trans-stmt.c  | 22 --
 gcc/fortran/trans.h   |  4 ++--
 8 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 78b43a31a9a..edad3d9e98c 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -1135,6 +1135,9 @@ gfc_component;
 /* dummy arg of either an intrinsic or a user-defined procedure.  */
 class gfc_dummy_arg
 {
+public:
+  virtual const gfc_typespec & get_typespec () const = 0;
+  virtual bool is_optional () const = 0;
 };
 
 
@@ -1145,6 +1148,9 @@ struct gfc_formal_arglist : public gfc_dummy_arg
   struct gfc_symbol *sym;
   /* Points to the next formal argument.  */
   struct gfc_formal_arglist *next;
+
+  virtual const gfc_typespec & get_typespec () const FINAL OVERRIDE;
+  virtual bool is_optional () const FINAL OVERRIDE;
 };
 
 #define GFC_NEW(T) new (XCNEW (T)) T
@@ -2181,6 +2187,9 @@ struct gfc_intrinsic_arg : public gfc_dummy_arg
   ENUM_BITFIELD (sym_intent) intent:2;
 
   struct gfc_intrinsic_arg *next;
+
+  virtual const gfc_typespec & get_typespec () const FINAL OVERRIDE;
+  virtual bool is_optional () const FINAL OVERRIDE;
 };
 
 #define gfc_get_intrinsic_arg() GFC_NEW (gfc_intrinsic_arg)
diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index ef5da389434..007cac053cb 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -5507,3 +5507,16 @@ gfc_warn_intrinsic_shadow (const gfc_symbol* sym, bool in_module, bool func)
 		 " only be called via an explicit interface or if declared"
 		 " EXTERNAL.", sym->name, >declared_at);
 }
+
+
+const gfc_typespec &
+gfc_intrinsic_arg::get_typespec () const
+{
+  return ts;
+}
+
+bool
+gfc_intrinsic_arg::is_optional () const
+{
+  return optional;
+}
diff --git a/gcc/fortran/symbol.c b/gcc/fortran/symbol.c
index 6d61bf4982b..59f0d0385a0 100644
--- a/gcc/fortran/symbol.c
+++ b/gcc/fortran/symbol.c
@@ -5259,3 +5259,16 @@ gfc_sym_get_dummy_args (gfc_symbol *sym)
 
   return dummies;
 }
+
+
+const gfc_typespec &
+gfc_formal_arglist::get_typespec () const
+{
+  return sym->ts;
+}
+
+bool
+gfc_formal_arglist::is_optional () const
+{
+  return sym->attr.optional;
+}
diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
index 0d013defdbb..7d85abb181f 100644
--- a/gcc/fortran/trans-array.c
+++ b/gcc/fortran/trans-array.c
@@ -2879,7 +2879,7 @@ gfc_scalar_elemental_arg_saved_as_reference (gfc_ss_info * ss_info)
   /* If the expression is of polymorphic type, it's actual size is not known,
  so we avoid copying it anywhere.  */
   if (ss_info->data.scalar.dummy_arg
-  && ss_info->data.scalar.dummy_arg->ts.type == BT_CLASS
+  && ss_info->data.scalar.dummy_arg->get_typespec ().type == BT_CLASS
   && ss_info->expr->ts.type == BT_CLASS)
 return true;
 
@@ -11207,9 +11207,8 @@ gfc_get_proc_ifc_for_expr (gfc_expr *procedure_ref)
 
 gfc_ss *
 gfc_walk_elemental_function_args (gfc_ss * ss, gfc_actual_arglist *arg,
-  gfc_symbol *proc_ifc, gfc_ss_type type)
+  gfc_ss_type type)
 {
-  gfc_formal_arglist *dummy_arg;
   int scalar;
   gfc_ss *head;
  

[PATCH 3/7] fortran: Reverse actual vs dummy argument mapping

2021-08-03 Thread Mikael Morin via Gcc-patches

There was originally no way from an actual argument to get
to the corresponding dummy argument, even if the job of sorting
and matching actual with dummy arguments was done.
The closest was a field named actual in gfc_intrinsic_arg that was
used as scratch data when sorting arguments of one specific call.
However that value was overwritten later on as arguments of another
call to the same procedure were sorted and matched.

This change removes that field and adds instead a new field
associated_dummy in gfc_actual_arglist.  This field uses the just
introduced gfc_dummy_arg interface, which makes it usable with
both external and intrinsic procedure dummy arguments.

As the removed field was used in the code sorting and matching arguments,
that code has to be updated.  Two local vectors with matching indices
are introduced for respectively dummy and actual arguments, and the
loops are modified to use indices and update those argument vectors.

gcc/fortran/
* gfortran.h (gfc_actual_arglist): New field associated_dummy.
(gfc_intrinsic_arg): Remove field actual.
* interface.c (gfc_compare_actual): Initialize associated_dummy.
* intrinsic.c (sort_actual):  Add argument vectors.
Use loops with indices on argument vectors.
Initialize associated_dummy.
---
 gcc/fortran/gfortran.h  |  6 +-
 gcc/fortran/interface.c |  9 +++--
 gcc/fortran/intrinsic.c | 31 ---
 3 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 031e46d1457..78b43a31a9a 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -1168,6 +1168,11 @@ typedef struct gfc_actual_arglist
   gfc_param_spec_type spec_type;
 
   struct gfc_expr *expr;
+
+  /*  The dummy arg this actual arg is associated with, if the interface
+  is explicit.  NULL otherwise.  */
+  gfc_dummy_arg *associated_dummy;
+
   struct gfc_actual_arglist *next;
 }
 gfc_actual_arglist;
@@ -2174,7 +2179,6 @@ struct gfc_intrinsic_arg : public gfc_dummy_arg
   gfc_typespec ts;
   unsigned optional:1, value:1;
   ENUM_BITFIELD (sym_intent) intent:2;
-  gfc_actual_arglist *actual;
 
   struct gfc_intrinsic_arg *next;
 };
diff --git a/gcc/fortran/interface.c b/gcc/fortran/interface.c
index 9e3e8aa9da9..b763f87e8bd 100644
--- a/gcc/fortran/interface.c
+++ b/gcc/fortran/interface.c
@@ -3131,6 +3131,8 @@ gfc_compare_actual_formal (gfc_actual_arglist **ap, gfc_formal_arglist *formal,
 			   "call at %L", where);
 	  return false;
 	}
+  else
+	a->associated_dummy = f;
 
   if (a->expr == NULL)
 	{
@@ -3546,9 +3548,12 @@ gfc_compare_actual_formal (gfc_actual_arglist **ap, gfc_formal_arglist *formal,
   /* The argument lists are compatible.  We now relink a new actual
  argument list with null arguments in the right places.  The head
  of the list remains the head.  */
-  for (i = 0; i < n; i++)
+  for (f = formal, i = 0; f; f = f->next, i++)
 if (new_arg[i] == NULL)
-  new_arg[i] = gfc_get_actual_arglist ();
+  {
+	new_arg[i] = gfc_get_actual_arglist ();
+	new_arg[i]->associated_dummy = f;
+  }
 
   if (na != 0)
 {
diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index 2b7b72f03e2..ef5da389434 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -4290,8 +4290,14 @@ sort_actual (const char *name, gfc_actual_arglist **ap,
   remove_nullargs (ap);
   actual = *ap;
 
+  auto_vec<gfc_intrinsic_arg *> dummy_args;
+  auto_vec<gfc_actual_arglist *> ordered_actual_args;
+
   for (f = formal; f; f = f->next)
-f->actual = NULL;
+dummy_args.safe_push (f);
+
+  ordered_actual_args.safe_grow_cleared (dummy_args.length (),
+	 /* exact = */true);
 
   f = formal;
   a = actual;
@@ -4343,7 +4349,7 @@ sort_actual (const char *name, gfc_actual_arglist **ap,
 	}
 }
 
-  for (;;)
+  for (int i = 0;; i++)
 {		/* Put the nonkeyword arguments in a 1:1 correspondence */
   if (f == NULL)
 	break;
@@ -4353,7 +4359,7 @@ sort_actual (const char *name, gfc_actual_arglist **ap,
   if (a->name != NULL)
 	goto keywords;
 
-  f->actual = a;
+  ordered_actual_args[i] = a;
 
   f = f->next;
   a = a->next;
@@ -4371,7 +4377,8 @@ keywords:
  to be keyword arguments.  */
   for (; a; a = a->next)
 {
-  for (f = formal; f; f = f->next)
+  int idx;
+  FOR_EACH_VEC_ELT (dummy_args, idx, f)
 	if (strcmp (a->name, f->name) == 0)
 	  break;
 
@@ -4386,21 +4393,21 @@ keywords:
 	  return false;
 	}
 
-  if (f->actual != NULL)
+  if (ordered_actual_args[idx] != NULL)
 	{
 	  gfc_error ("Argument %qs appears twice in call to %qs at %L",
 		 f->name, name, where);
 	  return false;
 	}
-
-  f->actual = a;
+  ordered_actual_args[idx] = a;
 }
 
 optional:
   /* At this point, all unmatched formal args must be optional.  */
-  for (f = formal; f; f = f->next)
+  int idx;
+  FOR_EACH_VEC_ELT (dummy_args, idx, f)
 {
-  if (f->actual == NULL && f->optional == 0)
+  if 

[PATCH 1/7] fortran: new abstract class gfc_dummy_arg

2021-08-03 Thread Mikael Morin via Gcc-patches

Introduce a new abstract class gfc_dummy_arg that provides a common
interface to both dummy arguments of user-defined procedures (which
have type gfc_formal_arglist) and dummy arguments of intrinsic procedures
(which have type gfc_intrinsic_arg).

gcc/fortran/
* gfortran.h (gfc_dummy_arg): New.
(gfc_formal_arglist, gfc_intrinsic_arg): Inherit gfc_dummy_arg.
(gfc_get_formal_arglist, gfc_get_intrinsic_arg): Call constructor.
* intrinsic.c (gfc_intrinsic_init_1): Merge the memory area of
conversion intrinsics with that of regular function and
subroutine intrinsics.
Use a separate memory area for arguments.
(add_sym, gfc_intrinsic_init_1): Don’t do pointer arithmetics
with next_arg.
(add_sym, make_alias, add_conv,
add_char_conversions, gfc_intrinsic_init_1): Call constructor
before filling object data.
* resolve.c (resolve_select_type): Same.
---
 gcc/fortran/gfortran.h  | 22 ++---
 gcc/fortran/intrinsic.c | 44 ++---
 gcc/fortran/resolve.c   | 10 ++
 3 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/gcc/fortran/gfortran.h b/gcc/fortran/gfortran.h
index 921aed93dc3..031e46d1457 100644
--- a/gcc/fortran/gfortran.h
+++ b/gcc/fortran/gfortran.h
@@ -1131,17 +1131,25 @@ gfc_component;
 
 #define gfc_get_component() XCNEW (gfc_component)
 
+
+/* dummy arg of either an intrinsic or a user-defined procedure.  */
+class gfc_dummy_arg
+{
+};
+
+
 /* Formal argument lists are lists of symbols.  */
-typedef struct gfc_formal_arglist
+struct gfc_formal_arglist : public gfc_dummy_arg
 {
   /* Symbol representing the argument at this position in the arglist.  */
   struct gfc_symbol *sym;
   /* Points to the next formal argument.  */
   struct gfc_formal_arglist *next;
-}
-gfc_formal_arglist;
+};
+
+#define GFC_NEW(T) new (XCNEW (T)) T
 
-#define gfc_get_formal_arglist() XCNEW (gfc_formal_arglist)
+#define gfc_get_formal_arglist() GFC_NEW (gfc_formal_arglist)
 
 
 /* The gfc_actual_arglist structure is for actual arguments and
@@ -2159,7 +2167,7 @@ gfc_ref;
 
 
 /* Structures representing intrinsic symbols and their arguments lists.  */
-typedef struct gfc_intrinsic_arg
+struct gfc_intrinsic_arg : public gfc_dummy_arg
 {
   char name[GFC_MAX_SYMBOL_LEN + 1];
 
@@ -2169,9 +2177,9 @@ typedef struct gfc_intrinsic_arg
   gfc_actual_arglist *actual;
 
   struct gfc_intrinsic_arg *next;
+};
 
-}
-gfc_intrinsic_arg;
+#define gfc_get_intrinsic_arg() GFC_NEW (gfc_intrinsic_arg)
 
 
 /* Specifies the various kinds of check functions used to verify the
diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index 219f04f2317..ba79eb3242b 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -376,6 +376,7 @@ add_sym (const char *name, gfc_isym_id id, enum klass cl, int actual_ok, bt type
   break;
 
 case SZ_NOTHING:
+  next_sym = new (next_sym) gfc_intrinsic_sym;
   next_sym->name = gfc_get_string ("%s", name);
 
   strcpy (buf, "_gfortran_");
@@ -406,6 +407,7 @@ add_sym (const char *name, gfc_isym_id id, enum klass cl, int actual_ok, bt type
   va_start (argp, resolve);
 
   first_flag = 1;
+  gfc_intrinsic_arg * previous_arg;
 
   for (;;)
 {
@@ -422,12 +424,12 @@ add_sym (const char *name, gfc_isym_id id, enum klass cl, int actual_ok, bt type
 	nargs++;
   else
 	{
-	  next_arg++;
+	  next_arg = new (next_arg) gfc_intrinsic_arg;
 
 	  if (first_flag)
 	next_sym->formal = next_arg;
 	  else
-	(next_arg - 1)->next = next_arg;
+	previous_arg->next = next_arg;
 
 	  first_flag = 0;
 
@@ -437,6 +439,9 @@ add_sym (const char *name, gfc_isym_id id, enum klass cl, int actual_ok, bt type
 	  next_arg->optional = optional;
 	  next_arg->value = 0;
 	  next_arg->intent = intent;
+
+	  previous_arg = next_arg;
+	  next_arg++;
 	}
 }
 
@@ -1270,6 +1275,7 @@ make_alias (const char *name, int standard)
   break;
 
 case SZ_NOTHING:
+  next_sym = new (next_sym) gfc_intrinsic_sym;
   next_sym[0] = next_sym[-1];
   next_sym->name = gfc_get_string ("%s", name);
   next_sym->standard = standard;
@@ -3991,7 +3997,7 @@ add_conv (bt from_type, int from_kind, bt to_type, int to_kind, int standard)
   to.type = to_type;
   to.kind = to_kind;
 
-  sym = conversion + nconv;
+  sym = new (conversion + nconv) gfc_intrinsic_sym;
 
   sym->name = conv_name (&from, &to);
   sym->lib_name = sym->name;
@@ -4167,15 +4173,17 @@ add_char_conversions (void)
 	to.type = BT_CHARACTER;
 	to.kind = gfc_character_kinds[j].kind;
 
-	char_conversions[n].name = conv_name (, );
-	char_conversions[n].lib_name = char_conversions[n].name;
-	char_conversions[n].simplify.cc = gfc_convert_char_constant;
-	char_conversions[n].standard = GFC_STD_F2003;
-	char_conversions[n].elemental = 1;
-	char_conversions[n].pure = 1;
-	char_conversions[n].conversion = 0;
-	char_conversions[n].ts = to;
-	char_conversions[n].id = 

[PATCH 2/7] fortran: Tiny sort_actual internal refactoring

2021-08-03 Thread Mikael Morin via Gcc-patches

Preliminary refactoring to make further changes more obvious.
No functional change.

gcc/fortran/
* intrinsic.c (sort_actual): initialise variable and use it earlier.
---
 gcc/fortran/intrinsic.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/gcc/fortran/intrinsic.c b/gcc/fortran/intrinsic.c
index ba79eb3242b..2b7b72f03e2 100644
--- a/gcc/fortran/intrinsic.c
+++ b/gcc/fortran/intrinsic.c
@@ -4415,19 +4415,18 @@ do_sort:
 
   for (f = formal; f; f = f->next)
 {
-  if (f->actual && f->actual->label != NULL && f->ts.type)
+  a = f->actual;
+  if (a && a->label != NULL && f->ts.type)
 	{
 	  gfc_error ("ALTERNATE RETURN not permitted at %L", where);
 	  return false;
 	}
 
-  if (f->actual == NULL)
+  if (a == NULL)
 	{
 	  a = gfc_get_actual_arglist ();
 	  a->missing_arg_type = f->ts.type;
 	}
-  else
-	a = f->actual;
 
   if (actual == NULL)
 	*ap = a;


[PATCH 0/7] fortran: Ignore unused arguments for scalarisation [PR97896]

2021-08-03 Thread Mikael Morin via Gcc-patches
Hello,

I have had these patches fixing PR97896 almost ready for a while.  Now is time 
to actually submit them, at last.

The problematic case is intrinsic procedures where an argument is actually not 
used in the code generated (KIND argument of INDEX in the testcase), which 
confuses the scalariser.

Thomas König comitted a change to workaround the problem, but it regressed in 
PR97896.  These patch put the workaround where I think it is more appropriate, 
namely at the beginning of the scalarisation procedure.  This is the patch 7 of 
the series, preceded with the revert in patch 6.  I intend to commit both of 
them squashed together.

The rest of the series (patches 1-5) is preliminary work to be able to identify 
the KIND argument of the INDEX intrinsic by its name, rather than using the 
right number of next->next->next indirections starting with the first argument. 
 It is probably overkill for just this use case, but I think it’s worth having 
that facility in the long term.
These patches use some c++ features, namely class inheritance and virtual 
functions; I know this is frowned upon by some (fortran) maintainers; let’s see 
what they will say.

I intend to submit a separate patch for the release branch with only patch 6 
and 7 and the next->next->next indirections.

Regression-tested on x86_64-linux-gnu.  Ok for master? 

Mikael Morin (7):
  fortran: new abstract class gfc_dummy_arg
  fortran: Tiny sort_actual internal refactoring
  fortran: Reverse actual vs dummy argument mapping
  fortran: simplify elemental arguments walking
  fortran: Delete redundant missing_arg_type field
  Revert "Remove KIND argument from INDEX so it does not mess up
scalarization."
  fortran: Ignore unused args in scalarization [PR97896]

 gcc/fortran/gfortran.h|  45 +---
 gcc/fortran/interface.c   |  14 +--
 gcc/fortran/intrinsic.c   | 152 +-
 gcc/fortran/intrinsic.h   |   3 +-
 gcc/fortran/iresolve.c|  21 +---
 gcc/fortran/resolve.c |  10 +-
 gcc/fortran/symbol.c  |  19 
 gcc/fortran/trans-array.c |  75 ++---
 gcc/fortran/trans-array.h |   5 +-
 gcc/fortran/trans-decl.c  |  24 +---
 gcc/fortran/trans-expr.c  |   7 +-
 gcc/fortran/trans-intrinsic.c |   3 +-
 gcc/fortran/trans-stmt.c  |  30 +++--
 gcc/fortran/trans.h   |   4 +-
 gcc/testsuite/gfortran.dg/index_5.f90 |  23 
 15 files changed, 252 insertions(+), 183 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/index_5.f90

-- 
2.30.2



[committed] libstdc++: Suppress redundant definitions of inline variables

2021-08-03 Thread Jonathan Wakely via Gcc-patches
In C++17 the out-of-class definitions for static constexpr variables are
redundant, because they are implicitly inline. This change avoids
"redundant redeclaration" warnings from -Wsystem-headers -Wdeprecated.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/bits/random.tcc (linear_congruential_engine): Do not
define static constexpr members when they are implicitly inline.
* include/std/ratio (ratio, __ratio_multiply, __ratio_divide)
(__ratio_add, __ratio_subtract): Likewise.
* include/std/type_traits (integral_constant): Likewise.
* testsuite/26_numerics/random/pr60037-neg.cc: Adjust dg-error
line number.

Tested powerpc64le-linux. Committed to trunk.

commit a77a46d9aeb0166b4b1ee4b52e1cbb4b52c6736f
Author: Jonathan Wakely 
Date:   Tue Aug 3 15:03:44 2021

libstdc++: Suppress redundant definitions of inline variables

In C++17 the out-of-class definitions for static constexpr variables are
redundant, because they are implicitly inline. This change avoids
"redundant redeclaration" warnings from -Wsystem-headers -Wdeprecated.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/bits/random.tcc (linear_congruential_engine): Do not
define static constexpr members when they are implicitly inline.
* include/std/ratio (ratio, __ratio_multiply, __ratio_divide)
(__ratio_add, __ratio_subtract): Likewise.
* include/std/type_traits (integral_constant): Likewise.
* testsuite/26_numerics/random/pr60037-neg.cc: Adjust dg-error
line number.

diff --git a/libstdc++-v3/include/bits/random.tcc 
b/libstdc++-v3/include/bits/random.tcc
index 6ba263072b0..0be50d90e8a 100644
--- a/libstdc++-v3/include/bits/random.tcc
+++ b/libstdc++-v3/include/bits/random.tcc
@@ -91,6 +91,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   } // namespace __detail
   /// @endcond
 
+#if ! __cpp_inline_variables
   template<typename _UIntType, _UIntType __a, _UIntType __c, _UIntType __m>
 constexpr _UIntType
 linear_congruential_engine<_UIntType, __a, __c, __m>::multiplier;
@@ -106,6 +107,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   template<typename _UIntType, _UIntType __a, _UIntType __c, _UIntType __m>
 constexpr _UIntType
 linear_congruential_engine<_UIntType, __a, __c, __m>::default_seed;
+#endif
 
   /**
* Seeds the LCR with integral value @p __s, adjusted so that the
diff --git a/libstdc++-v3/include/std/ratio b/libstdc++-v3/include/std/ratio
index ceee7d00c12..92f6d4b9ea1 100644
--- a/libstdc++-v3/include/std/ratio
+++ b/libstdc++-v3/include/std/ratio
@@ -279,11 +279,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   typedef ratio type;
 };
 
+#if ! __cpp_inline_variables
   template<intmax_t _Num, intmax_t _Den>
 constexpr intmax_t ratio<_Num, _Den>::num;
 
   template<intmax_t _Num, intmax_t _Den>
 constexpr intmax_t ratio<_Num, _Den>::den;
+#endif
 
   /// @cond undocumented
 
@@ -307,11 +309,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static constexpr intmax_t den = type::den;
 };
 
+#if ! __cpp_inline_variables
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_multiply<_R1, _R2>::num;
 
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_multiply<_R1, _R2>::den;
+#endif
 
   /// @endcond
 
@@ -334,11 +338,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static constexpr intmax_t den = type::den;
 };
 
+#if ! __cpp_inline_variables
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_divide<_R1, _R2>::num;
 
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_divide<_R1, _R2>::den;
+#endif
 
   /// @endcond
 
@@ -512,11 +518,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static constexpr intmax_t den = type::den;
 };
 
+#if ! __cpp_inline_variables
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_add<_R1, _R2>::num;
 
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_add<_R1, _R2>::den;
+#endif
 
   /// @endcond
 
@@ -537,11 +545,13 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   static constexpr intmax_t den = type::den;
 };
 
+#if ! __cpp_inline_variables
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_subtract<_R1, _R2>::num;
 
   template<typename _R1, typename _R2>
 constexpr intmax_t __ratio_subtract<_R1, _R2>::den;
+#endif
 
   /// @endcond
 
diff --git a/libstdc++-v3/include/std/type_traits 
b/libstdc++-v3/include/std/type_traits
index 0d821f9c074..46edde905f8 100644
--- a/libstdc++-v3/include/std/type_traits
+++ b/libstdc++-v3/include/std/type_traits
@@ -73,8 +73,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 #endif
 };
 
+#if ! __cpp_inline_variables
   template<typename _Tp, _Tp __v>
 constexpr _Tp integral_constant<_Tp, __v>::value;
+#endif
 
   /// The type used as a compile-time boolean with true value.
   using true_type =  integral_constant<bool, true>;
diff --git a/libstdc++-v3/testsuite/26_numerics/random/pr60037-neg.cc 
b/libstdc++-v3/testsuite/26_numerics/random/pr60037-neg.cc
index d6e6399bd79..8fba7144d8a 100644
--- a/libstdc++-v3/testsuite/26_numerics/random/pr60037-neg.cc
+++ b/libstdc++-v3/testsuite/26_numerics/random/pr60037-neg.cc
@@ -12,4 +12,4 @@ auto x = std::generate_canonical

[committed] libstdc++: Replace TR1 components with C++11 ones in test utils

2021-08-03 Thread Jonathan Wakely via Gcc-patches
Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* testsuite/util/testsuite_common_types.h: Replace uses of
tr1::unordered_map and tr1::unordered_set with their C++11
equivalents.
* testsuite/29_atomics/atomic/cons/assign_neg.cc: Adjust
dg-error line number.
* testsuite/29_atomics/atomic/cons/copy_neg.cc: Likewise.
* testsuite/29_atomics/atomic_integral/cons/assign_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/cons/copy_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/operators/bitwise_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/operators/decrement_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/operators/increment_neg.cc:
Likewise.

Tested powerpc64le-linux. Committed to trunk.

commit 5c6759e41607f9edbbe25be18bd322d6a0408238
Author: Jonathan Wakely 
Date:   Tue Aug 3 15:02:50 2021

libstdc++: Replace TR1 components with C++11 ones in test utils

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* testsuite/util/testsuite_common_types.h: Replace uses of
tr1::unordered_map and tr1::unordered_set with their C++11
equivalents.
* testsuite/29_atomics/atomic/cons/assign_neg.cc: Adjust
dg-error line number.
* testsuite/29_atomics/atomic/cons/copy_neg.cc: Likewise.
* testsuite/29_atomics/atomic_integral/cons/assign_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/cons/copy_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/operators/bitwise_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/operators/decrement_neg.cc:
Likewise.
* testsuite/29_atomics/atomic_integral/operators/increment_neg.cc:
Likewise.

diff --git a/libstdc++-v3/testsuite/29_atomics/atomic/cons/assign_neg.cc 
b/libstdc++-v3/testsuite/29_atomics/atomic/cons/assign_neg.cc
index 4907312b547..f0520af8e0d 100644
--- a/libstdc++-v3/testsuite/29_atomics/atomic/cons/assign_neg.cc
+++ b/libstdc++-v3/testsuite/29_atomics/atomic/cons/assign_neg.cc
@@ -27,5 +27,5 @@ int main()
   return 0;
 }
 
-// { dg-error "deleted" "" { target *-*-* } 659 }
+// { dg-error "deleted" "" { target *-*-* } 663 }
 // { dg-prune-output "include" }
diff --git a/libstdc++-v3/testsuite/29_atomics/atomic/cons/copy_neg.cc 
b/libstdc++-v3/testsuite/29_atomics/atomic/cons/copy_neg.cc
index 0b67f61dd14..76bb2d60350 100644
--- a/libstdc++-v3/testsuite/29_atomics/atomic/cons/copy_neg.cc
+++ b/libstdc++-v3/testsuite/29_atomics/atomic/cons/copy_neg.cc
@@ -27,5 +27,5 @@ int main()
   return 0;
 }
 
-// { dg-error "deleted" "" { target *-*-* } 698 }
+// { dg-error "deleted" "" { target *-*-* } 702 }
 // { dg-prune-output "include" }
diff --git 
a/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/assign_neg.cc 
b/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/assign_neg.cc
index c54a51e4cab..8def1559bd4 100644
--- a/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/assign_neg.cc
+++ b/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/assign_neg.cc
@@ -28,5 +28,5 @@ int main()
   return 0;
 }
 
-// { dg-error "deleted" "" { target *-*-* } 659 }
+// { dg-error "deleted" "" { target *-*-* } 663 }
 // { dg-prune-output "include" }
diff --git a/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/copy_neg.cc 
b/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/copy_neg.cc
index c7d01ee31d8..9ef033bd678 100644
--- a/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/copy_neg.cc
+++ b/libstdc++-v3/testsuite/29_atomics/atomic_integral/cons/copy_neg.cc
@@ -28,5 +28,5 @@ int main()
   return 0;
 }
 
-// { dg-error "deleted" "" { target *-*-* } 698 }
+// { dg-error "deleted" "" { target *-*-* } 702 }
 // { dg-prune-output "include" }
diff --git 
a/libstdc++-v3/testsuite/29_atomics/atomic_integral/operators/bitwise_neg.cc 
b/libstdc++-v3/testsuite/29_atomics/atomic_integral/operators/bitwise_neg.cc
index f6749fdd97b..69c307bebae 100644
--- a/libstdc++-v3/testsuite/29_atomics/atomic_integral/operators/bitwise_neg.cc
+++ b/libstdc++-v3/testsuite/29_atomics/atomic_integral/operators/bitwise_neg.cc
@@ -26,8 +26,8 @@ int main()
   return 0;
 }
 
-// { dg-error "operator" "" { target *-*-* } 495 }
-// { dg-error "operator" "" { target *-*-* } 496 }
-// { dg-error "operator" "" { target *-*-* } 497 }
+// { dg-error "operator" "" { target *-*-* } 499 }
+// { dg-error "operator" "" { target *-*-* } 500 }
+// { dg-error "operator" "" { target *-*-* } 501 }
 
 // { dg-prune-output "declared here" }
diff --git 
a/libstdc++-v3/testsuite/29_atomics/atomic_integral/operators/decrement_neg.cc 
b/libstdc++-v3/testsuite/29_atomics/atomic_integral/operators/decrement_neg.cc
index 765e3af270e..31ce7d0d149 100644
--- 

[committed] libstdc++: Specialize allocator_traits>

2021-08-03 Thread Jonathan Wakely via Gcc-patches
This adds a partial specialization of allocator_traits, similar to what
was already done for std::allocator. This means that most uses of
polymorphic_allocator via the traits can avoid the metaprogramming
overhead needed to deduce the properties from polymorphic_allocator.

In addition, I'm changing polymorphic_allocator::delete_object to invoke
the destructor (or pseudo-destructor) directly, rather than calling
allocator_traits::destroy, which calls polymorphic_allocator::destroy
(which is deprecated). This is observable if a user has specialized
allocator_traits> and expects to see its
destroy member function called. I consider explicit specializations of
allocator_traits to be wrong-headed, and this use case seems unnecessary
to support. So delete_object just invokes the destructor directly.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/std/memory_resource (polymorphic_allocator::delete_object):
Call destructor directly instead of using destroy.
(allocator_traits>): Define partial
specialization.

Tested powerpc64le-linux. Committed to trunk.

commit 13a1ac9f6f700f4e214fcc83b122a4a405c6b13d
Author: Jonathan Wakely 
Date:   Tue Aug 3 14:00:47 2021

libstdc++: Specialize allocator_traits>

This adds a partial specialization of allocator_traits, similar to what
was already done for std::allocator. This means that most uses of
polymorphic_allocator via the traits can avoid the metaprogramming
overhead needed to deduce the properties from polymorphic_allocator.

In addition, I'm changing polymorphic_allocator::delete_object to invoke
the destructor (or pseudo-destructor) directly, rather than calling
allocator_traits::destroy, which calls polymorphic_allocator::destroy
(which is deprecated). This is observable if a user has specialized
allocator_traits> and expects to see its
destroy member function called. I consider explicit specializations of
allocator_traits to be wrong-headed, and this use case seems unnecessary
to support. So delete_object just invokes the destructor directly.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/std/memory_resource 
(polymorphic_allocator::delete_object):
Call destructor directly instead of using destroy.
(allocator_traits>): Define partial
specialization.

diff --git a/libstdc++-v3/include/std/memory_resource 
b/libstdc++-v3/include/std/memory_resource
index cdc5e5d98b1..6bca0afa018 100644
--- a/libstdc++-v3/include/std/memory_resource
+++ b/libstdc++-v3/include/std/memory_resource
@@ -232,7 +232,7 @@ namespace pmr
void
delete_object(_Up* __p)
{
- destroy(__p);
+ __p->~_Up();
  deallocate_object(__p);
}
 #endif // C++2a
@@ -381,6 +381,136 @@ namespace pmr
 { return !(__a == __b); }
 #endif
 
+} // namespace pmr
+
+  /// Partial specialization for std::pmr::polymorphic_allocator
+  template<typename _Tp>
+    struct allocator_traits<pmr::polymorphic_allocator<_Tp>>
+{
+  /// The allocator type
+  using allocator_type = pmr::polymorphic_allocator<_Tp>;
+
+  /// The allocated type
+  using value_type = _Tp;
+
+  /// The allocator's pointer type.
+  using pointer = _Tp*;
+
+  /// The allocator's const pointer type.
+  using const_pointer = const _Tp*;
+
+  /// The allocator's void pointer type.
+  using void_pointer = void*;
+
+  /// The allocator's const void pointer type.
+  using const_void_pointer = const void*;
+
+  /// The allocator's difference type
+  using difference_type = std::ptrdiff_t;
+
+  /// The allocator's size type
+  using size_type = std::size_t;
+
+  /** @{
+   * A `polymorphic_allocator` does not propagate when a
+   * container is copied, moved, or swapped.
+   */
+  using propagate_on_container_copy_assignment = false_type;
+  using propagate_on_container_move_assignment = false_type;
+  using propagate_on_container_swap = false_type;
+
+  static allocator_type
+  select_on_container_copy_construction(const allocator_type&) noexcept
+  { return allocator_type(); }
+  /// @}
+
+  /// Whether all instances of the allocator type compare equal.
+  using is_always_equal = false_type;
+
+  template<typename _Up>
+	using rebind_alloc = pmr::polymorphic_allocator<_Up>;
+
+  template<typename _Up>
+	using rebind_traits = allocator_traits<pmr::polymorphic_allocator<_Up>>;
+
+  /**
+   *  @brief  Allocate memory.
+   *  @param  __a  An allocator.
+   *  @param  __n  The number of objects to allocate space for.
+   *
+   *  Calls `a.allocate(n)`.
+  */
+  [[nodiscard]] static pointer
+  allocate(allocator_type& __a, size_type __n)
+  { return __a.allocate(__n); }
+
+  /**
+   *  @brief  Allocate memory.
+   *  @param  __a  An allocator.
+   *  @param  __n  The number of objects to allocate space for.
+   *  

[committed] libstdc++: Remove trailing whitespace in some tests

2021-08-03 Thread Jonathan Wakely via Gcc-patches
Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* testsuite/20_util/function_objects/binders/3113.cc: Remove
trailing whitespace.
* testsuite/20_util/shared_ptr/assign/auto_ptr.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc:
Likewise.
* testsuite/20_util/shared_ptr/creation/dr925.cc: Likewise.
* testsuite/25_algorithms/headers/algorithm/synopsis.cc:
Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/2.cc:
Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/pod.cc:
Likewise.

Tested powerpc64le-linux. Committed to trunk.

commit 9bd87e388724baab9597ef232ea7e855c99eb7d7
Author: Jonathan Wakely 
Date:   Tue Aug 3 00:05:01 2021

libstdc++: Remove trailing whitespace in some tests

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* testsuite/20_util/function_objects/binders/3113.cc: Remove
trailing whitespace.
* testsuite/20_util/shared_ptr/assign/auto_ptr.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc:
Likewise.
* testsuite/20_util/shared_ptr/creation/dr925.cc: Likewise.
* testsuite/25_algorithms/headers/algorithm/synopsis.cc:
Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/2.cc:
Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/pod.cc:
Likewise.

diff --git a/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc 
b/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc
index 0b671ae4a0e..03118b291b1 100644
--- a/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc
+++ b/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc
@@ -25,22 +25,22 @@
 #include  // for_each
 #include 
 
-class Elem 
-{ 
-public: 
-  void print(int) const { } 
-  void modify(int) { } 
-}; 
+class Elem
+{
+public:
+  void print(int) const { }
+  void modify(int) { }
+};
 
 // libstdc++/3113
 void test01()
-{ 
-  std::vector coll(2); 
-  // OK 
-  std::for_each(coll.begin(), coll.end(), 
+{
+  std::vector coll(2);
+  // OK
+  std::for_each(coll.begin(), coll.end(),
   std::bind2nd(std::mem_fun_ref(::print), 42));
   // OK
-  std::for_each(coll.begin(), coll.end(), 
+  std::for_each(coll.begin(), coll.end(),
   std::bind2nd(std::mem_fun_ref(::modify), 42));
 }
 
diff --git a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc 
b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc
index 82750dca9a9..39574984c7d 100644
--- a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc
+++ b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc
@@ -78,7 +78,7 @@ test01()
   return 0;
 }
 
-int 
+int
 main()
 {
   test01();
diff --git a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc 
b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc
index 32af6001366..9d0393be22a 100644
--- a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc
+++ b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc
@@ -40,7 +40,7 @@ test01()
   return 0;
 }
 
-int 
+int
 main()
 {
   test01();
diff --git 
a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc 
b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc
index 2f6e4db18d9..503fb348aa2 100644
--- a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc
+++ b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc
@@ -44,7 +44,7 @@ test02()
   a = std::move(au);
 }
 
-int 
+int
 main()
 {
   test01();
diff --git a/libstdc++-v3/testsuite/20_util/shared_ptr/creation/dr925.cc 
b/libstdc++-v3/testsuite/20_util/shared_ptr/creation/dr925.cc
index 87e89375d28..25ff611f8d6 100644
--- a/libstdc++-v3/testsuite/20_util/shared_ptr/creation/dr925.cc
+++ b/libstdc++-v3/testsuite/20_util/shared_ptr/creation/dr925.cc
@@ -28,19 +28,19 @@ struct A
 {
 };
 
-std::unique_ptr 
+std::unique_ptr
 create_unique_ptr()
 {
   return std::unique_ptr(new A());
 }
 
-std::auto_ptr 
+std::auto_ptr
 create_auto_ptr()
 {
   return std::auto_ptr(new A());
 }
 
-void 
+void
 process(std::shared_ptr a)
 {
   VERIFY( a.get() != 0 );
diff --git a/libstdc++-v3/testsuite/25_algorithms/headers/algorithm/synopsis.cc 
b/libstdc++-v3/testsuite/25_algorithms/headers/algorithm/synopsis.cc
index dbf58fb64a6..68f5d425b4d 100644
--- a/libstdc++-v3/testsuite/25_algorithms/headers/algorithm/synopsis.cc
+++ b/libstdc++-v3/testsuite/25_algorithms/headers/algorithm/synopsis.cc
@@ -24,12 +24,12 @@ namespace std
   // 25.1, non-modifying sequence 

[committed] libstdc++: Deprecate std::random_shuffle for C++14

2021-08-03 Thread Jonathan Wakely via Gcc-patches
The std::random_shuffle algorithm was removed in C++14 (without
deprecation). This adds the deprecated attribute for C++14 and later, so
that users are warned they should not be using it in those dialects.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* doc/xml/manual/evolution.xml: Document deprecation.
* doc/html/*: Regenerate.
* include/bits/c++config (_GLIBCXX14_DEPRECATED): Define.
(_GLIBCXX14_DEPRECATED_SUGGEST): Define.
* include/bits/stl_algo.h (random_shuffle): Deprecate for C++14
and later.
* testsuite/25_algorithms/headers/algorithm/synopsis.cc: Adjust
for C++11 and C++14 changes to std::random_shuffle and
std::shuffle.
* testsuite/25_algorithms/random_shuffle/1.cc: Add options to
use deprecated algorithms.
* testsuite/25_algorithms/random_shuffle/59603.cc: Likewise.
* testsuite/25_algorithms/random_shuffle/moveable.cc: Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/2.cc:
Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/pod.cc:
Likewise.

Tested powerpc64le-linux. Committed to trunk.

commit 7f2f4b87910506effb8dc60eeb2451573126
Author: Jonathan Wakely 
Date:   Mon Aug 2 18:35:42 2021

libstdc++: Deprecate std::random_shuffle for C++14

The std::random_shuffle algorithm was removed in C++14 (without
deprecation). This adds the deprecated attribute for C++14 and later, so
that users are warned they should not be using it in those dialects.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* doc/xml/manual/evolution.xml: Document deprecation.
* doc/html/*: Regenerate.
* include/bits/c++config (_GLIBCXX14_DEPRECATED): Define.
(_GLIBCXX14_DEPRECATED_SUGGEST): Define.
* include/bits/stl_algo.h (random_shuffle): Deprecate for C++14
and later.
* testsuite/25_algorithms/headers/algorithm/synopsis.cc: Adjust
for C++11 and C++14 changes to std::random_shuffle and
std::shuffle.
* testsuite/25_algorithms/random_shuffle/1.cc: Add options to
use deprecated algorithms.
* testsuite/25_algorithms/random_shuffle/59603.cc: Likewise.
* testsuite/25_algorithms/random_shuffle/moveable.cc: Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/2.cc:
Likewise.
* 
testsuite/25_algorithms/random_shuffle/requirements/explicit_instantiation/pod.cc:
Likewise.

diff --git a/libstdc++-v3/doc/xml/manual/evolution.xml 
b/libstdc++-v3/doc/xml/manual/evolution.xml
index 55b8903baff..59b71b04442 100644
--- a/libstdc++-v3/doc/xml/manual/evolution.xml
+++ b/libstdc++-v3/doc/xml/manual/evolution.xml
@@ -1019,4 +1019,14 @@ now defaults to zero.
 
 
 
+12
+
+
+The std::random_shuffle algorithms are deprecated
+for C++14 and later. The C++11 std::shuffle algorithm
+can be used instead.
+
+
+
+
 
diff --git a/libstdc++-v3/include/bits/c++config 
b/libstdc++-v3/include/bits/c++config
index 69ace386dd7..32b8957f814 100644
--- a/libstdc++-v3/include/bits/c++config
+++ b/libstdc++-v3/include/bits/c++config
@@ -80,6 +80,8 @@
 //   _GLIBCXX_DEPRECATED_SUGGEST( string-literal )
 //   _GLIBCXX11_DEPRECATED
 //   _GLIBCXX11_DEPRECATED_SUGGEST( string-literal )
+//   _GLIBCXX14_DEPRECATED
+//   _GLIBCXX14_DEPRECATED_SUGGEST( string-literal )
 //   _GLIBCXX17_DEPRECATED
 //   _GLIBCXX17_DEPRECATED_SUGGEST( string-literal )
 //   _GLIBCXX20_DEPRECATED( string-literal )
@@ -105,6 +107,14 @@
 # define _GLIBCXX11_DEPRECATED_SUGGEST(ALT)
 #endif
 
+#if defined(__DEPRECATED) && (__cplusplus >= 201403L)
+# define _GLIBCXX14_DEPRECATED _GLIBCXX_DEPRECATED
+# define _GLIBCXX14_DEPRECATED_SUGGEST(ALT) _GLIBCXX_DEPRECATED_SUGGEST(ALT)
+#else
+# define _GLIBCXX14_DEPRECATED
+# define _GLIBCXX14_DEPRECATED_SUGGEST(ALT)
+#endif
+
 #if defined(__DEPRECATED) && (__cplusplus >= 201703L)
 # define _GLIBCXX17_DEPRECATED [[__deprecated__]]
 # define _GLIBCXX17_DEPRECATED_SUGGEST(ALT) _GLIBCXX_DEPRECATED_SUGGEST(ALT)
diff --git a/libstdc++-v3/include/bits/stl_algo.h 
b/libstdc++-v3/include/bits/stl_algo.h
index a18bb000d0c..54ad383711f 100644
--- a/libstdc++-v3/include/bits/stl_algo.h
+++ b/libstdc++-v3/include/bits/stl_algo.h
@@ -56,7 +56,6 @@
 #ifndef _STL_ALGO_H
 #define _STL_ALGO_H 1
 
-#include   // for rand
 #include 
 #include 
 #include   // for _Temporary_buffer
@@ -66,6 +65,10 @@
 #include 
 #endif
 
+#if _GLIBCXX_HOSTED && (__cplusplus <= 201103L || _GLIBCXX_USE_DEPRECATED)
+#include   // for rand
+#endif
+
 // See concept_check.h for the __glibcxx_*_requires macros.
 
 namespace std _GLIBCXX_VISIBILITY(default)
@@ -4551,6 +4554,7 @@ _GLIBCXX_BEGIN_NAMESPACE_ALGO
std::__iterator_category(__result));
   

[committed] libstdc++: Add testsuite proc for testing deprecated features

2021-08-03 Thread Jonathan Wakely via Gcc-patches
This change adds options to tests that explicitly use deprecated
features, so that -D_GLIBCXX_USE_DEPRECATED=0 can be used to run the
rest of the testsuite. The tests that explicitly/intentionally use
deprecated features will still be able to use them, but they can be
disabled for the majority of tests.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* testsuite/23_containers/forward_list/operations/3.cc:
Use lambda instead of std::bind2nd.
* testsuite/20_util/function_objects/binders/3113.cc: Add
options for testing deprecated features.
* testsuite/20_util/pair/cons/99957.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc:
Likewise.
* testsuite/20_util/shared_ptr/cons/43820_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/cons/auto_ptr.cc: Likewise.
* testsuite/20_util/shared_ptr/cons/auto_ptr_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/creation/dr925.cc: Likewise.
* testsuite/20_util/unique_ptr/cons/auto_ptr.cc: Likewise.
* testsuite/20_util/unique_ptr/cons/auto_ptr_neg.cc: Likewise.
* testsuite/ext/pb_ds/example/priority_queue_erase_if.cc:
Likewise.
* testsuite/ext/pb_ds/example/priority_queue_split_join.cc:
Likewise.
* testsuite/lib/dg-options.exp (dg_add_options_using-deprecated):
New proc.

Tested powerpc64le-linux. Committed to trunk.

commit 07b70dfc4eab7869e7a43b3ff5b8b512dba0bb6e
Author: Jonathan Wakely 
Date:   Mon Aug 2 23:55:18 2021

libstdc++: Add testsuite proc for testing deprecated features

This change adds options to tests that explicitly use deprecated
features, so that -D_GLIBCXX_USE_DEPRECATED=0 can be used to run the
rest of the testsuite. The tests that explicitly/intentionally use
deprecated features will still be able to use them, but they can be
disabled for the majority of tests.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* testsuite/23_containers/forward_list/operations/3.cc:
Use lambda instead of std::bind2nd.
* testsuite/20_util/function_objects/binders/3113.cc: Add
options for testing deprecated features.
* testsuite/20_util/pair/cons/99957.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/assign/auto_ptr_rvalue.cc:
Likewise.
* testsuite/20_util/shared_ptr/cons/43820_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/cons/auto_ptr.cc: Likewise.
* testsuite/20_util/shared_ptr/cons/auto_ptr_neg.cc: Likewise.
* testsuite/20_util/shared_ptr/creation/dr925.cc: Likewise.
* testsuite/20_util/unique_ptr/cons/auto_ptr.cc: Likewise.
* testsuite/20_util/unique_ptr/cons/auto_ptr_neg.cc: Likewise.
* testsuite/ext/pb_ds/example/priority_queue_erase_if.cc:
Likewise.
* testsuite/ext/pb_ds/example/priority_queue_split_join.cc:
Likewise.
* testsuite/lib/dg-options.exp (dg_add_options_using-deprecated):
New proc.

diff --git a/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc 
b/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc
index c4dd784dd6c..0b671ae4a0e 100644
--- a/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc
+++ b/libstdc++-v3/testsuite/20_util/function_objects/binders/3113.cc
@@ -17,6 +17,8 @@
 // with this library; see the file COPYING3.  If not see
 // .
 
+// { dg-add-options using-deprecated }
+
 // 20.3.6 Binders
 
 #include 
diff --git a/libstdc++-v3/testsuite/20_util/pair/cons/99957.cc 
b/libstdc++-v3/testsuite/20_util/pair/cons/99957.cc
index 150bcd57c9a..82ec54ca1d8 100644
--- a/libstdc++-v3/testsuite/20_util/pair/cons/99957.cc
+++ b/libstdc++-v3/testsuite/20_util/pair/cons/99957.cc
@@ -1,4 +1,5 @@
 // { dg-options "-Wdeprecated" }
+// { dg-add-options using-deprecated }
 // { dg-do compile { target { c++11 && { ! c++20 } } } }
 
 #include 
diff --git a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc 
b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc
index 9615897cbff..82750dca9a9 100644
--- a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc
+++ b/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr.cc
@@ -1,4 +1,5 @@
 // { dg-options "-Wno-deprecated" }
+// { dg-add-options using-deprecated }
 // { dg-do run { target c++11 } }
 
 // Copyright (C) 2005-2021 Free Software Foundation, Inc.
diff --git a/libstdc++-v3/testsuite/20_util/shared_ptr/assign/auto_ptr_neg.cc 

[committed] libstdc++: Reduce header dependencies in <regex>

2021-08-03 Thread Jonathan Wakely via Gcc-patches
This reduces the size of <regex> a little. This is one of the largest
and slowest headers in the library.

By using  and  instead of
 we don't need to parse all the parallel algorithms and
std::ranges:: algorithms that are not needed by . Similarly, by
using  and  instead of  we don't
need to parse the definition of std::multimap.

The _State_info type is not movable or copyable, so doesn't need to use
std::unique_ptr to manage a bitset, we can just delete it in the
destructor. It would use a lot less space if we used a bitset instead,
but that would be an ABI break. We could do it for the versioned
namespace, but this patch doesn't do so. For future reference, using
vector would work, but would increase sizeof(_State_info) by two
pointers, because it's three times as large as unique_ptr. We
can't use std::bitset because the length isn't constant. We want a
bitset with a non-constant but fixed length.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/bits/regex_executor.h (_State_info): Replace
unique_ptr with array of bool.
* include/bits/regex_executor.tcc: Likewise.
* include/bits/regex_scanner.tcc: Replace std::strchr with
__builtin_strchr.
* include/std/regex: Replace standard headers with smaller
internal ones.
* testsuite/28_regex/traits/char/lookup_classname.cc: Include
 for strlen.
* testsuite/28_regex/traits/char/lookup_collatename.cc:
Likewise.

Tested powerpc64le-linux. Committed to trunk.

commit e9f64fff64d83f5fcaa9ff17f1688490f75bdcb7
Author: Jonathan Wakely 
Date:   Mon Aug 2 18:34:19 2021

libstdc++: Reduce header dependencies in <regex>

This reduces the size of <regex> a little. This is one of the largest
and slowest headers in the library.

By using  and  instead of
 we don't need to parse all the parallel algorithms and
std::ranges:: algorithms that are not needed by . Similarly, by
using  and  instead of  we don't
need to parse the definition of std::multimap.

The _State_info type is not movable or copyable, so doesn't need to use
std::unique_ptr to manage a bitset, we can just delete it in the
destructor. It would use a lot less space if we used a bitset instead,
but that would be an ABI break. We could do it for the versioned
namespace, but this patch doesn't do so. For future reference, using
vector would work, but would increase sizeof(_State_info) by two
pointers, because it's three times as large as unique_ptr. We
can't use std::bitset because the length isn't constant. We want a
bitset with a non-constant but fixed length.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/bits/regex_executor.h (_State_info): Replace
unique_ptr with array of bool.
* include/bits/regex_executor.tcc: Likewise.
* include/bits/regex_scanner.tcc: Replace std::strchr with
__builtin_strchr.
* include/std/regex: Replace standard headers with smaller
internal ones.
* testsuite/28_regex/traits/char/lookup_classname.cc: Include
 for strlen.
* testsuite/28_regex/traits/char/lookup_collatename.cc:
Likewise.

diff --git a/libstdc++-v3/include/bits/regex_executor.h 
b/libstdc++-v3/include/bits/regex_executor.h
index 4a6416c..014b4e83064 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -195,6 +195,11 @@ namespace __detail
  : _M_visited_states(new bool[__n]()), _M_start(__start)
  { }
 
+ ~_State_info() { delete[] _M_visited_states; }
+
+ _State_info(const _State_info&) = delete;
+ _State_info& operator=(const _State_info&) = delete;
+
  bool _M_visited(_StateIdT __i)
  {
if (_M_visited_states[__i])
@@ -212,7 +217,7 @@ namespace __detail
  // Saves states that need to be considered for the next character.
  vector>  _M_match_queue;
  // Indicates which states are already visited.
- unique_ptr_M_visited_states;
+ bool* _M_visited_states;
  // To record current solution.
  _StateIdT _M_start;
};
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc 
b/libstdc++-v3/include/bits/regex_executor.tcc
index 405d1c4d0d1..3cefeda48a3 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -122,7 +122,7 @@ namespace __detail
  _M_has_sol = false;
  if (_M_states._M_match_queue.empty())
break;
- std::fill_n(_M_states._M_visited_states.get(), _M_nfa.size(), false);
+ std::fill_n(_M_states._M_visited_states, _M_nfa.size(), false);
  auto __old_queue = std::move(_M_states._M_match_queue);
  for (auto& __task : __old_queue)
{
diff --git 

Re: [ARM] PR98435: Missed optimization in expanding vector constructor

2021-08-03 Thread Christophe Lyon via Gcc-patches
On Tue, Aug 3, 2021 at 12:57 PM Prathamesh Kulkarni <
prathamesh.kulka...@linaro.org> wrote:

> On Tue, 3 Aug 2021 at 14:59, Christophe Lyon
>  wrote:
> >
> >
> >
> > On Tue, Jul 6, 2021 at 11:26 AM Prathamesh Kulkarni via Gcc-patches <
> gcc-patches@gcc.gnu.org> wrote:
> >>
> >> On Tue, 6 Jul 2021 at 13:33, Kyrylo Tkachov 
> wrote:
> >> >
> >> >
> >> >
> >> > > -Original Message-
> >> > > From: Prathamesh Kulkarni 
> >> > > Sent: 06 July 2021 08:06
> >> > > To: Christophe LYON 
> >> > > Cc: Kyrylo Tkachov ; gcc Patches  >> > > patc...@gcc.gnu.org>
> >> > > Subject: Re: [ARM] PR98435: Missed optimization in expanding vector
> >> > > constructor
> >> > >
> >> > > On Thu, 1 Jul 2021 at 16:26, Prathamesh Kulkarni
> >> > >  wrote:
> >> > > >
> >> > > > On Wed, 30 Jun 2021 at 20:51, Christophe LYON
> >> > > >  wrote:
> >> > > > >
> >> > > > >
> >> > > > > On 29/06/2021 12:46, Prathamesh Kulkarni wrote:
> >> > > > > > On Mon, 28 Jun 2021 at 14:48, Christophe LYON
> >> > > > > >  wrote:
> >> > > > > >>
> >> > > > > >> On 28/06/2021 10:40, Kyrylo Tkachov via Gcc-patches wrote:
> >> > > > >  -Original Message-
> >> > > > >  From: Prathamesh Kulkarni 
> >> > > > >  Sent: 28 June 2021 09:38
> >> > > > >  To: Kyrylo Tkachov 
> >> > > > >  Cc: Christophe Lyon ; gcc
> Patches
> >> > >  >> > > > >  patc...@gcc.gnu.org>
> >> > > > >  Subject: Re: [ARM] PR98435: Missed optimization in
> expanding
> >> > > vector
> >> > > > >  constructor
> >> > > > > 
> >> > > > >  On Thu, 24 Jun 2021 at 22:01, Kyrylo Tkachov
> >> > > 
> >> > > > >  wrote:
> >> > > > > >
> >> > > > > >> -Original Message-
> >> > > > > >> From: Prathamesh Kulkarni <
> prathamesh.kulka...@linaro.org>
> >> > > > > >> Sent: 14 June 2021 09:02
> >> > > > > >> To: Christophe Lyon 
> >> > > > > >> Cc: gcc Patches ; Kyrylo
> Tkachov
> >> > > > > >> 
> >> > > > > >> Subject: Re: [ARM] PR98435: Missed optimization in
> expanding
> >> > > vector
> >> > > > > >> constructor
> >> > > > > >>
> >> > > > > >> On Wed, 9 Jun 2021 at 15:58, Prathamesh Kulkarni
> >> > > > > >>  wrote:
> >> > > > > >>> On Fri, 4 Jun 2021 at 13:15, Christophe Lyon
> >> > > > >  
> >> > > > > >> wrote:
> >> > > > >  On Fri, 4 Jun 2021 at 09:27, Prathamesh Kulkarni via
> Gcc-
> >> > > patches
> >> > > > >   wrote:
> >> > > > > > Hi,
> >> > > > > > As mentioned in PR, for the following test-case:
> >> > > > > >
> >> > > > > > #include 
> >> > > > > >
> >> > > > > > bfloat16x4_t f1 (bfloat16_t a)
> >> > > > > > {
> >> > > > > > return vdup_n_bf16 (a);
> >> > > > > > }
> >> > > > > >
> >> > > > > > bfloat16x4_t f2 (bfloat16_t a)
> >> > > > > > {
> >> > > > > > return (bfloat16x4_t) {a, a, a, a};
> >> > > > > > }
> >> > > > > >
> >> > > > > > Compiling with arm-linux-gnueabi -O3 -mfpu=neon
> -mfloat-
> >> > > > >  abi=softfp
> >> > > > > > -march=armv8.2-a+bf16+fp16 results in f2 not being
> >> > > vectorized:
> >> > > > > >
> >> > > > > > f1:
> >> > > > > >   vdup.16 d16, r0
> >> > > > > >   vmovr0, r1, d16  @ v4bf
> >> > > > > >   bx  lr
> >> > > > > >
> >> > > > > > f2:
> >> > > > > >   mov r3, r0  @ __bf16
> >> > > > > >   adr r1, .L4
> >> > > > > >   ldrdr0, [r1]
> >> > > > > >   mov r2, r3  @ __bf16
> >> > > > > >   mov ip, r3  @ __bf16
> >> > > > > >   bfi r1, r2, #0, #16
> >> > > > > >   bfi r0, ip, #0, #16
> >> > > > > >   bfi r1, r3, #16, #16
> >> > > > > >   bfi r0, r2, #16, #16
> >> > > > > >   bx  lr
> >> > > > > >
> >> > > > > > This seems to happen because vec_init pattern in
> neon.md
> >> > > has VDQ
> >> > > > > >> mode
> >> > > > > > iterator, which doesn't include V4BF. In attached
> patch, I
> >> > > changed
> >> > > > > > mode
> >> > > > > > to VDQX which seems to work for the test-case, and the
> >> > > compiler
> >> > > > >  now
> >> > > > > >> generates:
> >> > > > > > f2:
> >> > > > > >   vdup.16 d16, r0
> >> > > > > >   vmovr0, r1, d16  @ v4bf
> >> > > > > >   bx  lr
> >> > > > > >
> >> > > > > > However, the pattern is also gated on TARGET_HAVE_MVE
> >> > > and I am
> >> > > > > >> not
> >> > > > > > sure if either VDQ or VDQX are correct modes for MVE
> since
> >> > > MVE
> >> > > > >  has
> >> > > > > > only 128-bit vectors ?
> >> > > > > >
> >> > > > >  I think patterns common to both Neon and MVE should be
> >> > > moved to
> >> > 

Re: [PATCH 0/3] arm: fix problems when targetting extended FPUs [PR101723]

2021-08-03 Thread Christophe Lyon via Gcc-patches
On Mon, Aug 2, 2021 at 4:57 PM Richard Earnshaw  wrote:

> This patch series addresses an issue that has come to light due to a
> change in the way GAS handles .fpu directives in the assembler.  A fix
> to the assembler made in binutils 2.34 to clear out all features
> realated to the FPU when .fpu is emitted has started causing problems
> for GCC because of the order in which we emit .fpu and .arch_extension
> directives.  To fully address this we need to re-organize the way in
> which the compiler does this.
>
> I'll hold off pushing the patches for a couple of days.  Although I've
> gone through the testsuite quite carefully and run this through
> several configurations, it's possible that this may have some impact
> on the testsuite that I've missed.  Christophe, is there any chance you
> can run this through your test environment before I commit this?
>
>
Sorry for the delay, still unpacking emails after holidays.

Yes I can run the validation for these patches. I think you mean with all 3
patches combined, not 3 validations (patch 1, patches 1+2, patches 1-3) ?

Thanks,

Christophe


> R.
>
> Richard Earnshaw (3):
>   arm: ensure the arch_name is always set for the build target
>   arm: Don't reconfigure globals in arm_configure_build_target
>   arm: reorder assembler architecture directives [PR101723]
>
>  gcc/config/arm/arm-c.c|   1 +
>  gcc/config/arm/arm-cpus.in|   1 +
>  gcc/config/arm/arm.c  | 190 --
>  gcc/testsuite/gcc.target/arm/attr-neon.c  |   9 +-
>  gcc/testsuite/gcc.target/arm/attr-neon2.c |  35 +++-
>  gcc/testsuite/gcc.target/arm/attr-neon3.c |  43 +++-
>  .../arm/cortex-m55-nofp-flag-hard.c   |   2 +-
>  .../arm/cortex-m55-nofp-flag-softfp.c |   2 +-
>  .../arm/cortex-m55-nofp-nomve-flag-softfp.c   |   2 +-
>  .../gcc.target/arm/mve/intrinsics/mve_fpu1.c  |   5 +-
>  .../gcc.target/arm/mve/intrinsics/mve_fpu2.c  |   5 +-
>  gcc/testsuite/gcc.target/arm/pr98636.c|   3 +-
>  12 files changed, 153 insertions(+), 145 deletions(-)
>
> --
> 2.25.1
>
>


[PATCH]middle-end Fix trapping access in test PR101750

2021-08-03 Thread Tamar Christina via Gcc-patches
Hi All,

I believe PR101750 to be a testism. The reduced case accesses h[0] but h is
uninitialized and so the changes added in r12-2523 makes the compiler realize
this and replaces the code with a trap.

This fixes the case by just making the variable static.

regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

PR tree-optimization/101750
* g++.dg/vect/pr99149.cc: Fix access of h.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/g++.dg/vect/pr99149.cc 
b/gcc/testsuite/g++.dg/vect/pr99149.cc
index 
00ebe9d9cdf600ada8e66b4b854f0e18ad0b6a7d..4b885a5d432130d5eff3e96c833ec6c97de3e95d
 100755
--- a/gcc/testsuite/g++.dg/vect/pr99149.cc
+++ b/gcc/testsuite/g++.dg/vect/pr99149.cc
@@ -11,8 +11,8 @@ public:
   a operator*(a d) { return a(b * b - c * c, b * c + c * d.b); }
 };
 int f, g;
-class {
-  a *h;
+class mp {
+  static a *h;
   a *i;
 
 public:


-- 
diff --git a/gcc/testsuite/g++.dg/vect/pr99149.cc b/gcc/testsuite/g++.dg/vect/pr99149.cc
index 00ebe9d9cdf600ada8e66b4b854f0e18ad0b6a7d..4b885a5d432130d5eff3e96c833ec6c97de3e95d 100755
--- a/gcc/testsuite/g++.dg/vect/pr99149.cc
+++ b/gcc/testsuite/g++.dg/vect/pr99149.cc
@@ -11,8 +11,8 @@ public:
   a operator*(a d) { return a(b * b - c * c, b * c + c * d.b); }
 };
 int f, g;
-class {
-  a *h;
+class mp {
+  static a *h;
   a *i;
 
 public:



[PATCH v6] : Add pragma GCC target("general-regs-only")

2021-08-03 Thread H.J. Lu via Gcc-patches
On Tue, Aug 3, 2021 at 4:47 AM Richard Biener
 wrote:
>
> On Sun, Jul 18, 2021 at 3:46 AM H.J. Lu  wrote:
> >
> > On Thu, Apr 22, 2021 at 7:30 AM Richard Biener via Gcc-patches
> >  wrote:
> > >
> > > On Thu, Apr 22, 2021 at 2:52 PM Richard Biener
> > >  wrote:
> > > >
> > > > On Thu, Apr 22, 2021 at 2:22 PM Jakub Jelinek  wrote:
> > > > >
> > > > > On Thu, Apr 22, 2021 at 01:23:20PM +0200, Richard Biener via 
> > > > > Gcc-patches wrote:
> > > > > > > The question is if the pragma GCC target right now behaves 
> > > > > > > incrementally
> > > > > > > or not, whether
> > > > > > > #pragma GCC target("avx2")
> > > > > > > adds -mavx2 to options if it was missing before and nothing 
> > > > > > > otherwise, or if
> > > > > > > it switches other options off.  If it is incremental, we could 
> > > > > > > e.g. try to
> > > > > > > use the second least significant bit of global_options_set.x_* to 
> > > > > > > mean
> > > > > > > this option has been set explicitly by some surrounding #pragma 
> > > > > > > GCC target.
> > > > > > > The normal tests - global_options_set.x_flag_whatever could still 
> > > > > > > work
> > > > > > > fine because they wouldn't care if the option was explicit from 
> > > > > > > anywhere
> > > > > > > (command line or GCC target or target attribute) and just & 2 
> > > > > > > would mean
> > > > > > > it was explicit from pragma GCC target; though there is the case 
> > > > > > > of
> > > > > > > bitfields... And then the inlining decision could check the & 2 
> > > > > > > flags to
> > > > > > > see what is required and what is just from command line.
> > > > > > > Or we can have some other pragma GCC that would be like target 
> > > > > > > but would
> > > > > > > have flags that are explicit (and could e.g. be more restricted, 
> > > > > > > to ISA
> > > > > > > options only, and let those use in addition to #pragma GCC target.
> > > > > >
> > > > > > I'm still curious as to what you think will break if always-inline 
> > > > > > does what
> > > > > > it is documented to do.
> > > > >
> > > > > We will silently accept calling intrinsics that must be used only in 
> > > > > certain
> > > > > ISA contexts, which will lead to people writing non-portable code.
> > > > >
> > > > > So -O2 -mno-avx
> > > > > #include 
> > > > >
> > > > > void
> > > > > foo (__m256 *x)
> > > > > {
> > > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > > > }
> > > > > etc. will now be accepted when it shouldn't be.
> > > > > clang rejects it like gcc with:
> > > > > 1.c:6:10: error: always_inline function '_mm256_sub_ps' requires 
> > > > > target feature 'avx', but would be inlined into function 'foo' that 
> > > > > is compiled without support for 'avx'
> > > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > > >  ^
> > > > >
> > > > > Note, if I do:
> > > > > #include 
> > > > >
> > > > > __attribute__((target ("no-sse3"))) void
> > > > > foo (__m256 *x)
> > > > > {
> > > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > > > }
> > > > > and compile
> > > > > clang -S -O2 -mavx2 1.c
> > > > > 1.c:6:10: error: always_inline function '_mm256_sub_ps' requires 
> > > > > target feature 'avx', but would be inlined into function 'foo' that 
> > > > > is compiled without support for 'avx'
> > > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > > >  ^
> > > > > then from the error message it seems that unlike GCC, clang remembers
> > > > > the exact target features that are needed for the intrinsics and 
> > > > > checks just
> > > > > those.
> > > > > Though, looking at the preprocessed source, seems it uses
> > > > > static __inline __m256 __attribute__((__always_inline__, __nodebug__, 
> > > > > __target__("avx"), __min_vector_width__(256)))
> > > > > _mm256_sub_ps(__m256 __a, __m256 __b)
> > > > > {
> > > > >   return (__m256)((__v8sf)__a-(__v8sf)__b);
> > > > > }
> > > > > and not target pragmas.
> > > > >
> > > > > Anyway, if we tweak our intrinsic headers so that
> > > > > -#ifndef __AVX__
> > > > >  #pragma GCC push_options
> > > > >  #pragma GCC target("avx")
> > > > > -#define __DISABLE_AVX__
> > > > > -#endif /* __AVX__ */
> > > > >
> > > > > ...
> > > > > -#ifdef __DISABLE_AVX__
> > > > > -#undef __DISABLE_AVX__
> > > > >  #pragma GCC pop_options
> > > > > -#endif /* __DISABLE_AVX__ */
> > > > > and do the opts_set->x_* & 2 stuff on explicit options coming out of
> > > > > target/optimize pragmas and attributes, perhaps we don't even need
> > > > > to introduce a new attribute and can handle everything magically:
> > >
> > > Oh, and any such changes will likely interact with Martins ideas to rework
> > > how optimize and target attributes work (aka adding ontop of the
> > > commandline options).  That is, attribute target will then not be enough
> > > to remember the exact set of needed ISA features (as opposed to what
> > > likely clang implements?)
> > >
> > > > > 1) if it is gnu_inline extern inline, allow indirect calls, otherwise
> > > > > disallow them for always_inline functions
> > > 

[committed] libstdc++: Avoid using std::unique_ptr in <locale>

2021-08-03 Thread Jonathan Wakely via Gcc-patches
std::wstring_convert and std::wbuffer_convert types are not copyable or
movable, and store a plain pointer without a deleter. That means a much
simpler type that just uses delete in its destructor can be used instead
of std::unique_ptr.

That avoids including and parsing all of <memory> in every
header that includes <locale>. It also avoids instantiating
unique_ptr and std::tuple> when the conversion
utilities are used.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/bits/locale_conv.h (__detail::_Scoped_ptr): Define new
RAII class template.
(wstring_convert, wbuffer_convert): Use __detail::_Scoped_ptr
instead of unique_ptr.

Tested powerpc64le-linux. Committed to trunk.

commit a1a2654cdc90e9aa561a0e853b4b1372892afb70
Author: Jonathan Wakely 
Date:   Mon Aug 2 17:12:52 2021

libstdc++: Avoid using std::unique_ptr in <locale>

std::wstring_convert and std::wbuffer_convert types are not copyable or
movable, and store a plain pointer without a deleter. That means a much
simpler type that just uses delete in its destructor can be used instead
of std::unique_ptr.

That avoids including and parsing all of <memory> in every
header that includes <locale>. It also avoids instantiating
unique_ptr and std::tuple> when the conversion
utilities are used.

Signed-off-by: Jonathan Wakely 

libstdc++-v3/ChangeLog:

* include/bits/locale_conv.h (__detail::_Scoped_ptr): Define new
RAII class template.
(wstring_convert, wbuffer_convert): Use __detail::_Scoped_ptr
instead of unique_ptr.

diff --git a/libstdc++-v3/include/bits/locale_conv.h 
b/libstdc++-v3/include/bits/locale_conv.h
index 0e409da9876..6af8a5bdc8f 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -38,7 +38,6 @@
 #include 
 #include 
 #include 
-#include 
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
@@ -221,6 +220,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 #endif  // _GLIBCXX_USE_CHAR8_T
 
+  namespace __detail
+  {
+    template<typename _Tp>
+  struct _Scoped_ptr
+  {
+   __attribute__((__nonnull__(2)))
+   explicit
+   _Scoped_ptr(_Tp* __ptr) noexcept
+   : _M_ptr(__ptr)
+   { }
+
+   _Scoped_ptr(_Tp* __ptr, const char* __msg)
+   : _M_ptr(__ptr)
+   {
+ if (!__ptr)
+   __throw_logic_error(__msg);
+   }
+
+   ~_Scoped_ptr() { delete _M_ptr; }
+
+   _Scoped_ptr(const _Scoped_ptr&) = delete;
+   _Scoped_ptr& operator=(const _Scoped_ptr&) = delete;
+
+   __attribute__((__returns_nonnull__))
+   _Tp* operator->() const noexcept { return _M_ptr; }
+
+   _Tp& operator*() const noexcept { return *_M_ptr; }
+
+  private:
+   _Tp* _M_ptr;
+  };
+  }
+
 #ifdef _GLIBCXX_USE_WCHAR_T
 
 _GLIBCXX_BEGIN_NAMESPACE_CXX11
@@ -247,11 +279,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
* Takes ownership of @p __pcvt and will delete it in the destructor.
*/
   explicit
-  wstring_convert(_Codecvt* __pcvt) : _M_cvt(__pcvt)
-  {
-   if (!_M_cvt)
- __throw_logic_error("wstring_convert");
-  }
+  wstring_convert(_Codecvt* __pcvt) : _M_cvt(__pcvt, "wstring_convert")
+  { }
 
   /** Construct with an initial converstion state.
*
@@ -262,11 +291,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
* The object's conversion state will persist between conversions.
*/
   wstring_convert(_Codecvt* __pcvt, state_type __state)
-  : _M_cvt(__pcvt), _M_state(__state), _M_with_cvtstate(true)
-  {
-   if (!_M_cvt)
- __throw_logic_error("wstring_convert");
-  }
+  : _M_cvt(__pcvt, "wstring_convert"),
+   _M_state(__state), _M_with_cvtstate(true)
+  { }
 
   /** Construct with error strings.
*
@@ -279,10 +306,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
   : _M_cvt(new _Codecvt),
_M_byte_err_string(__byte_err), _M_wide_err_string(__wide_err),
_M_with_strings(true)
-  {
-   if (!_M_cvt)
- __throw_logic_error("wstring_convert");
-  }
+  { }
 
   ~wstring_convert() = default;
 
@@ -370,7 +394,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
   state_type state() const { return _M_state; }
 
 private:
-  unique_ptr<_Codecvt> _M_cvt;
+  __detail::_Scoped_ptr<_Codecvt>  _M_cvt;
   byte_string  _M_byte_err_string;
   wide_string  _M_wide_err_string;
   state_type   _M_state = state_type();
@@ -405,13 +429,9 @@ _GLIBCXX_END_NAMESPACE_CXX11
   explicit
   wbuffer_convert(streambuf* __bytebuf, _Codecvt* __pcvt = new _Codecvt,
  state_type __state = state_type())
-  : _M_buf(__bytebuf), _M_cvt(__pcvt), _M_state(__state)
+  : _M_buf(__bytebuf), _M_cvt(__pcvt, "wbuffer_convert"),
+   _M_state(__state), _M_always_noconv(_M_cvt->always_noconv())
   {
-   if (!_M_cvt)
- 

[PATCH] by_pieces: Properly set m_max_size in op_by_pieces

2021-08-03 Thread H.J. Lu via Gcc-patches
1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
move is enabled since x86 uses vec_duplicate, which is enabled only when
inter-unit move is enabled, to implement store_by_pieces.
2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
compare_by_pieces.

gcc/

PR target/101742
* expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
for compare_by_pieces.
* config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.

gcc/testsuite/

PR target/101742
* gcc.target/i386/pr101742a.c: New test.
* gcc.target/i386/pr101742b.c: Likewise.
---
 gcc/config/i386/i386.h| 20 +++-
 gcc/expr.c|  6 +-
 gcc/testsuite/gcc.target/i386/pr101742a.c | 16 
 gcc/testsuite/gcc.target/i386/pr101742b.c |  4 
 4 files changed, 36 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index bed9cd9da18..9b416abd5f4 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1783,15 +1783,17 @@ typedef struct ix86_args {
 /* STORE_MAX_PIECES is the number of bytes at a time that we can
store efficiently.  */
 #define STORE_MAX_PIECES \
-  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
-   ? 64 \
-   : ((TARGET_AVX \
-   && !TARGET_PREFER_AVX128 \
-   && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
-  ? 32 \
-  : ((TARGET_SSE2 \
- && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
-? 16 : UNITS_PER_WORD)))
+  (TARGET_INTER_UNIT_MOVES_TO_VEC \
+   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+  ? 64 \
+  : ((TARGET_AVX \
+ && !TARGET_PREFER_AVX128 \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+ ? 32 \
+ : ((TARGET_SSE2 \
+ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+ ? 16 : UNITS_PER_WORD))) \
+   : UNITS_PER_WORD)
 
 /* If a memory-to-memory move would take MOVE_RATIO or more simple
move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..2964b38b9a5 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
 m_from (from, from_load, from_cfn, from_cfn_data),
-m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+m_len (len),
+m_max_size (((!to_load && from == nullptr)
+? STORE_MAX_PIECES
+: (from_cfn != nullptr
+   ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1),
 m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c 
b/gcc/testsuite/gcc.target/i386/pr101742a.c
new file mode 100644
index 000..67ea40587dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2" } */
+
+int n2;
+
+__attribute__ ((simd)) char
+w7 (void)
+{
+  short int xb = n2;
+  int qp;
+
+  for (qp = 0; qp < 2; ++qp)
+xb = xb < 1;
+
+  return xb;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c 
b/gcc/testsuite/gcc.target/i386/pr101742b.c
new file mode 100644
index 000..ba19064077b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2 
-mtune-ctrl=sse_unaligned_store_optimal" } */
+
+#include "pr101742a.c"
-- 
2.31.1



[PATCH] vect: Tweak comparisons with existing epilogue loops

2021-08-03 Thread Richard Sandiford via Gcc-patches
This patch uses a more accurate scalar iteration estimate when
comparing the epilogue of a constant-iteration loop with a candidate
replacement epilogue.

In the testcase, the patch prevents a 1-to-3-element SVE epilogue
from seeming better than a 64-bit Advanced SIMD epilogue.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


gcc/
* tree-vect-loop.c (vect_better_loop_vinfo_p): Detect cases in
which old_loop_vinfo is an epilogue loop that handles a constant
number of iterations.

gcc/testsuite/
* gcc.target/aarch64/sve/cost_model_12.c: New test.
---
 .../gcc.target/aarch64/sve/cost_model_12.c| 19 +++
 gcc/tree-vect-loop.c  | 10 +-
 2 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 0009d0964af..0a5b65adb04 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2778,7 +2778,15 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
 
   /* Limit the VFs to what is likely to be the maximum number of iterations,
  to handle cases in which at least one loop_vinfo is fully-masked.  */
-  HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
+  HOST_WIDE_INT estimated_max_niter;
+  loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo);
+  unsigned HOST_WIDE_INT main_vf;
+  if (main_loop
+  && LOOP_VINFO_NITERS_KNOWN_P (main_loop)
+  && LOOP_VINFO_VECT_FACTOR (main_loop).is_constant (&main_vf))
+estimated_max_niter = LOOP_VINFO_INT_NITERS (main_loop) % main_vf;
+  else
+estimated_max_niter = likely_max_stmt_executions_int (loop);
   if (estimated_max_niter != -1)
 {
   if (known_le (estimated_max_niter, new_vf))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c 
b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c
new file mode 100644
index 000..4c5226e05de
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cost_model_12.c
@@ -0,0 +1,19 @@
+/* { dg-options "-O3 -mtune=neoverse-512tvb" } */
+
+void
+f (float x[restrict 10][1024],
+   float y[restrict 10][1024], float z)
+{
+  for (int i = 0; i < 10; ++i)
+{
+#pragma GCC unroll 10
+  for (int j = 0; j < 10; ++j)
+   x[j][i] = y[j][i] * z;
+}
+}
+
+/* We should unroll the outer loop, with 2x 16-byte vectors and 1x
+   8-byte vectors.  */
+/* { dg-final { scan-assembler-not {\tptrue\t} } } */
+/* { dg-final { scan-assembler {\tv[0-9]+\.4s,} } } */
+/* { dg-final { scan-assembler {\tv[0-9]+\.2s,} } } */


Re: [PATCH] x86: Use XMM31 for scratch SSE register

2021-08-03 Thread H.J. Lu via Gcc-patches
On Tue, Aug 3, 2021 at 5:10 AM H.J. Lu  wrote:
>
> On Tue, Aug 3, 2021 at 1:43 AM Uros Bizjak  wrote:
> >
> > On Tue, Aug 3, 2021 at 10:15 AM Hongtao Liu  wrote:
> > >
> > > On Tue, Aug 3, 2021 at 4:03 PM Uros Bizjak via Gcc-patches
> > >  wrote:
> > > >
> > > > On Mon, Aug 2, 2021 at 7:47 PM H.J. Lu  wrote:
> > > > >
> > > > > In 64-bit mode, use XMM31 for scratch SSE register to avoid vzeroupper
> > > > > if possible.
> > > > >
> > > > > gcc/
> > > > >
> > > > > * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit 
> > > > > mode,
> > > > > try XMM31 to avoid vzeroupper.
> > > > >
> > > > > gcc/testsuite/
> > > > >
> > > > > * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to
> > > > > disable XMM31.
> > > > > * gcc.target/i386/avx-vzeroupper-15.c: Likewise.
> > > > > * gcc.target/i386/pr82941-1.c: Updated.  Check for vzeroupper.
> > > > > * gcc.target/i386/pr82942-1.c: Likewise.
> > > > > * gcc.target/i386/pr82990-1.c: Likewise.
> > > > > * gcc.target/i386/pr82990-3.c: Likewise.
> > > > > * gcc.target/i386/pr82990-5.c: Likewise.
> > > > > * gcc.target/i386/pr100865-4b.c: Likewise.
> > > > > * gcc.target/i386/pr100865-6b.c: Likewise.
> > > > > * gcc.target/i386/pr100865-7b.c: Likewise.
> > > > > * gcc.target/i386/pr100865-10b.c: Likewise.
> > > > > * gcc.target/i386/pr100865-8b.c: Updated.
> > > > > * gcc.target/i386/pr100865-9b.c: Likewise.
> > > > > * gcc.target/i386/pr100865-11b.c: Likewise.
> > > > > * gcc.target/i386/pr100865-12b.c: Likewise.
> > > > > ---
> > > > >  gcc/config/i386/i386.c | 18 
> > > > > +++---
> > > > >  .../gcc.target/i386/avx-vzeroupper-14.c|  2 +-
> > > > >  .../gcc.target/i386/avx-vzeroupper-15.c|  2 +-
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-10b.c   |  1 +
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-11b.c   |  2 +-
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-12b.c   |  2 +-
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-4b.c|  2 ++
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-6b.c|  5 -
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-7b.c|  5 -
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-8b.c|  2 +-
> > > > >  gcc/testsuite/gcc.target/i386/pr100865-9b.c|  2 +-
> > > > >  gcc/testsuite/gcc.target/i386/pr82941-1.c  |  3 ++-
> > > > >  gcc/testsuite/gcc.target/i386/pr82942-1.c  |  3 ++-
> > > > >  gcc/testsuite/gcc.target/i386/pr82990-1.c  |  3 ++-
> > > > >  gcc/testsuite/gcc.target/i386/pr82990-3.c  |  3 ++-
> > > > >  gcc/testsuite/gcc.target/i386/pr82990-5.c  |  3 ++-
> > > > >  16 files changed, 42 insertions(+), 16 deletions(-)
> > > > >
> > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > index 842eb0e6786..ec0690876b7 100644
> > > > > --- a/gcc/config/i386/i386.c
> > > > > +++ b/gcc/config/i386/i386.c
> > > > > @@ -23335,9 +23335,21 @@ rtx
> > > > >  ix86_gen_scratch_sse_rtx (machine_mode mode)
> > > > >  {
> > > > >if (TARGET_SSE && !lra_in_progress)
> > > > > -return gen_rtx_REG (mode, (TARGET_64BIT
> > > > > -  ? LAST_REX_SSE_REG
> > > > > -  : LAST_SSE_REG));
> > > > > +{
> > > > > +  unsigned int regno;
> > > > > +  if (TARGET_64BIT)
> > > > > +   {
> > > > > + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always
> > > > > +use XMM31 for CSE.  */
> > > > > + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
> > > > > +   regno = LAST_EXT_REX_SSE_REG;
> > > > > + else
> > > > > +   regno = LAST_REX_SSE_REG;
> > > > > +   }
> > > > > +  else
> > > > > +   regno = LAST_SSE_REG;
> > > >
> > > > Assuming that ix86_hard_regno_mode_ok always returns false for XMM31
> > > > in 64bit mode, we can do:
> > > >
> > > > /* Use XMM31 if available to avoid vzeroupper.  */
> > > > if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
>
> It doesn't work for -m32 since ix86_hard_regno_mode_ok doesn't check
> for TARGET_64BIT.   LAST_EXT_REX_SSE_REG is used for -m32:

I will check in my original patch as is.

Thanks.

> $ /export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc
> -B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/
> /export/gnu/import/git/gitlab/x86-gcc/gcc/testsuite/gcc.target/i386/pr82941-1.c
> -m32 -fdiagnostics-plain-output -O2 -march=skylake-avx512
> -ffat-lto-objects -fno-ident -S -o pr82941-1.s
> xgcc: internal compiler error: Segmentation fault signal terminated program 
> cc1
> Please submit a full bug report,
> with preprocessed source if appropriate.
> See <https://gcc.gnu.org/bugs/> for instructions.
> $
>
> Program received signal SIGSEGV, Segmentation fault.
> 0x01201c64 in general_operand (
> op= 0x7bffeff8>, mode= 

Sanity check that 'Init' doesn't appear without 'Var' in '*.opt' files

2021-08-03 Thread Thomas Schwinge
Hi!

Is the attached OK to push?

No violations found per:

$ find -type f -name \*.opt | xargs grep -F 'Init(' | grep -v -F 'Var('

..., and manually verified the error condition.


Grüße
 Thomas


-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
>From 67b88991c4a37e63bfecd9a0a17d9d7561b23dce Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Tue, 3 Aug 2021 14:59:56 +0200
Subject: [PATCH] Sanity check that 'Init' doesn't appear without 'Var' in
 '*.opt' files

... as that doesn't make sense.

@item Init(@var{value})
The variable specified by the @code{Var} property should be statically
initialized to @var{value}.  [...]

	gcc/
	* optc-gen.awk: Sanity check that 'Init' doesn't appear without
	'Var'.
---
 gcc/optc-gen.awk | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/optc-gen.awk b/gcc/optc-gen.awk
index 880ac776d8a..77e598efd60 100644
--- a/gcc/optc-gen.awk
+++ b/gcc/optc-gen.awk
@@ -195,10 +195,14 @@ for (i = 0; i < n_extra_vars; i++) {
 }
 for (i = 0; i < n_opts; i++) {
 	name = var_name(flags[i]);
-	if (name == "")
+	init = opt_args("Init", flags[i])
+
+	if (name == "") {
+		if (init != "")
+		print "#error " opts[i] " must specify Var to use Init"
 		continue;
+	}
 
-	init = opt_args("Init", flags[i])
 	if (init != "") {
 		if (name in var_init && var_init[name] != init)
 			print "#error multiple initializers for " name
-- 
2.25.1



Re: [PATCH] vect: Tweak dump messages for vector mode choice

2021-08-03 Thread Richard Biener via Gcc-patches
On Tue, Aug 3, 2021 at 2:35 PM Richard Sandiford via Gcc-patches
 wrote:
>
> After vect_analyze_loop has successfully analysed a loop for
> one base vector mode B1, it considers using following base vector
> modes to vectorise an epilogue.  However, for VECT_COMPARE_COSTS,
> a later mode B2 might turn out to be better than B1 was.  Initially
> this comparison will be between an epilogue loop (for B2) and a main
> loop (for B1).  However, in r11-6458 I'd added code to reanalyse the
> B2 epilogue loop as a main loop, partly for correctness and partly
> for better costing.
>
> This can lead to a situation in which we think that the B2 epilogue
> loop was better than the B1 main loop, but that the B2 main loop is
> not better than the B1 main loop.  There was no dump message to say
> that this had happened, which made it look like B2 had still won.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

OK.

> Richard
>
>
> gcc/
> * tree-vect-loop.c (vect_analyze_loop): Print a dump message
> when a reanalyzed loop fails to be cheaper than the current
> main loop.
> ---
>  gcc/tree-vect-loop.c | 11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 00a57b2ba62..48a54b0957f 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -3064,7 +3064,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
> *shared)
> = opt_loop_vec_info::success (main_loop_vinfo);
> }
>   else
> -   delete main_loop_vinfo;
> +   {
> + if (dump_enabled_p ())
> +   dump_printf_loc (MSG_NOTE, vect_location,
> +"* No longer preferring vector"
> +" mode %s after reanalyzing the loop"
> +" as a main loop\n",
> +GET_MODE_NAME
> +  (main_loop_vinfo->vector_mode));
> + delete main_loop_vinfo;
> +   }
> }
> }
>
> --
> 2.17.1
>


Re: [PATCH v3 1/2] rs6000: Add support for _mm_minpos_epu16

2021-08-03 Thread Paul A. Clarke via Gcc-patches
On Mon, Aug 02, 2021 at 05:29:08PM -0500, Segher Boessenkool wrote:
> On Thu, Jul 15, 2021 at 06:29:17PM -0500, Paul A. Clarke wrote:
> > Add a naive implementation of the subject x86 intrinsic to
> > ease porting.
> 
> > --- a/gcc/config/rs6000/smmintrin.h
> > +++ b/gcc/config/rs6000/smmintrin.h
> > @@ -172,4 +172,31 @@ _mm_test_mix_ones_zeros (__m128i __A, __m128i __mask)
> >return any_ones * any_zeros;
> >  }
> >  
> > +/* Return horizontal packed word minimum and its index in bits [15:0]
> > +   and bits [18:16] respectively.  */
> > +__inline __m128i
> > +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > +_mm_minpos_epu16 (__m128i __A)
> > +{
> > +  union __u
> > +{
> > +  __m128i __m;
> > +  __v8hu __uh;
> > +};
> > +  union __u __u = { .__m = __A }, __r = { .__m = {0} };
> > +  unsigned short __ridx = 0;
> > +  unsigned short __rmin = __u.__uh[__ridx];
> > +  for (unsigned long __i = 1; __i < 8; __i++)
> > +{
> > +  if (__u.__uh[__i] < __rmin)
> > +   {
> > + __rmin = __u.__uh[__i];
> > + __ridx = __i;
> > +   }
> > +}
> > +  __r.__uh[0] = __rmin;
> > +  __r.__uh[1] = __ridx;
> > +  return __r.__m;
> > +}
> 
> As before: does this work correctly on BE?  Was it tested there?

Per the "cover letter":
| Tested on BE, LE (32 and 64bit).

> Okay for trunk if so.  Thanks!

Thanks! I'll push this shortly.

PC


Re: [ARM] PR66791: Replace builtins for fp and unsigned vmul_n intrinsics

2021-08-03 Thread Christophe Lyon via Gcc-patches
On Mon, Jul 19, 2021 at 2:34 PM Prathamesh Kulkarni <
prathamesh.kulka...@linaro.org> wrote:

> On Thu, 15 Jul 2021 at 16:46, Prathamesh Kulkarni
>  wrote:
> >
> > On Thu, 15 Jul 2021 at 14:47, Christophe Lyon
> >  wrote:
> > >
> > > Hi Prathamesh,
> > >
> > > On Mon, Jul 5, 2021 at 11:25 AM Kyrylo Tkachov via Gcc-patches <
> gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >>
> > >>
> > >> > -Original Message-
> > >> > From: Prathamesh Kulkarni 
> > >> > Sent: 05 July 2021 10:18
> > >> > To: gcc Patches ; Kyrylo Tkachov
> > >> > 
> > >> > Subject: [ARM] PR66791: Replace builtins for fp and unsigned vmul_n
> > >> > intrinsics
> > >> >
> > >> > Hi Kyrill,
> > >> > I assume this patch is OK to commit after bootstrap+testing ?
> > >>
> > >> Yes.
> > >> Thanks,
> > >> Kyrill
> > >>
> > >
> > >
> > > The updated testcase fails on some configs:
> > > gcc.target/arm/armv8_2-fp16-neon-2.c: vdup\\.16\\tq[0-9]+, r[0-9]+
> found 2 times
> > > FAIL:  gcc.target/arm/armv8_2-fp16-neon-2.c scan-assembler-times
> vdup\\.16\\tq[0-9]+, r[0-9]+ 3
> > >
> > > For instance on arm-none-eabi with default configuration flags
> (mode/cpu/fpu)
> > > and default runtestflags.
> > > The same toolchain config also fails on this test when overriding
> runtestflags with:
> > > -mthumb/-mfloat-abi=soft/-march=armv6s-m
> > > -mthumb/-mfloat-abi=soft/-march=armv7-m
> > > -mthumb/-mfloat-abi=soft/-march=armv8.1-m.main
> > >
> > > Can you fix this please?
> > Hi Christophe,
> > Sorry for the breakage, I will take a look.
> The issue is for the following function;
>
> float16x8_t f2 (float16x8_t __a, float16_t __b) {
>   return __a * __b;
> }
>
> With -O2 -ffast-math -mfloat-abi=softfp -march=armv8.2-a+fp16, it
> generates:
> f2:
> ldrhip, [sp]@ __fp16
> vmovd18, r0, r1  @ v8hf
> vmovd19, r2, r3
> vdup.16 q8, ip
> vmul.f16q8, q8, q9
> vmovr0, r1, d16  @ v8hf
> vmovr2, r3, d17
> bx  lr
>
> It correctly generates vdup, but IIUC, r0-r3 are used up in loading
> 'a' into q9 (d18 / d19),
> and it uses ip for loading 'b' and ends up with vdup q8, ip, and thus
> the scan for "vdup\\.16\\tq[0-9]+, r[0-9]+" fails.
> I tried to adjust the scan to following to accommodate ip:
> /* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, (r[0-9]+|ip)} 3 }
> }  */
> but that still FAIL's because log shows:
> gcc.target/arm/armv8_2-fp16-neon-2.c: vdup\\.16\\tq[0-9]+,
> (r[0-9]+|ip) found 6 times
>
> Could you suggest how should I adjust the test, so the second operand
> can be either r[0-9]+ or ip register ?
>
>
Sorry for the delay, I was on vacation.

I don't know off-hand how to adjust the test, did you check why it matched
6 times?

Christophe


> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > > Thanks,
> > >
> > > Christophe
> > >
> > >> >
> > >> > Thanks,
> > >> > Prathamesh
>


[PATCH] vect: Tweak dump messages for vector mode choice

2021-08-03 Thread Richard Sandiford via Gcc-patches
After vect_analyze_loop has successfully analysed a loop for
one base vector mode B1, it considers using following base vector
modes to vectorise an epilogue.  However, for VECT_COMPARE_COSTS,
a later mode B2 might turn out to be better than B1 was.  Initially
this comparison will be between an epilogue loop (for B2) and a main
loop (for B1).  However, in r11-6458 I'd added code to reanalyse the
B2 epilogue loop as a main loop, partly for correctness and partly
for better costing.

This can lead to a situation in which we think that the B2 epilogue
loop was better than the B1 main loop, but that the B2 main loop is
not better than the B1 main loop.  There was no dump message to say
that this had happened, which made it look like B2 had still won.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


gcc/
* tree-vect-loop.c (vect_analyze_loop): Print a dump message
when a reanalyzed loop fails to be cheaper than the current
main loop.
---
 gcc/tree-vect-loop.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 00a57b2ba62..48a54b0957f 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -3064,7 +3064,16 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
= opt_loop_vec_info::success (main_loop_vinfo);
}
  else
-   delete main_loop_vinfo;
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"* No longer preferring vector"
+" mode %s after reanalyzing the loop"
+" as a main loop\n",
+GET_MODE_NAME
+  (main_loop_vinfo->vector_mode));
+ delete main_loop_vinfo;
+   }
}
}
 
-- 
2.17.1



Re: [PATCH, v2, libgomp, OpenMP 5.0] Implement omp_get_device_num

2021-08-03 Thread Thomas Schwinge
Hi Chung-Lin!

On 2021-08-02T21:10:57+0800, Chung-Lin Tang  wrote:
> --- a/libgomp/fortran.c
> +++ b/libgomp/fortran.c

> +int32_t
> +omp_get_device_num_ (void)
> +{
> +  return omp_get_device_num ();
> +}

Missing 'ialias_redirect (omp_get_device_num)'?


Grüße
 Thomas
-
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955


Re: [PATCH] Add a simple fraction class

2021-08-03 Thread Richard Biener via Gcc-patches
On Tue, Aug 3, 2021 at 1:58 PM Richard Sandiford
 wrote:
>
> Richard Biener  writes:
> > On Mon, Aug 2, 2021 at 1:31 PM Richard Sandiford
> >  wrote:
> >>
> >> Richard Biener  writes:
> >> > On Mon, Aug 2, 2021 at 12:43 PM Richard Sandiford
> >> >  wrote:
> >> >>
> >> >> Richard Biener via Gcc-patches  writes:
> >> >> > On Fri, Jul 30, 2021 at 5:59 PM Richard Sandiford via Gcc-patches
> >> >> >  wrote:
> >> >> >>
> >> >> >> This patch adds a simple class for holding A/B fractions.
> >> >> >> As the comments in the patch say, the class isn't designed
> >> >> >> to have nice numerial properties at the extremes.
> >> >> >>
> >> >> >> The motivating use case was some aarch64 costing work,
> >> >> >> where being able to represent fractions was much easier
> >> >> >> than using single integers and avoided the rounding errors
> >> >> >> that would come with using floats.  (Unlike things like
> >> >> >> COSTS_N_INSNS, there was no sensible constant base factor
> >> >> >> that could be used.)
> >> >> >>
> >> >> >> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
> >> >> >
> >> >> > Hmm, we use the sreal type for profiles.  I don't see any 
> >> >> > overflow/underflow
> >> >> > handling in your class - I suppose you're going to use it on integer 
> >> >> > types
> >> >> > given we're not allowed to use native FP?
> >> >>
> >> >> Yeah, I'm going to use it on integer types.  And it's not designed
> >> >> to have nice properties at extremes, including handling underflow and
> >> >> overflow.
> >> >
> >> > So maybe assert that it doesn't?  In particular nominator/denominator
> >> > are prone to overflowing in fractional representations.
> >> >
> >> > There's the option to round or ICE.  Or rather than the only option
> >> > is to round (or use a more expensive arbitrary precision representation).
> >>
> >> Yeah, I guess we could do that, but it semes inconsistent to assert
> >> for these costs and not do it for vector costs in general.  I think it's
> >> difficult to guarantee that there is no user input for which the current
> >> vector costs overflow.  And if we assert, we have to have a reason for
> >> believing that no such user input exists (modulo bugs).
> >>
> >> E.g. vect-inner-loop-cost-factor has an upper limit of 999999, so the
> >> existing code only needs a cost of 2148 to overflow “int”.
> >
> > I'd argue those are of course bugs.  The 99 upper bound is way
> > too big given REB_BR_PROB_BASE is only 1.  But then we're now
> > set up to initialize vinfo->inner_loop_cost_factor based on profile data
> > (if it is reliable).
> >
> >> > So the question is whether the fractional behavior is better in more
> >> > cases than the sreal behavior (I can easily believe it is).
> >> >
> >> >> I want to use it in costing code, where we already happily multiply
> >> >> and add “int”-sized costs without worrying about overflow.  I'll be
> >> >> using uint64_t for the fractions though, just in case. :-)
> >> >>
> >> >> sreal doesn't help because it's still significand/exponent.  That 
> >> >> matters
> >> >> because…
> >> >>
> >> >> > I mean, how exactly does
> >> >> > the class solve the problem of rounding errors?
> >> >>
> >> >> …I wanted something that represented the results exactly (barring any of
> >> >> integer ops overflowing).  This makes it meaningful to compare costs for
> >> >> equality.  It also means we can use ordered comparisons without having
> >> >> to introduce a fudge factor to cope with one calculation having 
> >> >> different
> >> >> intermediate rounding from the other.
> >> >
> >> > I think you're underestimating how quickly your denominator will 
> >> > overflow?
> >>
> >> Well, it depends on how you use it. :-)  I agree you have to go into
> >> this knowing the risks of the representation (but then I'd argue that's
> >> true for floats/sreals too, if you use them for costs).
> >
> > Yeah, and sreals handle overflow/underflow in a well-defined way because
> > profile info tends to be crap ;)
> >
> >> > So I suppose all factors of all possible denominators are known, in fact
> >> > whats your main source for the divisions?  The VF?
> >>
> >> Yeah, the set of possible dominators is fixed at compile time and
> >> relatively small, but not easily enumerable.  The VF is one source,
> >> but we also have “number of X per cycle” values.  The problem with sreal
> >> is that sometimes those “X per cycle” values are 3, and 1/3 is where the
> >> rounding problems with floats/sreals start to come in.
> >>
> >> I'm fairly sure that using a uint64_t fractional representation for
> >> int costs and these set of denominator values is safe.  But if we
> >> think that this is just too dangerous to advertise as a general
> >> class within GCC, we could make it local to the aarch64 cost code
> >> instead.  Would that be OK?
> >
> > I think we should instead make its use safe, that is, simply round when
> > the denominator gets too big.  The gcn compute is already expensive
> > and so is 

Re: [PATCH] x86: Use XMM31 for scratch SSE register

2021-08-03 Thread H.J. Lu via Gcc-patches
On Tue, Aug 3, 2021 at 1:43 AM Uros Bizjak  wrote:
>
> On Tue, Aug 3, 2021 at 10:15 AM Hongtao Liu  wrote:
> >
> > On Tue, Aug 3, 2021 at 4:03 PM Uros Bizjak via Gcc-patches
> >  wrote:
> > >
> > > On Mon, Aug 2, 2021 at 7:47 PM H.J. Lu  wrote:
> > > >
> > > > In 64-bit mode, use XMM31 for scratch SSE register to avoid vzeroupper
> > > > if possible.
> > > >
> > > > gcc/
> > > >
> > > > * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode,
> > > > try XMM31 to avoid vzeroupper.
> > > >
> > > > gcc/testsuite/
> > > >
> > > > * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to
> > > > disable XMM31.
> > > > * gcc.target/i386/avx-vzeroupper-15.c: Likewise.
> > > > * gcc.target/i386/pr82941-1.c: Updated.  Check for vzeroupper.
> > > > * gcc.target/i386/pr82942-1.c: Likewise.
> > > > * gcc.target/i386/pr82990-1.c: Likewise.
> > > > * gcc.target/i386/pr82990-3.c: Likewise.
> > > > * gcc.target/i386/pr82990-5.c: Likewise.
> > > > * gcc.target/i386/pr100865-4b.c: Likewise.
> > > > * gcc.target/i386/pr100865-6b.c: Likewise.
> > > > * gcc.target/i386/pr100865-7b.c: Likewise.
> > > > * gcc.target/i386/pr100865-10b.c: Likewise.
> > > > * gcc.target/i386/pr100865-8b.c: Updated.
> > > > * gcc.target/i386/pr100865-9b.c: Likewise.
> > > > * gcc.target/i386/pr100865-11b.c: Likewise.
> > > > * gcc.target/i386/pr100865-12b.c: Likewise.
> > > > ---
> > > >  gcc/config/i386/i386.c | 18 +++---
> > > >  .../gcc.target/i386/avx-vzeroupper-14.c|  2 +-
> > > >  .../gcc.target/i386/avx-vzeroupper-15.c|  2 +-
> > > >  gcc/testsuite/gcc.target/i386/pr100865-10b.c   |  1 +
> > > >  gcc/testsuite/gcc.target/i386/pr100865-11b.c   |  2 +-
> > > >  gcc/testsuite/gcc.target/i386/pr100865-12b.c   |  2 +-
> > > >  gcc/testsuite/gcc.target/i386/pr100865-4b.c|  2 ++
> > > >  gcc/testsuite/gcc.target/i386/pr100865-6b.c|  5 -
> > > >  gcc/testsuite/gcc.target/i386/pr100865-7b.c|  5 -
> > > >  gcc/testsuite/gcc.target/i386/pr100865-8b.c|  2 +-
> > > >  gcc/testsuite/gcc.target/i386/pr100865-9b.c|  2 +-
> > > >  gcc/testsuite/gcc.target/i386/pr82941-1.c  |  3 ++-
> > > >  gcc/testsuite/gcc.target/i386/pr82942-1.c  |  3 ++-
> > > >  gcc/testsuite/gcc.target/i386/pr82990-1.c  |  3 ++-
> > > >  gcc/testsuite/gcc.target/i386/pr82990-3.c  |  3 ++-
> > > >  gcc/testsuite/gcc.target/i386/pr82990-5.c  |  3 ++-
> > > >  16 files changed, 42 insertions(+), 16 deletions(-)
> > > >
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > index 842eb0e6786..ec0690876b7 100644
> > > > --- a/gcc/config/i386/i386.c
> > > > +++ b/gcc/config/i386/i386.c
> > > > @@ -23335,9 +23335,21 @@ rtx
> > > >  ix86_gen_scratch_sse_rtx (machine_mode mode)
> > > >  {
> > > >if (TARGET_SSE && !lra_in_progress)
> > > > -return gen_rtx_REG (mode, (TARGET_64BIT
> > > > -  ? LAST_REX_SSE_REG
> > > > -  : LAST_SSE_REG));
> > > > +{
> > > > +  unsigned int regno;
> > > > +  if (TARGET_64BIT)
> > > > +   {
> > > > + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always
> > > > +use XMM31 for CSE.  */
> > > > + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
> > > > +   regno = LAST_EXT_REX_SSE_REG;
> > > > + else
> > > > +   regno = LAST_REX_SSE_REG;
> > > > +   }
> > > > +  else
> > > > +   regno = LAST_SSE_REG;
> > >
> > > Assuming that ix86_hard_regno_mode_ok always returns false for XMM31
> > > in 64bit mode, we can do:
> > >
> > > /* Use XMM31 if available to avoid vzeroupper.  */
> > > if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))

It doesn't work for -m32 since ix86_hard_regno_mode_ok doesn't check
for TARGET_64BIT.   LAST_EXT_REX_SSE_REG is used for -m32:

$ /export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/xgcc
-B/export/build/gnu/tools-build/gcc-gitlab-debug/build-x86_64-linux/gcc/
/export/gnu/import/git/gitlab/x86-gcc/gcc/testsuite/gcc.target/i386/pr82941-1.c
-m32 -fdiagnostics-plain-output -O2 -march=skylake-avx512
-ffat-lto-objects -fno-ident -S -o pr82941-1.s
xgcc: internal compiler error: Segmentation fault signal terminated program cc1
Please submit a full bug report,
with preprocessed source if appropriate.
See <https://gcc.gnu.org/bugs/> for instructions.
$

Program received signal SIGSEGV, Segmentation fault.
0x01201c64 in general_operand (
op=, mode=)
at /export/gnu/import/git/gitlab/x86-gcc/gcc/recog.c:1412
1412 {
(gdb) bt
#0  0x01201c64 in general_operand (
op=, mode=)
at /export/gnu/import/git/gitlab/x86-gcc/gcc/recog.c:1412
#1  0x0120221e in register_operand (op=0x75c10510, mode=E_V8DFmode)
at /export/gnu/import/git/gitlab/x86-gcc/gcc/recog.c:1559

Re: [PATCH v2] Make loops_list support an optional loop_p root

2021-08-03 Thread Richard Biener via Gcc-patches
On Fri, Jul 30, 2021 at 7:20 AM Kewen.Lin  wrote:
>
> on 2021/7/29 下午4:01, Richard Biener wrote:
> > On Fri, Jul 23, 2021 at 10:41 AM Kewen.Lin  wrote:
> >>
> >> on 2021/7/22 下午8:56, Richard Biener wrote:
> >>> On Tue, Jul 20, 2021 at 4:37
> >>> PM Kewen.Lin  wrote:
> 
>  Hi,
> 
>  This v2 has addressed some review comments/suggestions:
> 
>    - Use "!=" instead of "<" in function operator!= (const Iter &rhs)
>    - Add new CTOR loops_list (struct loops *loops, unsigned flags)
>  to support loop hierarchy tree rather than just a function,
>  and adjust to use loops* accordingly.
> >>>
> >>> I actually meant struct loop *, not struct loops * ;)  At the point
> >>> we pondered to make loop invariant motion work on single
> >>> loop nests we gave up not only but also because it iterates
> >>> over the loop nest but all the iterators only ever can process
> >>> all loops, not say, all loops inside a specific 'loop' (and
> >>> including that 'loop' if LI_INCLUDE_ROOT).  So the
> >>> CTOR would take the 'root' of the loop tree as argument.
> >>>
> >>> I see that doesn't trivially fit how loops_list works, at least
> >>> not for LI_ONLY_INNERMOST.  But I guess FROM_INNERMOST
> >>> could be adjusted to do ONLY_INNERMOST as well?
> >>>
> >>
> >>
> >> Thanks for the clarification!  I just realized that the previous
> >> version with struct loops* is problematic, all traversal is
> >> still bounded with outer_loop == NULL.  I think what you expect
> >> is to respect the given loop_p root boundary.  Since we just
> >> record the loops' nums, I think we still need the function* fn?
> >
> > Would it simplify things if we recorded the actual loop *?
> >
>
> I'm afraid it's unsafe to record the loop*.  I had the same
> question why the loop iterator uses index rather than loop* when
> I read this at the first time.  I guess the design of processing
> loops allows its user to update or even delete the folllowing
> loops to be visited.  For example, when the user does some tricks
> on one loop, then it duplicates the loop and its children to
> somewhere and then removes the loop and its children, when
> iterating onto its children later, the "index" way will check its
> validity by get_loop at that point, but the "loop *" way will
> have some recorded pointers to become dangling, can't do the
> validity check on itself, seems to need a side linear search to
> ensure the validity.
>
> > There's still the to_visit reserve which needs a bound on
> > the number of loops for efficiency reasons.
> >
>
> Yes, I still keep the fn in the updated version.
>
> >> So I add one optional argument loop_p root and update the
> >> visiting codes accordingly.  Before this change, the previous
> >> visiting uses the outer_loop == NULL as the termination condition,
> >> it perfectly includes the root itself, but with this given root,
> >> we have to use it as the termination condition to avoid to iterate
> >> onto its possible existing next.
> >>
> >> For LI_ONLY_INNERMOST, I was thinking whether we can use the
> >> code like:
> >>
> >> struct loops *fn_loops = loops_for_fn (fn)->larray;
> >> for (i = 0; vec_safe_iterate (fn_loops, i, &aloop); i++)
> >> if (aloop != NULL
> >> && aloop->inner == NULL
> >> && flow_loop_nested_p (tree_root, aloop))
> >>  this->to_visit.quick_push (aloop->num);
> >>
> >> it has the stable bound, but if the given root only has several
> >> child loops, it can be much worse if there are many loops in fn.
> >> It seems impossible to predict the given root loop hierarchy size,
> >> maybe we can still use the original linear searching for the case
> >> loops_for_fn (fn) == root?  But since this visiting seems not so
> >> performance critical, I chose to share the code originally used
> >> for FROM_INNERMOST, hope it can have better readability and
> >> maintainability.
> >
> > I was indeed looking for something that has execution/storage
> > bound on the subtree we're interested in.  If we pull the CTOR
> > out-of-line we can probably keep the linear search for
> > LI_ONLY_INNERMOST when looking at the whole loop tree.
> >
>
> OK, I've moved the suggested single loop tree walker out-of-line
> to cfgloop.c, and brought the linear search back for
> LI_ONLY_INNERMOST when looking at the whole loop tree.
>
> > It just seemed to me that we can eventually re-use a
> > single loop tree walker for all orders, just adjusting the
> > places we push.
> >
>
> Wow, good point!  Indeed, I have further unified all orders
> handlings into a single function walk_loop_tree.
>
> >>
> >> Bootstrapped and regtested on powerpc64le-linux-gnu P9,
> >> x86_64-redhat-linux and aarch64-linux-gnu, also
> >> bootstrapped on ppc64le P9 with bootstrap-O3 config.
> >>
> >> Does the attached patch meet what you expect?
> >
> > So yeah, it's probably close to what is sensible.  Not sure
> > whether optimizing the loops for the !only_push_innermost_p
> > case is 

Re: [PATCH, v2, libgomp, OpenMP 5.0] Implement omp_get_device_num

2021-08-03 Thread Thomas Schwinge
Hi Chung-Lin!

Just a few quick comments:

On 2021-08-02T21:10:57+0800, Chung-Lin Tang  wrote:
> On 2021/7/23 6:39 PM, Jakub Jelinek wrote:
>> On Fri, Jul 23, 2021 at 06:21:41PM +0800, Chung-Lin Tang wrote:
>>> --- a/libgomp/icv-device.c
>>> +++ b/libgomp/icv-device.c
>>> @@ -61,8 +61,17 @@ omp_is_initial_device (void)
>>> return 1;
>>>   }
>>>
>>> +int
>>> +omp_get_device_num (void)
>>> +{
>>> +  /* By specification, this is equivalent to omp_get_initial_device
>>> + on the host.  */
>>> +  return omp_get_initial_device ();
>>> +}
>>> +
>>
>> I think this won't work properly with the intel micoffload, where the host
>> libgomp is used in the offloaded code.
>> For omp_is_initial_device, the plugin solves it by:
>> liboffloadmic/plugin/offload_target_main.cpp
>> overriding it:
>> /* Override the corresponding functions from libgomp.  */
>> extern "C" int
>> omp_is_initial_device (void) __GOMP_NOTHROW
>> {
>>return 0;
>> }
>>
>> extern "C" int32_t
>> omp_is_initial_device_ (void)
>> {
>>return omp_is_initial_device ();
>> }
>> but guess it will need slightly more work because we need to copy the value
>> to the offloading device too.
>> It can be done incrementally though.
>
> I guess this part of intelmic functionality will just have to wait later.
> There seem to be other parts of liboffloadmic that seems to need re-work,
> e.g. omp_get_num_devices() return mic_engines_total, where it should actually
> return the number of all devices (not just intelmic). omp_get_initial_device()
> returning -1 (which I don't quite understand), etc.

(I'm confirming there are such pre-existing problems with Intel MIC; I've
never looked up any details.)

> Really suggest to have intelmic support be re-worked as an offload plugin 
> inside
> libgomp, rather than floating outside by itself.

Well, it is a regular libgomp plugin, just its sources are not in
'libgomp/plugin/' and it's not built during libgomp build.  Are you
suggesting just to move it into 'libgomp/plugin/'?  This may need some
more complicated setup because of its 'liboffloadmic' dependency?


>>> --- a/libgomp/libgomp-plugin.h
>>> +++ b/libgomp/libgomp-plugin.h
>>> @@ -102,6 +102,12 @@ struct addr_pair
>>> uintptr_t end;
>>>   };
>>>
>>> +/* This symbol is to name a target side variable that holds the designated
>>> +   'device number' of the target device. The symbol needs to be available 
>>> to
>>> +   libgomp code and the  offload plugin (which in the latter case must be
>>> +   stringified).  */
>>> +#define GOMP_DEVICE_NUM_VAR __gomp_device_num
>>
>> For a single var it is acceptable (though, please avoid the double space
>> before offload plugin in the comment), but once we have more than one
>> variable, I think we should simply have a struct which will contain all the
>> parameters that need to be copied from the host to the offloading device at
>> image load time (and have eventually another struct that holds parameters
>> that we'll need to copy to the device on each kernel launch, I bet some ICVs
>> will be one category, other ICVs another one).

ACK.  Also other program state, like 'fenv' or the gfortran "state blob".
This is  "Missing data/state
sharing/propagation between host and offloading devices".

> Actually, if you look at the 5.[01] specifications, omp_get_device_num() is 
> not
> defined in terms of an ICV. Maybe it conceptually ought to be, but the current
> description of "the device number of the device on which the calling thread is
> executing" is not one of the defined ICVs.
>
> It looks like there will eventually be some kind of ICV block handled in a 
> similar
> way, but I think that the modifications will be straightforward then. For now,
> I think it's okay for GOMP_DEVICE_NUM_VAR to just be a normal global variable.

There is, by the way, precedent for that:
'libgomp/config/nvptx/time.c:double __nvptx_clocktick', set up in
'libgomp/plugin/plugin-nvptx.c:nvptx_set_clocktick' ('cuModuleGetGlobal'
to get the device address, followed by 'cuMemcpyHtoD'), invoked from
'libgomp/plugin/plugin-nvptx.c:GOMP_OFFLOAD_load_image', quite simple.

For the case discussed here, we're now adding more complex
'other_count'/'other_entries'/'num_others' bookkeeping.  (Great that all
of the plugins plus 'libgomp/target.c' invented their own terminology...)
;-)

> --- a/libgomp/plugin/plugin-gcn.c
> +++ b/libgomp/plugin/plugin-gcn.c

> @@ -3305,6 +3306,7 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, 
> const void *target_data,
>struct kernel_info *kernel;
>int kernel_count = image_desc->kernel_count;
>unsigned var_count = image_desc->global_variable_count;
> +  int other_count = 1;
>
>agent = get_agent_info (ord);
>if (!agent)
> @@ -3321,7 +3323,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, 
> const void *target_data,
>
>GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
>GCN_DEBUG ("Encountered %u global variables in an image\n", 

[PATCH 8/8] aarch64: Add -mtune=neoverse-512tvb

2021-08-03 Thread Richard Sandiford via Gcc-patches
This patch adds an option to tune for Neoverse cores that have
a total vector bandwidth of 512 bits (4x128 for Advanced SIMD
and a vector-length-dependent equivalent for SVE).  This is intended
to be a compromise between tuning aggressively for a single core like
Neoverse V1 (which can be too narrow) and tuning for AArch64 cores
in general (which can be too wide).

-mcpu=neoverse-512tvb is equivalent to -mcpu=neoverse-v1
-mtune=neoverse-512tvb.

gcc/
* doc/invoke.texi: Document -mtune=neoverse-512tvb and
-mcpu=neoverse-512tvb.
* config/aarch64/aarch64-cores.def (neoverse-512tvb): New entry.
* config/aarch64/aarch64-tune.md: Regenerate.
* config/aarch64/aarch64.c (neoverse512tvb_sve_vector_cost)
(neoverse512tvb_sve_issue_info, neoverse512tvb_vec_issue_info)
(neoverse512tvb_vector_cost, neoverse512tvb_tunings): New structures.
(aarch64_adjust_body_cost_sve): Handle -mtune=neoverse-512tvb.
(aarch64_adjust_body_cost): Likewise.
---
 gcc/config/aarch64/aarch64-cores.def |   1 +
 gcc/config/aarch64/aarch64-tune.md   |   2 +-
 gcc/config/aarch64/aarch64.c | 184 ++-
 gcc/doc/invoke.texi  |  27 +++-
 4 files changed, 202 insertions(+), 12 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-cores.def 
b/gcc/config/aarch64/aarch64-cores.def
index de8fe9bc09b..b2aa1670561 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -139,6 +139,7 @@ AARCH64_CORE("thunderx3t110",  thunderx3t110,  
thunderx3t110, 8_3A,  AARCH64_FL_
 /* Arm ('A') cores.  */
 AARCH64_CORE("zeus", zeus, cortexa57, 8_4A,  AARCH64_FL_FOR_ARCH8_4 | 
AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | AARCH64_FL_BF16 | 
AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | AARCH64_FL_RNG, 
neoversev1, 0x41, 0xd40, -1)
 AARCH64_CORE("neoverse-v1", neoversev1, cortexa57, 8_4A,  
AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | 
AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | 
AARCH64_FL_RNG, neoversev1, 0x41, 0xd40, -1)
+AARCH64_CORE("neoverse-512tvb", neoverse512tvb, cortexa57, 8_4A,  
AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_SVE | AARCH64_FL_RCPC | AARCH64_FL_I8MM | 
AARCH64_FL_BF16 | AARCH64_FL_F16 | AARCH64_FL_PROFILE | AARCH64_FL_SSBS | 
AARCH64_FL_RNG, neoverse512tvb, INVALID_IMP, INVALID_CORE, -1)
 
 /* Qualcomm ('Q') cores. */
 AARCH64_CORE("saphira", saphira,saphira,8_4A,  
AARCH64_FL_FOR_ARCH8_4 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC, saphira,   0x51, 
0xC01, -1)
diff --git a/gcc/config/aarch64/aarch64-tune.md 
b/gcc/config/aarch64/aarch64-tune.md
index af66c111da2..e491c29d31a 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82"
+   
"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,neoversen2,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82"
(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 19625eb048d..f80de2ca897 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1842,6 +1842,136 @@ static const struct tune_params neoversev1_tunings =
   &generic_prefetch_tune
 };
 
+static const sve_vec_cost neoverse512tvb_sve_vector_cost =
+{
+  {
+2, /* int_stmt_cost  */
+2, /* fp_stmt_cost  */
+4, /* ld2_st2_permute_cost  */
+5, /* ld3_st3_permute_cost  */
+5, /* ld4_st4_permute_cost  */
+3, /* permute_cost  */
+/* Theoretically, a reduction involving 15 scalar ADDs could
+   complete in ~5 cycles and would have a cost 

[PATCH 7/8] aarch64: Restrict issue heuristics to inner vector loop

2021-08-03 Thread Richard Sandiford via Gcc-patches
The AArch64 vector costs try to take issue rates into account.
However, when vectorising an outer loop, we lumped the inner
and outer operations together, which is somewhat meaningless.
This patch restricts the heuristic to the inner loop.

gcc/
* config/aarch64/aarch64.c (aarch64_add_stmt_cost): Only
record issue information for operations that occur in the
innermost loop.
---
 gcc/config/aarch64/aarch64.c | 14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 19045ef6944..19625eb048d 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15392,6 +15392,10 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
   fractional_cost stmt_cost
= aarch64_builtin_vectorization_cost (kind, vectype, misalign);
 
+  bool in_inner_loop_p = (where == vect_body
+ && stmt_info
+ && stmt_in_inner_loop_p (vinfo, stmt_info));
+
   /* Do one-time initialization based on the vinfo.  */
   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
@@ -15438,14 +15442,15 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
  stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
stmt_cost);
 
- /* If we're recording a nonzero vector loop body cost, also estimate
-the operations that would need to be issued by all relevant
-implementations of the loop.  */
+ /* If we're recording a nonzero vector loop body cost for the
+innermost loop, also estimate the operations that would need
+to be issued by all relevant implementations of the loop.  */
  auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
  if (loop_vinfo
  && issue_info
  && costs->vec_flags
  && where == vect_body
+ && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
  && vectype
  && stmt_cost != 0)
{
@@ -15489,8 +15494,7 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
   /* Statements in an inner loop relative to the loop being
 vectorized are weighted more heavily.  The value here is
 arbitrary and could potentially be improved with analysis.  */
-  if (where == vect_body && stmt_info
- && stmt_in_inner_loop_p (vinfo, stmt_info))
+  if (in_inner_loop_p)
{
  gcc_assert (loop_vinfo);
  count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /*  FIXME  */


[PATCH 6/8] aarch64: Tweak MLA vector costs

2021-08-03 Thread Richard Sandiford via Gcc-patches
The issue-based vector costs currently assume that a multiply-add
sequence can be implemented using a single instruction.  This is
generally true for scalars (which have a 4-operand instruction)
and SVE (which allows the output to be tied to any input).
However, for Advanced SIMD, multiplying two values and adding
an invariant will end up being a move and an MLA.

The only target to use the issue-based vector costs is Neoverse V1,
which would generally prefer SVE in this case anyway.  I therefore
don't have a self-contained testcase.  However, the distinction
becomes more important with a later patch.

gcc/
* config/aarch64/aarch64.c (aarch64_multiply_add_p): Add a vec_flags
parameter.  Detect cases in which an Advanced SIMD MLA would almost
certainly require a MOV.
(aarch64_count_ops): Update accordingly.
---
 gcc/config/aarch64/aarch64.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 084f8caa0da..19045ef6944 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14767,9 +14767,12 @@ aarch64_integer_truncation_p (stmt_vec_info stmt_info)
 
 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
or multiply-subtract sequence that might be suitable for fusing into a
-   single instruction.  */
+   single instruction.  If VEC_FLAGS is zero, analyze the operation as
+   a scalar one, otherwise analyze it as an operation on vectors with those
+   VEC_* flags.  */
 static bool
-aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info)
+aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
+   unsigned int vec_flags)
 {
   gassign *assign = dyn_cast (stmt_info->stmt);
   if (!assign)
@@ -14797,6 +14800,22 @@ aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info 
stmt_info)
   if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
continue;
 
+  if (vec_flags & VEC_ADVSIMD)
+   {
+ /* Scalar and SVE code can tie the result to any FMLA input (or none,
+although that requires a MOVPRFX for SVE).  However, Advanced SIMD
+only supports MLA forms, so will require a move if the result
+cannot be tied to the accumulator.  The most important case in
+which this is true is when the accumulator input is invariant.  */
+ rhs = gimple_op (assign, 3 - i);
+ if (TREE_CODE (rhs) != SSA_NAME)
+   return false;
+ def_stmt_info = vinfo->lookup_def (rhs);
+ if (!def_stmt_info
+ || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
+   return false;
+   }
+
   return true;
 }
   return false;
@@ -15232,7 +15251,7 @@ aarch64_count_ops (class vec_info *vinfo, 
aarch64_vector_costs *costs,
 }
 
   /* Assume that multiply-adds will become a single operation.  */
-  if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info))
+  if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
 return;
 
   /* When costing scalar statements in vector code, the count already


[PATCH 5/8] aarch64: Tweak the cost of elementwise stores

2021-08-03 Thread Richard Sandiford via Gcc-patches
When the vectoriser scalarises a strided store, it counts one
scalar_store for each element plus one vec_to_scalar extraction
for each element.  However, extracting element 0 is free on AArch64,
so it should have zero cost.

I don't have a testcase that requires this for existing -mtune
options, but it becomes more important with a later patch.

gcc/
* config/aarch64/aarch64.c (aarch64_is_store_elt_extraction): New
function, split out from...
(aarch64_detect_vector_stmt_subtype): ...here.
(aarch64_add_stmt_cost): Treat extracting element 0 as free.
---
 gcc/config/aarch64/aarch64.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 36f11808916..084f8caa0da 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14622,6 +14622,18 @@ aarch64_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
 }
 }
 
+/* Return true if an operation of kind KIND for STMT_INFO represents
+   the extraction of an element from a vector in preparation for
+   storing the element to memory.  */
+static bool
+aarch64_is_store_elt_extraction (vect_cost_for_stmt kind,
+stmt_vec_info stmt_info)
+{
+  return (kind == vec_to_scalar
+ && STMT_VINFO_DATA_REF (stmt_info)
+ && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)));
+}
+
 /* Return true if STMT_INFO represents part of a reduction.  */
 static bool
 aarch64_is_reduction (stmt_vec_info stmt_info)
@@ -14959,9 +14971,7 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, 
vect_cost_for_stmt kind,
   /* Detect cases in which vec_to_scalar is describing the extraction of a
  vector element in preparation for a scalar store.  The store itself is
  costed separately.  */
-  if (kind == vec_to_scalar
-  && STMT_VINFO_DATA_REF (stmt_info)
-  && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
+  if (aarch64_is_store_elt_extraction (kind, stmt_info))
 return simd_costs->store_elt_extra_cost;
 
   /* Detect SVE gather loads, which are costed as a single scalar_load
@@ -15382,6 +15392,12 @@ aarch64_add_stmt_cost (class vec_info *vinfo, void 
*data, int count,
  if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype))
costs->saw_sve_only_op = true;
 
+ /* If we scalarize a strided store, the vectorizer costs one
+vec_to_scalar for each element.  However, we can store the first
+element using an FP store without a separate extract step.  */
+ if (aarch64_is_store_elt_extraction (kind, stmt_info))
+   count -= 1;
+
  stmt_cost = aarch64_detect_scalar_stmt_subtype
(vinfo, kind, stmt_info, stmt_cost);
 


[PATCH 4/8] aarch64: Add gather_load_xNN_cost tuning fields

2021-08-03 Thread Richard Sandiford via Gcc-patches
This patch adds tuning fields for the total cost of a gather load
instruction.  Until now, we've costed them as one scalar load
per element instead.  Those scalar_load-based values are also
what the patch uses to fill in the new fields for existing
cost structures.

gcc/
* config/aarch64/aarch64-protos.h (sve_vec_cost):
Add gather_load_x32_cost and gather_load_x64_cost.
* config/aarch64/aarch64.c (generic_sve_vector_cost)
(a64fx_sve_vector_cost, neoversev1_sve_vector_cost): Update
accordingly, using the values given by the scalar_load * number
of elements calculation that we used previously.
(aarch64_detect_vector_stmt_subtype): Use the new fields.
---
 gcc/config/aarch64/aarch64-protos.h |  9 +
 gcc/config/aarch64/aarch64.c| 19 +++
 2 files changed, 28 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index fb4ce8e9f84..b91eeeba101 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -259,12 +259,16 @@ struct sve_vec_cost : simd_vec_cost
  unsigned int fadda_f16_cost,
  unsigned int fadda_f32_cost,
  unsigned int fadda_f64_cost,
+ unsigned int gather_load_x32_cost,
+ unsigned int gather_load_x64_cost,
  unsigned int scatter_store_elt_cost)
 : simd_vec_cost (base),
   clast_cost (clast_cost),
   fadda_f16_cost (fadda_f16_cost),
   fadda_f32_cost (fadda_f32_cost),
   fadda_f64_cost (fadda_f64_cost),
+  gather_load_x32_cost (gather_load_x32_cost),
+  gather_load_x64_cost (gather_load_x64_cost),
   scatter_store_elt_cost (scatter_store_elt_cost)
   {}
 
@@ -279,6 +283,11 @@ struct sve_vec_cost : simd_vec_cost
   const int fadda_f32_cost;
   const int fadda_f64_cost;
 
+  /* The cost of a gather load instruction.  The x32 value is for loads
+ of 32-bit elements and the x64 value is for loads of 64-bit elements.  */
+  const int gather_load_x32_cost;
+  const int gather_load_x64_cost;
+
   /* The per-element cost of a scatter store.  */
   const int scatter_store_elt_cost;
 };
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b14b6f22aec..36f11808916 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -675,6 +675,8 @@ static const sve_vec_cost generic_sve_vector_cost =
   2, /* fadda_f16_cost  */
   2, /* fadda_f32_cost  */
   2, /* fadda_f64_cost  */
+  4, /* gather_load_x32_cost  */
+  2, /* gather_load_x64_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
@@ -744,6 +746,8 @@ static const sve_vec_cost a64fx_sve_vector_cost =
   13, /* fadda_f16_cost  */
   13, /* fadda_f32_cost  */
   13, /* fadda_f64_cost  */
+  64, /* gather_load_x32_cost  */
+  32, /* gather_load_x64_cost  */
   1 /* scatter_store_elt_cost  */
 };
 
@@ -1739,6 +1743,8 @@ static const sve_vec_cost neoversev1_sve_vector_cost =
   19, /* fadda_f16_cost  */
   11, /* fadda_f32_cost  */
   8, /* fadda_f64_cost  */
+  32, /* gather_load_x32_cost  */
+  16, /* gather_load_x64_cost  */
   3 /* scatter_store_elt_cost  */
 };
 
@@ -14958,6 +14964,19 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, 
vect_cost_for_stmt kind,
   && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
 return simd_costs->store_elt_extra_cost;
 
+  /* Detect SVE gather loads, which are costed as a single scalar_load
+ for each element.  We therefore need to divide the full-instruction
+ cost by the number of elements in the vector.  */
+  if (kind == scalar_load
+  && sve_costs
+  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+{
+  unsigned int nunits = vect_nunits_for_cost (vectype);
+  if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
+   return { sve_costs->gather_load_x64_cost, nunits };
+  return { sve_costs->gather_load_x32_cost, nunits };
+}
+
   /* Detect cases in which a scalar_store is really storing one element
  in a scatter operation.  */
   if (kind == scalar_store


[PATCH 3/8] aarch64: Split out aarch64_adjust_body_cost_sve

2021-08-03 Thread Richard Sandiford via Gcc-patches
This patch splits the SVE-specific part of aarch64_adjust_body_cost
out into its own subroutine, so that a future patch can call it
more than once.  I wondered about using a lambda to avoid having
to pass all the arguments, but in the end this way seemed clearer.

gcc/
* config/aarch64/aarch64.c (aarch64_adjust_body_cost_sve): New
function, split out from...
(aarch64_adjust_body_cost): ...here.
---
 gcc/config/aarch64/aarch64.c | 220 ---
 1 file changed, 127 insertions(+), 93 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 17fcb34b2c8..b14b6f22aec 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15488,6 +15488,126 @@ aarch64_estimate_min_cycles_per_iter
   return cycles;
 }
 
+/* Subroutine of aarch64_adjust_body_cost for handling SVE.
+   Use ISSUE_INFO to work out how fast the SVE code can be issued and compare
+   it to the equivalent value for scalar code (SCALAR_CYCLES_PER_ITER).
+   If COULD_USE_ADVSIMD is true, also compare it to the issue rate of
+   Advanced SIMD code (ADVSIMD_CYCLES_PER_ITER).
+
+   COSTS is as for aarch64_adjust_body_cost.  ORIG_BODY_COST is the cost
+   originally passed to aarch64_adjust_body_cost and *BODY_COST is the current
+   value of the adjusted cost.  *SHOULD_DISPARAGE is true if we think the loop
+   body is too expensive.  */
+
+static fractional_cost
+aarch64_adjust_body_cost_sve (const aarch64_vector_costs *costs,
+ const aarch64_vec_issue_info *issue_info,
+ fractional_cost scalar_cycles_per_iter,
+ fractional_cost advsimd_cycles_per_iter,
+ bool could_use_advsimd,
+ unsigned int orig_body_cost,
+ unsigned int *body_cost,
+ bool *should_disparage)
+{
+  /* Estimate the minimum number of cycles per iteration needed to issue
+ non-predicate operations.  */
+  fractional_cost sve_nonpred_cycles_per_iter
+    = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
+   issue_info->sve);
+
+  /* Separately estimate the minimum number of cycles per iteration needed
+ to issue the predicate operations.  */
+  fractional_cost sve_pred_issue_cycles_per_iter
+= { costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle };
+
+  /* Calculate the overall limit on the number of cycles per iteration.  */
+  fractional_cost sve_cycles_per_iter
+= std::max (sve_nonpred_cycles_per_iter, sve_pred_issue_cycles_per_iter);
+
+  if (dump_enabled_p ())
+{
+  costs->sve_ops.dump ();
+  dump_printf_loc (MSG_NOTE, vect_location,
+  "  estimated cycles per iteration = %f\n",
+  sve_cycles_per_iter.as_double ());
+  dump_printf_loc (MSG_NOTE, vect_location,
+  "  estimated cycles per iteration for non-predicate"
+  " operations = %f\n",
+  sve_nonpred_cycles_per_iter.as_double ());
+  if (costs->sve_ops.pred_ops)
+   dump_printf_loc (MSG_NOTE, vect_location, "  estimated cycles per"
+" iteration for predicate operations = %d\n",
+sve_pred_issue_cycles_per_iter.as_double ());
+}
+
+  /* If the scalar version of the loop could issue at least as
+ quickly as the predicate parts of the SVE loop, make the SVE loop
+ prohibitively expensive.  In this case vectorization is adding an
+ overhead that the original scalar code didn't have.
+
+ This is mostly intended to detect cases in which WHILELOs dominate
+ for very tight loops, which is something that normal latency-based
+ costs would not model.  Adding this kind of cliffedge would be
+ too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
+ code in the caller handles that case in a more conservative way.  */
+  fractional_cost sve_estimate = sve_pred_issue_cycles_per_iter + 1;
+  if (scalar_cycles_per_iter < sve_estimate)
+{
+  unsigned int min_cost
+   = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
+  if (*body_cost < min_cost)
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"Increasing body cost to %d because the"
+" scalar code could issue within the limit"
+" imposed by predicate operations\n",
+min_cost);
+ *body_cost = min_cost;
+ *should_disparage = true;
+   }
+}
+
+  /* If it appears that the Advanced SIMD version of a loop could issue
+ more quickly than the SVE one, increase the SVE cost in proportion
+ to the difference.  The intention is to make Advanced SIMD preferable
+ in cases where an Advanced SIMD version 

[PATCH 2/8] aarch64: Add a simple fixed-point class for costing

2021-08-03 Thread Richard Sandiford via Gcc-patches
This patch adds a simple fixed-point class for holding fractional
cost values.  It can exactly represent the reciprocal of any
single-vector SVE element count (including the non-power-of-2 ones).
This means that it can also hold 1/N for all N in [1, 16], which should
be enough for the various *_per_cycle fields.

For now the assumption is that the number of possible reciprocals
is fixed at compile time and so the class should always be able
to hold an exact value.

The class uses a uint64_t to hold the fixed-point value, which means
that it can hold any scaled uint32_t cost.  Normally we don't worry
about overflow when manipulating raw uint32_t costs, but just to be
on the safe side, the class uses saturating arithmetic for all
operations.

As far as the changes to the cost routines themselves go:

- The changes to aarch64_add_stmt_cost and its subroutines are
  just laying groundwork for future patches; no functional change
  intended.

- The changes to aarch64_adjust_body_cost mean that we now
  take fractional differences into account.

gcc/
* config/aarch64/fractional-cost.h: New file.
* config/aarch64/aarch64.c: Include <algorithm> (indirectly)
and cost_fraction.h.
(vec_cost_fraction): New typedef.
(aarch64_detect_scalar_stmt_subtype): Use it for statement costs.
(aarch64_detect_vector_stmt_subtype): Likewise.
(aarch64_sve_adjust_stmt_cost, aarch64_adjust_stmt_cost): Likewise.
(aarch64_estimate_min_cycles_per_iter): Use vec_cost_fraction
for cycle counts.
(aarch64_adjust_body_cost): Likewise.
(aarch64_test_cost_fraction): New function.
(aarch64_run_selftests): Call it.
---
 gcc/config/aarch64/aarch64.c | 179 +++-
 gcc/config/aarch64/fractional-cost.h | 236 +++
 2 files changed, 377 insertions(+), 38 deletions(-)
 create mode 100644 gcc/config/aarch64/fractional-cost.h

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1a8cd131ca2..17fcb34b2c8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -20,8 +20,9 @@
 
 #define IN_TARGET_CODE 1
 
-#include "config.h"
 #define INCLUDE_STRING
+#define INCLUDE_ALGORITHM
+#include "config.h"
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
@@ -76,6 +77,7 @@
 #include "function-abi.h"
 #include "gimple-pretty-print.h"
 #include "tree-ssa-loop-niter.h"
+#include "fractional-cost.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -14912,10 +14914,10 @@ aarch64_in_loop_reduction_latency (vec_info *vinfo, 
stmt_vec_info stmt_info,
for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
try to subdivide the target-independent categorization provided by KIND
to get a more accurate cost.  */
-static unsigned int
+static fractional_cost
 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
stmt_vec_info stmt_info,
-   unsigned int stmt_cost)
+   fractional_cost stmt_cost)
 {
   /* Detect an extension of a loaded value.  In general, we'll be able to fuse
  the extension with the load.  */
@@ -14931,11 +14933,11 @@ aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, 
vect_cost_for_stmt kind,
the target-independent categorization provided by KIND to get a more
accurate cost.  WHERE specifies where the cost associated with KIND
occurs.  */
-static unsigned int
+static fractional_cost
 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
stmt_vec_info stmt_info, tree vectype,
enum vect_cost_model_location where,
-   unsigned int stmt_cost)
+   fractional_cost stmt_cost)
 {
   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
   const sve_vec_cost *sve_costs = nullptr;
@@ -15016,10 +15018,10 @@ aarch64_detect_vector_stmt_subtype (vec_info *vinfo, 
vect_cost_for_stmt kind,
for STMT_INFO, which has cost kind KIND and which when vectorized would
operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
targets.  */
-static unsigned int
+static fractional_cost
 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
  stmt_vec_info stmt_info, tree vectype,
- unsigned int stmt_cost)
+ fractional_cost stmt_cost)
 {
   /* Unlike vec_promote_demote, vector_stmt conversions do not change the
  vector register size or number of units.  Integer promotions of this
@@ -15083,9 +15085,9 @@ aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, 
vect_cost_for_stmt kind,
 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
and which when vectorized would operate on 

[PATCH 1/8] aarch64: Turn sve_width tuning field into a bitmask

2021-08-03 Thread Richard Sandiford via Gcc-patches
The tuning structures have an sve_width field that specifies the
number of bits in an SVE vector (or SVE_NOT_IMPLEMENTED if not
applicable).  This patch turns the field into a bitmask so that
it can specify multiple widths at the same time.  For now we
always treat the minimum width as the likely width.

An alternative would have been to add extra fields, which would
have coped correctly with non-power-of-2 widths.  However,
we're very far from supporting constant non-power-of-2 vectors
in GCC, so I think the non-power-of-2 case will in reality always
have to be hidden behind VLA.

gcc/
* config/aarch64/aarch64-protos.h (tune_params::sve_width): Turn
into a bitmask.
* config/aarch64/aarch64.c (aarch64_cmp_autovec_modes): Update
accordingly.
(aarch64_estimated_poly_value): Likewise.  Use the least significant
set bit for the minimum and likely values.  Use the most significant
set bit for the maximum value.
---
 gcc/config/aarch64/aarch64-protos.h |  8 
 gcc/config/aarch64/aarch64.c| 15 ++-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index c2033387384..fb4ce8e9f84 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -506,10 +506,10 @@ struct tune_params
   const struct cpu_vector_cost *vec_costs;
   const struct cpu_branch_cost *branch_costs;
   const struct cpu_approx_modes *approx_modes;
-  /* Width of the SVE registers or SVE_NOT_IMPLEMENTED if not applicable.
- Only used for tuning decisions, does not disable VLA
- vectorization.  */
-  enum aarch64_sve_vector_bits_enum sve_width;
+  /* A bitmask of the possible SVE register widths in bits,
+ or SVE_NOT_IMPLEMENTED if not applicable.  Only used for tuning
+ decisions, does not disable VLA vectorization.  */
+  unsigned int sve_width;
   int memmov_cost;
   int issue_rate;
   unsigned int fusible_ops;
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index e2114605901..1a8cd131ca2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -19144,14 +19144,12 @@ aarch64_cmp_autovec_modes (machine_mode sve_m, 
machine_mode asimd_m)
   bool prefer_asimd = aarch64_autovec_preference == 3;
   bool prefer_sve = aarch64_autovec_preference == 4;
 
-  aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width;
-
   poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
   poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
   /* If the CPU information does not have an SVE width registered use the
  generic poly_int comparison that prefers SVE.  If a preference is
  explicitly requested avoid this path.  */
-  if (tune_width == SVE_SCALABLE
+  if (aarch64_tune_params.sve_width == SVE_SCALABLE
   && !prefer_asimd
   && !prefer_sve)
 return maybe_gt (nunits_sve, nunits_asimd);
@@ -24980,8 +24978,7 @@ aarch64_estimated_poly_value (poly_int64 val,
  poly_value_estimate_kind kind
= POLY_VALUE_LIKELY)
 {
-  enum aarch64_sve_vector_bits_enum width_source
-= aarch64_tune_params.sve_width;
+  unsigned int width_source = aarch64_tune_params.sve_width;
 
   /* If there is no core-specific information then the minimum and likely
  values are based on 128-bit vectors and the maximum is based on
@@ -24996,6 +24993,14 @@ aarch64_estimated_poly_value (poly_int64 val,
  return val.coeffs[0] + val.coeffs[1] * 15;
   }
 
+  /* Allow sve_width to be a bitmask of different VL, treating the lowest
+ as likely.  This could be made more general if future -mtune options
+ need it to be.  */
+  if (kind == POLY_VALUE_MAX)
+width_source = 1 << floor_log2 (width_source);
+  else
+width_source = least_bit_hwi (width_source);
+
   /* If the core provides width information, use that.  */
   HOST_WIDE_INT over_128 = width_source - 128;
   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;


[PATCH 0/8] aarch64 vector cost tweaks

2021-08-03 Thread Richard Sandiford via Gcc-patches
This patch series:

(1) generalises the aarch64 vector costs to allow for the final patch.
This part should be a no-op for existing tuning code.

(2) tweaks the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS code.  This currently
only affects neoverse-v1 and again helps with the final patch.

(3) adds a new -mtune=neoverse-512tvb option.  See the covering message
in the final patch for details.

Tested on aarch64-linux-gnu and applied to trunk so far.  I'll backport
to GCC 11 in a few days if there is no fallout.  The patches should be
very low risk; as mentioned, (1) should be a no-op for existing targets
and (2) simply provides minor tweaks/fixes to -mtune code that was new
to GCC 11.

Thanks,
Richard


Re: [PATCH take 2] Fold (X<<C1)|(X<<C2) to X*C3

2021-08-03 Thread Richard Biener via Gcc-patches
On Wed, Jul 28, 2021 at 2:45 PM Roger Sayle  wrote:
>
>
> Hi Marc,
>
> Thanks for the feedback.  After some quality time in gdb, I now appreciate
> that
> match.pd behaves (subtly) differently between generic and gimple, and the
> trees actually being passed to tree_nonzero_bits were not quite what I had
> expected.  Sorry for my confusion, the revised patch below is now much
> shorter
> (and my follow-up patch that was originally to tree_nonzero_bits looks like
> it
> now needs to be to get_nonzero_bits!).
>
> This revised patch has been retested on 864_64-pc-linux-gnu with a
> "make bootstrap" and "make -k check" with no new failures.
>
> Ok for mainline?

OK I think.  It would be nice if (match ...) could be used to merge
the cases, like

(match (mult_by_constant @0 @1)
 (mult @0 INTEGER_CST@1))
(match (mult_by_constant @0 (lshift { integer_one_node; } @1))
 (lshift @0 @1)
(match (mult_by_constant @0 integer_one_node)
 @0)

but (match ...) can't "mutate" the matching operands, so
at least the shift variant cannot be expressed right now,
likewise the "constant" matching operand doesn't parse.

Richard.

> 2021-07-28  Roger Sayle  
> Marc Glisse 
>
> gcc/ChangeLog
> * match.pd (bit_ior, bit_xor): Canonicalize (X*C1)|(X*C2) and
> (X*C1)^(X*C2) as X*(C1+C2), and related variants, using
> tree_nonzero_bits to ensure that operands are bit-wise disjoint.
>
> gcc/testsuite/ChangeLog
> * gcc.dg/fold-ior-4.c: New test.
>
> Roger
> --
>
> -Original Message-
> From: Marc Glisse 
> Sent: 26 July 2021 16:45
> To: Roger Sayle 
> Cc: 'GCC Patches' 
> Subject: Re: [PATCH] Fold (X<<C1)|(X<<C2) to X*C3
> On Mon, 26 Jul 2021, Roger Sayle wrote:
>
> > The one aspect that's a little odd is that each transform is paired
> > with a convert@1 variant, using the efficient match machinery to
> > expose any zero extension to fold-const.c's tree_nonzero_bits
> functionality.
>
> Copying the first transform for context
>
> +(for op (bit_ior bit_xor)
> + (simplify
> +  (op (mult:s@0 @1 INTEGER_CST@2)
> +  (mult:s@3 @1 INTEGER_CST@4))
> +  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
> +   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
> +   (mult @1
> +{ wide_int_to_tree (type, wi::to_wide (@2) + wi::to_wide (@4));
> })))
> +(simplify
> +  (op (mult:s@0 (convert@1 @2) INTEGER_CST@3)
> +  (mult:s@4 (convert@1 @2) INTEGER_CST@5))
> +  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
> +   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@4)) == 0)
> +   (mult @1
> +{ wide_int_to_tree (type, wi::to_wide (@3) + wi::to_wide (@5));
> })))
>
> Could you explain how the convert helps exactly?
>
> --
> Marc Glisse


Re: [PATCH] Add a simple fraction class

2021-08-03 Thread Richard Sandiford via Gcc-patches
Richard Biener  writes:
> On Mon, Aug 2, 2021 at 1:31 PM Richard Sandiford
>  wrote:
>>
>> Richard Biener  writes:
>> > On Mon, Aug 2, 2021 at 12:43 PM Richard Sandiford
>> >  wrote:
>> >>
>> >> Richard Biener via Gcc-patches  writes:
>> >> > On Fri, Jul 30, 2021 at 5:59 PM Richard Sandiford via Gcc-patches
>> >> >  wrote:
>> >> >>
>> >> >> This patch adds a simple class for holding A/B fractions.
>> >> >> As the comments in the patch say, the class isn't designed
>> >> >> to have nice numerical properties at the extremes.
>> >> >>
>> >> >> The motivating use case was some aarch64 costing work,
>> >> >> where being able to represent fractions was much easier
>> >> >> than using single integers and avoided the rounding errors
>> >> >> that would come with using floats.  (Unlike things like
>> >> >> COSTS_N_INSNS, there was no sensible constant base factor
>> >> >> that could be used.)
>> >> >>
>> >> >> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>> >> >
>> >> > Hmm, we use the sreal type for profiles.  I don't see any 
>> >> > overflow/underflow
>> >> > handling in your class - I suppose you're going to use it on integer 
>> >> > types
>> >> > given we're not allowed to use native FP?
>> >>
>> >> Yeah, I'm going to use it on integer types.  And it's not designed
>> >> to have nice properties at extremes, including handling underflow and
>> >> overflow.
>> >
>> > So maybe assert that it doesn't?  In particular nominator/denominator
>> > are prone to overflowing in fractional representations.
>> >
>> > There's the option to round or ICE.  Or rather than the only option
>> > is to round (or use a more expensive arbitrary precision representation).
>>
>> Yeah, I guess we could do that, but it semes inconsistent to assert
>> for these costs and not do it for vector costs in general.  I think it's
>> difficult to guarantee that there is no user input for which the current
>> vector costs overflow.  And if we assert, we have to have a reason for
>> believing that no such user input exists (modulo bugs).
>>
>> E.g. vect-inner-loop-cost-factor has an upper limit of 999999, so the
>> existing code only needs a cost of 2148 to overflow “int”.
>
> I'd argue those are of course bugs.  The 999999 upper bound is way
> too big given REG_BR_PROB_BASE is only 10000.  But then we're now
> set up to initialize vinfo->inner_loop_cost_factor based on profile data
> (if it is reliable).
>
>> > So the question is whether the fractional behavior is better in more
>> > cases than the sreal behavior (I can easily believe it is).
>> >
>> >> I want to use it in costing code, where we already happily multiply
>> >> and add “int”-sized costs without worrying about overflow.  I'll be
>> >> using uint64_t for the fractions though, just in case. :-)
>> >>
>> >> sreal doesn't help because it's still significand/exponent.  That matters
>> >> because…
>> >>
>> >> > I mean, how exactly does
>> >> > the class solve the problem of rounding errors?
>> >>
>> >> …I wanted something that represented the results exactly (barring any of
>> >> integer ops overflowing).  This makes it meaningful to compare costs for
>> >> equality.  It also means we can use ordered comparisons without having
>> >> to introduce a fudge factor to cope with one calculation having different
>> >> intermediate rounding from the other.
>> >
>> > I think you're underestimating how quickly your denominator will overflow?
>>
>> Well, it depends on how you use it. :-)  I agree you have to go into
>> this knowing the risks of the representation (but then I'd argue that's
>> true for floats/sreals too, if you use them for costs).
>
> Yeah, and sreals handle overflow/underflow in a well-defined way because
> profile info tends to be crap ;)
>
>> > So I suppose all factors of all possible denominators are known, in fact
>> > whats your main source for the divisions?  The VF?
>>
>> Yeah, the set of possible denominators is fixed at compile time and
>> relatively small, but not easily enumerable.  The VF is one source,
>> but we also have “number of X per cycle” values.  The problem with sreal
>> is that sometimes those “X per cycle” values are 3, and 1/3 is where the
>> rounding problems with floats/sreals start to come in.
>>
>> I'm fairly sure that using a uint64_t fractional representation for
>> int costs and these set of denominator values is safe.  But if we
>> think that this is just too dangerous to advertise as a general
>> class within GCC, we could make it local to the aarch64 cost code
>> instead.  Would that be OK?
>
> I think we should instead make its use safe, that is, simply round when
> the denominator gets too big.  The gcd compute is already expensive
> and so is the division, I suppose a practical way would be to use
> uint32 for the representation and [u]int64 for the intermediate compute?
>
> One could put extra debugging that dumps to the active dumpfile
> whenever this happens as well (but likely with a editable #define,
> disabled by 

Re: [PATCH v5] <x86gprintrin.h>: Add pragma GCC target("general-regs-only")

2021-08-03 Thread Richard Biener via Gcc-patches
On Sun, Jul 18, 2021 at 3:46 AM H.J. Lu  wrote:
>
> On Thu, Apr 22, 2021 at 7:30 AM Richard Biener via Gcc-patches
>  wrote:
> >
> > On Thu, Apr 22, 2021 at 2:52 PM Richard Biener
> >  wrote:
> > >
> > > On Thu, Apr 22, 2021 at 2:22 PM Jakub Jelinek  wrote:
> > > >
> > > > On Thu, Apr 22, 2021 at 01:23:20PM +0200, Richard Biener via 
> > > > Gcc-patches wrote:
> > > > > > The question is if the pragma GCC target right now behaves 
> > > > > > incrementally
> > > > > > or not, whether
> > > > > > #pragma GCC target("avx2")
> > > > > > adds -mavx2 to options if it was missing before and nothing 
> > > > > > otherwise, or if
> > > > > > it switches other options off.  If it is incremental, we could e.g. 
> > > > > > try to
> > > > > > use the second least significant bit of global_options_set.x_* to 
> > > > > > mean
> > > > > > this option has been set explicitly by some surrounding #pragma GCC 
> > > > > > target.
> > > > > > The normal tests - global_options_set.x_flag_whatever could still 
> > > > > > work
> > > > > > fine because they wouldn't care if the option was explicit from 
> > > > > > anywhere
> > > > > > (command line or GCC target or target attribute) and just & 2 would 
> > > > > > mean
> > > > > > it was explicit from pragma GCC target; though there is the case of
> > > > > > bitfields... And then the inlining decision could check the & 2 
> > > > > > flags to
> > > > > > see what is required and what is just from command line.
> > > > > > Or we can have some other pragma GCC that would be like target but 
> > > > > > would
> > > > > > have flags that are explicit (and could e.g. be more restricted, to 
> > > > > > ISA
> > > > > > options only, and let those use in addition to #pragma GCC target.
> > > > >
> > > > > I'm still curious as to what you think will break if always-inline 
> > > > > does what
> > > > > it is documented to do.
> > > >
> > > > We will silently accept calling intrinsics that must be used only in 
> > > > certain
> > > > ISA contexts, which will lead to people writing non-portable code.
> > > >
> > > > So -O2 -mno-avx
> > > > #include <immintrin.h>
> > > >
> > > > void
> > > > foo (__m256 *x)
> > > > {
> > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > > }
> > > > etc. will now be accepted when it shouldn't be.
> > > > clang rejects it like gcc with:
> > > > 1.c:6:10: error: always_inline function '_mm256_sub_ps' requires target 
> > > > feature 'avx', but would be inlined into function 'foo' that is 
> > > > compiled without support for 'avx'
> > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > >  ^
> > > >
> > > > Note, if I do:
> > > > #include <immintrin.h>
> > > >
> > > > __attribute__((target ("no-sse3"))) void
> > > > foo (__m256 *x)
> > > > {
> > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > > }
> > > > and compile
> > > > clang -S -O2 -mavx2 1.c
> > > > 1.c:6:10: error: always_inline function '_mm256_sub_ps' requires target 
> > > > feature 'avx', but would be inlined into function 'foo' that is 
> > > > compiled without support for 'avx'
> > > >   x[0] = _mm256_sub_ps (x[1], x[2]);
> > > >  ^
> > > > then from the error message it seems that unlike GCC, clang remembers
> > > > the exact target features that are needed for the intrinsics and checks 
> > > > just
> > > > those.
> > > > Though, looking at the preprocessed source, seems it uses
> > > > static __inline __m256 __attribute__((__always_inline__, __nodebug__, 
> > > > __target__("avx"), __min_vector_width__(256)))
> > > > _mm256_sub_ps(__m256 __a, __m256 __b)
> > > > {
> > > >   return (__m256)((__v8sf)__a-(__v8sf)__b);
> > > > }
> > > > and not target pragmas.
> > > >
> > > > Anyway, if we tweak our intrinsic headers so that
> > > > -#ifndef __AVX__
> > > >  #pragma GCC push_options
> > > >  #pragma GCC target("avx")
> > > > -#define __DISABLE_AVX__
> > > > -#endif /* __AVX__ */
> > > >
> > > > ...
> > > > -#ifdef __DISABLE_AVX__
> > > > -#undef __DISABLE_AVX__
> > > >  #pragma GCC pop_options
> > > > -#endif /* __DISABLE_AVX__ */
> > > > and do the opts_set->x_* & 2 stuff on explicit options coming out of
> > > > target/optimize pragmas and attributes, perhaps we don't even need
> > > > to introduce a new attribute and can handle everything magically:
> >
> > Oh, and any such changes will likely interact with Martins ideas to rework
> > how optimize and target attributes work (aka adding ontop of the
> > commandline options).  That is, attribute target will then not be enough
> > to remember the exact set of needed ISA features (as opposed to what
> > likely clang implements?)
> >
> > > > 1) if it is gnu_inline extern inline, allow indirect calls, otherwise
> > > > disallow them for always_inline functions
> > >
> > > There are a lot of intrinsics using extern inline __gnu_inline though...
> > >
> > > > 2) for the isa flags and option mismatches, only disallow opts_set->x_* 
> > > > & 2
> > > > stuff
> > > > This will keep both intrinsics and glibc fortify macros working fine
> > > > in all the needed use 

Re: Re: [PATCH] Fix ICE when mixing VLAs and statement expressions [PR91038]

2021-08-03 Thread Martin Uecker



Am Dienstag, den 03.08.2021, 11:26 +0200 schrieb Richard Biener:
> On Tue, Aug 3, 2021 at 10:28 AM Martin Uecker 
> wrote:
> > 
> > Hi
> > Am Dienstag, den 03.08.2021, 10:10 +0200 schrieb Richard Biener:
> > > On Tue, Aug 3, 2021 at 7:32 AM Martin Uecker 
> > > wrote:
> > > > 
> > > > (resending from a different account, as emails seem to do not
> > > > go out from my other account at this time)
> > > > 
> > > > Am Montag, den 02.08.2021, 16:05 +0200 schrieb Martin Uecker:
> > > > > > On Sun, Aug 1, 2021 at 7:37 PM Uecker, Martin
> > > > > >  wrote:
> > > > > > > Here is an attempt to fix some old and annoying bugs
> > > > > > > related
> > > > > > > to VLAs and statement expressions. In particulary, this
> > > > > > > seems
> > > > > > > to fix the issues with variably-modified types which are
> > > > > > > returned from statement expressions (which works on
> > > > > > > clang),
> > > > > > > but there are still bugs remaining related to structs
> > > > > > > with VLA members (which seems to be a FE bug).
> > > > > > > 
> > > > > > > Of course, I might be doing something stupid...
> > > > > > 
> > > > > > How's evaluation order of (f())[g()] defined (with f
> > > > > > returning
> > > > > > a
> > > > > > pointer)?
> > > > > > Isn't that just f() + g()*sizeof(int) and thus undefined?
> > > > > 
> > > > > Yes, in C it is
> > > > > 
> > > > > f() + g()
> > > > > 
> > > > > and it is unsequenced. But the order of 'f' and 'g'
> > > > > is not relevant here and also the patch does not change
> > > > > it (the base expression is gimplified before the index).
> > > > > 
> > > > > Essentially, we have
> > > > > 
> > > > > ({ ... }) + g() * sizeof(X)
> > > > > 
> > > > > where X refers to a declaration in the statement expression.
> > > > > Without the patch the size expressions are gimplified before
> > > > > the base expression and also before the index expression.
> > > > > With the patch the ({ ... }) is gimplified also before the
> > > > > size expression.
> > > > > 
> > > > > > If it's undefined then I think the incoming GENERIC is ill-
> > > > > > defined.
> > > > > 
> > > > > I think it is OK because the arguments are evaluated
> > > > > before the operation.  Without the patch, parts of the
> > > > > operation (the size expressions) are gimplified before
> > > > > the arguments and this seems wrong to me.
> > > 
> > > But you said the evaluation order is undefined.
> > 
> > The evaluation order of the two arguments (base
> > and index) is undefined.  But the operation itself has
> > to happen after the arguments are evaluated like
> > the call to a is sequenced before f and g:
> > 
> > a(f(), g())
> > 
> > 
> > Computing the correct step size in the pointer
> > arithmetic is part of the operation itself and not
> > part of the evaluation of the arguments.
> > 
> > The problem here is that this part of the operation
> > is done before the arguments are evaluated, which
> > is a compiler bug.
> 
> Yes, but the bug is IMHO in the C frontend which inserts the
> DECL_EXPR at a wrong spot, not making sure it is evaluated before it
> is used.  Working around this deficiency in the gimplifier sounds
> incorrect.

The size if part of the type of the value which is 
returned from the compound  expression. 

So that there is a declaration involved was maybe
misleading in my example and explanation.  

So let me try again:

The size *expression* needs be be computed inside the 
statement expression (also because of side effects)
and then the result of this computation needs to 
returned together (as part of) the value returned 
by the statement expression. 

But the compilers tries to gimplify the size expression 
that  comes with the value already before the statement 
expression is evaluated. This can not work, no matter
what the front end does.

> 
> Does the same issue arise with writing the testcases as
> 
>  ({ ... }) + i;
> 
> ?  How can we fix it then if you also need to support
> 
>  i + ({ ...});
> 
> 
> ?

This already works correctly. I assume that here
the gimplification is already done in the right
order.

But ARRAY_REF is handled as a separate case and
there it is wrong.

Martin

> 
> > > So IMHO the GENERIC is undefined in evaluating the size of sth
> > > that's not live?
> > > 
> > >  That said, given the statement expression
> > > result undergoes array to pointer decay doesn't this pointer
> > > refer to an object that ended its lifetime?
> > > "In a statement expression, any temporaries created within a
> > > statement are destroyed at that statement's end."
> > > That is, don't the testcases all invoke undefined behavior at
> > > runtime?
> > 
> > This is true for one of the test cases (where not having
> > an ICE is then
> > a QoI issue), but not for the others
> > where the object is allocated by
> > malloc and a pointer
> > to the object is returned from the statement
> > expression.
> > This is supposed to work.
> > 
> > 
> > Martin
> > 
> > 
> > 



Re: [PATCH, v2, libgomp, OpenMP 5.0] Implement omp_get_device_num

2021-08-03 Thread Jakub Jelinek via Gcc-patches
On Mon, Aug 02, 2021 at 09:10:57PM +0800, Chung-Lin Tang wrote:
> > I think this won't work properly with the intel MIC offload, where the host
> > libgomp is used in the offloaded code.
> > For omp_is_initial_device, the plugin solves it by:
> > liboffloadmic/plugin/offload_target_main.cpp
> > overriding it:
> > /* Override the corresponding functions from libgomp.  */
> > extern "C" int
> > omp_is_initial_device (void) __GOMP_NOTHROW
> > {
> >return 0;
> > }
> > extern "C" int32_t
> > omp_is_initial_device_ (void)
> > {
> >return omp_is_initial_device ();
> > }
> > but guess it will need slightly more work because we need to copy the value
> > to the offloading device too.
> > It can be done incrementally though.
> 
> I guess this part of intelmic functionality will just have to wait later.
> There seem to be other parts of liboffloadmic that seems to need re-work,
> e.g. omp_get_num_devices() returning mic_engines_total, where it should actually
> return the number of all devices (not just intelmic). omp_get_initial_device()
> returning -1 (which I don't quite understand), etc.

For omp_get_num_devices() the standard says:
When called from within a target region the effect of this routine is 
unspecified.
Ditto for omp_get_initial_device and various other routines.
So it is UB if those functions are called in offloaded regions.

> > For a single var it is acceptable (though, please avoid the double space
> > before offload plugin in the comment), but once we have more than one
> > variable, I think we should simply have a struct which will contain all the
> > parameters that need to be copied from the host to the offloading device at
> > image load time (and have eventually another struct that holds parameters
> > that we'll need to copy to the device on each kernel launch, I bet some ICVs
> > will be one category, other ICVs another one).
> 
> Actually, if you look at the 5.[01] specifications, omp_get_device_num() is 
> not
> defined in terms of an ICV. Maybe it conceptually ought to be, but the current
> description of "the device number of the device on which the calling thread is
> executing" is not one if the defined ICVs.
> 
> It looks like there will eventually be some kind of ICV block handled in a 
> similar
> way, but I think that the modifications will be straightforward then. For now,
> I think it's okay for GOMP_DEVICE_NUM_VAR to just be a normal global variable.

Yeah, it is ok for now, but even for the below mentioned omp_display_env
we'll need to replace it...

> There is a new function check_effective_target_offload_target_intelmic() in
> testsuite/lib/libgomp.exp, used to test for non-intelmic offloading 
> situations.
> 
> Re-tested with no regressions, seeking approval for trunk.
> 
> Thanks,
> Chung-Lin
> 
> 2021-08-02  Chung-Lin Tang  
> 
> libgomp/ChangeLog
> 
>   * icv-device.c (omp_get_device_num): New API function, host side.
>   * fortran.c (omp_get_device_num_): New interface function.
>   * libgomp-plugin.h (GOMP_DEVICE_NUM_VAR): Define macro symbol.
>   * libgomp.map (OMP_5.0.2): New version space with omp_get_device_num,
>   omp_get_device_num_.
>   * libgomp.texi (omp_get_device_num): Add documentation for new API
>   function.
>   * omp.h.in (omp_get_device_num): Add declaration.
>   * omp_lib.f90.in (omp_get_device_num): Likewise.
>   * omp_lib.h.in (omp_get_device_num): Likewise.
>   * target.c (gomp_load_image_to_device): If additional entry for device
>   number exists at end of returned entries from 'load_image_func' hook,
>   copy the assigned device number over to the device variable.
> 
>   * config/gcn/icv-device.c (GOMP_DEVICE_NUM_VAR): Define static global.
>   (omp_get_device_num): New API function, device side.
>   * config/plugin/plugin-gcn.c ("symcat.h"): Add include.
>   (GOMP_OFFLOAD_load_image): Add addresses of device GOMP_DEVICE_NUM_VAR
>   at end of returned 'target_table' entries.
> 
>   * config/nvptx/icv-device.c (GOMP_DEVICE_NUM_VAR): Define static global.
>   (omp_get_device_num): New API function, device side.
>   * config/plugin/plugin-nvptx.c ("symcat.h"): Add include.
>   (GOMP_OFFLOAD_load_image): Add addresses of device GOMP_DEVICE_NUM_VAR
>   at end of returned 'target_table' entries.
> 
>   * testsuite/lib/libgomp.exp
>   (check_effective_target_offload_target_intelmic): New function for
>   testing for intelmic offloading.
>   * testsuite/libgomp.c-c++-common/target-45.c: New test.
>   * testsuite/libgomp.fortran/target10.f90: New test.

Ok, thanks.

Jakub



Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.

2021-08-03 Thread Hongtao Liu via Gcc-patches
On Tue, Aug 3, 2021 at 6:20 PM Richard Biener
 wrote:
>
> On Tue, Aug 3, 2021 at 11:20 AM Richard Biener
>  wrote:
> >
> > On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches
> >  wrote:
> > >
> > > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu  wrote:
> > > >
> > > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu  wrote:
> > > > >
> > > > > Correct mail list, please reply under this email.
> > > > >
> > > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt  
> > > > > wrote:
> > > > > >
> > > > > > Hi:
> > > > > >   As described in the PR, the pinsr instruction has poor throughput in 
> > > > > > SKX
> > > > > > and CLX, which leads to worse performance in vectorization in some 
> > > > > > cases.
> > > > > > This patch adds a cost member named integer_to_sse to simulate 
> > > > > > pinsr/movd
> > > > > > which is used by vector construction, the cost is same as sse_op on 
> > > > > > other
> > > > > >  targets, but twice much as sse_op on CLX/SKX.
> > > > > >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > > > >   Ok for trunk?
> > > > > >
> > > > I'm going to check in this patch if there's no objection.
> > > Pushed to trunk.
> >
> >   /* N element inserts into SSE vectors.  */
> > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > + int cost
> > +   = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > +   ix86_cost->sse_op
> > +   : 
> > ix86_cost->integer_to_sse);
> > +
> >
> > so that's costing movd and pinsr the same, shouldn't we try to separate this
> > by doing
> >
> >  /* N element inserts into SSE vectors.  */
> >  int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> >  /* Account for int->SSE reg moves.  */
> >  if (!fp)
> >cost += TYPE_VECTOR_SUBPARTS (vectype) * 
> > ix86_cost->integer_to_sse;
> >
> > ?  pinsr is only supported with SSE4+ IIRC.  Note we also have
we have pinsrw under sse2, and pinsrb/d/q  w/ sse4+.
integer_to_sse is an estimate to model the average overhead of each
integer from gpr to sse, it can be movd + unpck,or movd + pinsr.
It seems reasonable to have uniform costs for scalar_to_vec and integer_to_sse.
vec_to_scalar  and sse_to_integer seems to be different,
sse_to_integer corresponds to movd. vec_to_scalar is vec_extract.
Maybe we should rename integer_to_sse to vec_set_integer.
> >
> >   case vec_to_scalar:
> >   case scalar_to_vec:
> > return ix86_vec_cost (mode, ix86_cost->sse_op);
> >
> > where scalar_to_vec is used to cost splats and vec_to_scalar is used
> > to cost element extracts.  Both lack costing of the move part.
> >
> > I realize we have GPR to XMM inserts which cover both the "move" and
> > the insert but then calling this 'integer_to_sse' is a bit odd.  The extract
> > cost also depends on the element number for AVX2/AVX512F.  The
> > vectorizer usually decomposes a vector fully and never does single
> > element extracts so the vextract128 cost amortizes.
> >
> > That said, the change leaves all targets besides skylake_cost with
> > not so great defaults I think.  For skylake you effectively add another
> > sse_op for the int->SSE move plus '1' (for whatever reason).  I think
> > that's reasonable for all targets.
> >
> > It does look a bit odd to have
> >
> >8,   /* cost of moving SSE register to 
> > intege
> > r.  */
> >   COSTS_N_INSNS (1),   /* cost of moving integer to sse 
> > registe
> > r.  */
> >
> > where sse_to_integer is used by the STV pass which mixes
> > CONST_N_INSNS scaled costs and unscaled costs (ick).
>
> Debugging some eventually related thing I applied the same costing
> as you did to skylake_cost to znver2_cost and figured that
> 538.imagick_r regresses by 23% at -Ofast -march=znver2 by such
> change.  So it seems changes here indeed need careful benchmarking.
>
> Richard.
>
> > Richard.
> >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR target/99881
> > > > > > * config/i386/i386.h (processor_costs): Add new member
> > > > > > integer_to_sse.
> > > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > > > > generic_cost, core_cost): Initialize integer_to_sse same 
> > > > > > value
> > > > > > as sse_op.
> > > > > > (skylake_cost): Initialize integer_to_sse twice as much as 
> > > > > > sse_op.
> > > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > > > > Use integer_to_sse instead of 

Re: [ARM] PR98435: Missed optimization in expanding vector constructor

2021-08-03 Thread Prathamesh Kulkarni via Gcc-patches
On Tue, 3 Aug 2021 at 14:59, Christophe Lyon
 wrote:
>
>
>
> On Tue, Jul 6, 2021 at 11:26 AM Prathamesh Kulkarni via Gcc-patches 
>  wrote:
>>
>> On Tue, 6 Jul 2021 at 13:33, Kyrylo Tkachov  wrote:
>> >
>> >
>> >
>> > > -Original Message-
>> > > From: Prathamesh Kulkarni 
>> > > Sent: 06 July 2021 08:06
>> > > To: Christophe LYON 
>> > > Cc: Kyrylo Tkachov ; gcc Patches > > > patc...@gcc.gnu.org>
>> > > Subject: Re: [ARM] PR98435: Missed optimization in expanding vector
>> > > constructor
>> > >
>> > > On Thu, 1 Jul 2021 at 16:26, Prathamesh Kulkarni
>> > >  wrote:
>> > > >
>> > > > On Wed, 30 Jun 2021 at 20:51, Christophe LYON
>> > > >  wrote:
>> > > > >
>> > > > >
>> > > > > On 29/06/2021 12:46, Prathamesh Kulkarni wrote:
>> > > > > > On Mon, 28 Jun 2021 at 14:48, Christophe LYON
>> > > > > >  wrote:
>> > > > > >>
>> > > > > >> On 28/06/2021 10:40, Kyrylo Tkachov via Gcc-patches wrote:
>> > > > >  -Original Message-
>> > > > >  From: Prathamesh Kulkarni 
>> > > > >  Sent: 28 June 2021 09:38
>> > > > >  To: Kyrylo Tkachov 
>> > > > >  Cc: Christophe Lyon ; gcc Patches
>> > > > > > > >  patc...@gcc.gnu.org>
>> > > > >  Subject: Re: [ARM] PR98435: Missed optimization in expanding
>> > > vector
>> > > > >  constructor
>> > > > > 
>> > > > >  On Thu, 24 Jun 2021 at 22:01, Kyrylo Tkachov
>> > > 
>> > > > >  wrote:
>> > > > > >
>> > > > > >> -Original Message-
>> > > > > >> From: Prathamesh Kulkarni 
>> > > > > >> Sent: 14 June 2021 09:02
>> > > > > >> To: Christophe Lyon 
>> > > > > >> Cc: gcc Patches ; Kyrylo Tkachov
>> > > > > >> 
>> > > > > >> Subject: Re: [ARM] PR98435: Missed optimization in expanding
>> > > vector
>> > > > > >> constructor
>> > > > > >>
>> > > > > >> On Wed, 9 Jun 2021 at 15:58, Prathamesh Kulkarni
>> > > > > >>  wrote:
>> > > > > >>> On Fri, 4 Jun 2021 at 13:15, Christophe Lyon
>> > > > >  
>> > > > > >> wrote:
>> > > > >  On Fri, 4 Jun 2021 at 09:27, Prathamesh Kulkarni via Gcc-
>> > > patches
>> > > > >   wrote:
>> > > > > > Hi,
>> > > > > > As mentioned in PR, for the following test-case:
>> > > > > >
>> > > > > > #include 
>> > > > > >
>> > > > > > bfloat16x4_t f1 (bfloat16_t a)
>> > > > > > {
>> > > > > > return vdup_n_bf16 (a);
>> > > > > > }
>> > > > > >
>> > > > > > bfloat16x4_t f2 (bfloat16_t a)
>> > > > > > {
>> > > > > > return (bfloat16x4_t) {a, a, a, a};
>> > > > > > }
>> > > > > >
>> > > > > > Compiling with arm-linux-gnueabi -O3 -mfpu=neon -mfloat-
>> > > > >  abi=softfp
>> > > > > > -march=armv8.2-a+bf16+fp16 results in f2 not being
>> > > vectorized:
>> > > > > >
>> > > > > > f1:
>> > > > > >   vdup.16 d16, r0
>> > > > > >   vmovr0, r1, d16  @ v4bf
>> > > > > >   bx  lr
>> > > > > >
>> > > > > > f2:
>> > > > > >   mov r3, r0  @ __bf16
>> > > > > >   adr r1, .L4
>> > > > > >   ldrdr0, [r1]
>> > > > > >   mov r2, r3  @ __bf16
>> > > > > >   mov ip, r3  @ __bf16
>> > > > > >   bfi r1, r2, #0, #16
>> > > > > >   bfi r0, ip, #0, #16
>> > > > > >   bfi r1, r3, #16, #16
>> > > > > >   bfi r0, r2, #16, #16
>> > > > > >   bx  lr
>> > > > > >
>> > > > > > This seems to happen because vec_init pattern in neon.md
>> > > has VDQ
>> > > > > >> mode
>> > > > > > iterator, which doesn't include V4BF. In attached patch, I
>> > > changed
>> > > > > > mode
>> > > > > > to VDQX which seems to work for the test-case, and the
>> > > compiler
>> > > > >  now
>> > > > > >> generates:
>> > > > > > f2:
>> > > > > >   vdup.16 d16, r0
>> > > > > >   vmovr0, r1, d16  @ v4bf
>> > > > > >   bx  lr
>> > > > > >
>> > > > > > However, the pattern is also gated on TARGET_HAVE_MVE
>> > > and I am
>> > > > > >> not
>> > > > > > sure if either VDQ or VDQX are correct modes for MVE since
>> > > MVE
>> > > > >  has
>> > > > > > only 128-bit vectors ?
>> > > > > >
>> > > > >  I think patterns common to both Neon and MVE should be
>> > > moved to
>> > > > >  vec-common.md, I don't know why such patterns were left in
>> > > > >  neon.md.
>> > > > > >>> Since we end up calling neon_expand_vector_init for both
>> > > NEON and
>> > > > >  MVE,
>> > > > > >>> I am not sure if we should separate the pattern ?
>> > > > > >>> Would it make sense to FAIL if the mode size isn't 16 bytes 
>> > > > > >>> for
>> > > MVE as
>> > > > > >>> in attached 

Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.

2021-08-03 Thread Richard Biener via Gcc-patches
On Tue, Aug 3, 2021 at 11:20 AM Richard Biener
 wrote:
>
> On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches
>  wrote:
> >
> > On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu  wrote:
> > >
> > > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu  wrote:
> > > >
> > > > Correct mail list, please reply under this email.
> > > >
> > > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt  wrote:
> > > > >
> > > > > Hi:
> > > > >   As described in PR, the pinsr instruction has poor throughput in SKX
> > > > > and CLX, which leads to worse performance in vectorization in some 
> > > > > cases.
> > > > > This patch adds a cost member named integer_to_sse to simulate 
> > > > > pinsr/movd
> > > > > which is used by vector construction, the cost is same as sse_op on 
> > > > > other
> > > > >  targets, but twice much as sse_op on CLX/SKX.
> > > > >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > > >   Ok for trunk?
> > > > >
> > > I'm going to check in this patch if there's no objection.
> > Pushed to trunk.
>
>   /* N element inserts into SSE vectors.  */
> - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> + int cost
> +   = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> +   ix86_cost->sse_op
> +   : ix86_cost->integer_to_sse);
> +
>
> so that's costing movd and pinsr the same, shouldn't we try to separate this
> by doing
>
>  /* N element inserts into SSE vectors.  */
>  int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
>  /* Account for int->SSE reg moves.  */
>  if (!fp)
>cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse;
>
> ?  pinsr is only supported with SSE4+ IIRC.  Note we also have
>
>   case vec_to_scalar:
>   case scalar_to_vec:
> return ix86_vec_cost (mode, ix86_cost->sse_op);
>
> where scalar_to_vec is used to cost splats and vec_to_scalar is used
> to cost element extracts.  Both lack costing of the move part.
>
> I realize we have GPR to XMM inserts which cover both the "move" and
> the insert but then calling this 'integer_to_sse' is a bit odd.  The extract
> cost also depends on the element number for AVX2/AVX512F.  The
> vectorizer usually decomposes a vector fully and never does single
> element extracts so the vextract128 cost amortizes.
>
> That said, the change leaves all targets besides skylake_cost with
> not so great defaults I think.  For skylake you effectively add another
> sse_op for the int->SSE move plus '1' (for whatever reason).  I think
> that's reasonable for all targets.
>
> It does look a bit odd to have
>
>8,   /* cost of moving SSE register to 
> intege
> r.  */
>   COSTS_N_INSNS (1),   /* cost of moving integer to sse 
> registe
> r.  */
>
> where sse_to_integer is used by the STV pass which mixes
> CONST_N_INSNS scaled costs and unscaled costs (ick).

Debugging some eventually related thing I applied the same costing
as you did to skylake_cost to znver2_cost and figured that
538.imagick_r regresses by 23% at -Ofast -march=znver2 by such
change.  So it seems changes here indeed need careful benchmarking.

Richard.

> Richard.
>
> > > > > gcc/ChangeLog:
> > > > >
> > > > > PR target/99881
> > > > > * config/i386/i386.h (processor_costs): Add new member
> > > > > integer_to_sse.
> > > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > > > generic_cost, core_cost): Initialize integer_to_sse same value
> > > > > as sse_op.
> > > > > (skylake_cost): Initialize integer_to_sse twice as much as 
> > > > > sse_op.
> > > > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > > > Use integer_to_sse instead of sse_op to calculate the cost of
> > > > > vec_construct.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > > PR target/99881
> > > > > * gcc.target/i386/pr99881.c: New test.
> > > > > ---
> > > > >  gcc/config/i386/i386.c  |  6 ++-
> > > > >  gcc/config/i386/i386.h  |  1 +
> > > > >  gcc/config/i386/x86-tune-costs.h| 26 +
> > > > >  gcc/testsuite/gcc.target/i386/pr99881.c | 49 
> > > > > +
> > > > >  4 files changed, 81 insertions(+), 1 deletion(-)
> > > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> > > > >
> > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > index ff96134fb37..fbebd2d8f9a 

Re: [PATCH 1/8] aarch64: Use memcpy to copy vector tables in vqtbl[234] intrinsics

2021-08-03 Thread Christophe Lyon via Gcc-patches
On Fri, Jul 23, 2021 at 10:22 AM Jonathan Wright via Gcc-patches <
gcc-patches@gcc.gnu.org> wrote:

> Hi,
>
> This patch uses __builtin_memcpy to copy vector structures instead of
> building a new opaque structure one vector at a time in each of the
> vqtbl[234] Neon intrinsics in arm_neon.h. This simplifies the header file
> and also improves code generation - superfluous move instructions
> were emitted for every register extraction/set in this additional
> structure.
>
> Add new code generation tests to verify that superfluous move
> instructions are no longer generated for the vqtbl[234] intrinsics.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-08  Jonathan Wright  
>
> * config/aarch64/arm_neon.h (vqtbl2_s8): Use __builtin_memcpy
> instead of constructing __builtin_aarch64_simd_oi one vector
> at a time.
> (vqtbl2_u8): Likewise.
> (vqtbl2_p8): Likewise.
> (vqtbl2q_s8): Likewise.
> (vqtbl2q_u8): Likewise.
> (vqtbl2q_p8): Likewise.
> (vqtbl3_s8): Use __builtin_memcpy instead of constructing
> __builtin_aarch64_simd_ci one vector at a time.
> (vqtbl3_u8): Likewise.
> (vqtbl3_p8): Likewise.
> (vqtbl3q_s8): Likewise.
> (vqtbl3q_u8): Likewise.
> (vqtbl3q_p8): Likewise.
> (vqtbl4_s8): Use __builtin_memcpy instead of constructing
> __builtin_aarch64_simd_xi one vector at a time.
> (vqtbl4_u8): Likewise.
> (vqtbl4_p8): Likewise.
> (vqtbl4q_s8): Likewise.
> (vqtbl4q_u8): Likewise.
> (vqtbl4q_p8): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/vector_structure_intrinsics.c: New test.
>

Hi,

This new test fails on aarch64_be:
 FAIL: gcc.target/aarch64/vector_structure_intrinsics.c scan-assembler-not
mov\\t

Can you check?

Thanks

Christophe


Re: [PATCH V2] gcc: Add vec_select -> subreg RTL simplification

2021-08-03 Thread Christophe Lyon via Gcc-patches
Hi,

Since the arm-linux toolchain build has been fixed, I have noticed
additional failures on armeb:
gcc.target/arm/crypto-vsha1cq_u32.c scan-assembler-times
vdup.32\\tq[0-9]+, r[0-9]+ 4
gcc.target/arm/crypto-vsha1cq_u32.c scan-assembler-times
vmov.32\\tr[0-9]+, d[0-9]+\\[[0-9]+\\]+ 3
gcc.target/arm/crypto-vsha1h_u32.c scan-assembler-times
vdup.32\\tq[0-9]+, r[0-9]+ 4
gcc.target/arm/crypto-vsha1h_u32.c scan-assembler-times
vmov.32\\tr[0-9]+, d[0-9]+\\[[0-9]+\\]+ 3
gcc.target/arm/crypto-vsha1mq_u32.c scan-assembler-times
vdup.32\\tq[0-9]+, r[0-9]+ 4
gcc.target/arm/crypto-vsha1mq_u32.c scan-assembler-times
vmov.32\\tr[0-9]+, d[0-9]+\\[[0-9]+\\]+ 3
gcc.target/arm/crypto-vsha1pq_u32.c scan-assembler-times
vdup.32\\tq[0-9]+, r[0-9]+ 4
gcc.target/arm/crypto-vsha1pq_u32.c scan-assembler-times
vmov.32\\tr[0-9]+, d[0-9]+\\[[0-9]+\\]+ 3

I don't see them mentioned in this thread though?

Can you check?

Thanks

Christophe


On Thu, Jul 15, 2021 at 3:07 PM Jonathan Wright 
wrote:

> Ah, yes - those test results should have only been changed for little
> endian.
>
> I've submitted a patch to the list restoring the original expected results
> for big
> endian.
>
> Thanks,
> Jonathan
> --
> *From:* Christophe Lyon 
> *Sent:* 15 July 2021 10:09
> *To:* Richard Sandiford ; Jonathan Wright <
> jonathan.wri...@arm.com>; gcc-patches@gcc.gnu.org ;
> Kyrylo Tkachov 
> *Subject:* Re: [PATCH V2] gcc: Add vec_select -> subreg RTL simplification
>
>
>
> On Mon, Jul 12, 2021 at 5:31 PM Richard Sandiford via Gcc-patches <
> gcc-patches@gcc.gnu.org> wrote:
>
> Jonathan Wright  writes:
> > Hi,
> >
> > Version 2 of this patch adds more code generation tests to show the
> > benefit of this RTL simplification as well as adding a new helper
> function
> > 'rtx_vec_series_p' to reduce code duplication.
> >
> > Patch tested as version 1 - ok for master?
>
> Sorry for the slow reply.
>
> > Regression tested and bootstrapped on aarch64-none-linux-gnu,
> > x86_64-unknown-linux-gnu, arm-none-linux-gnueabihf and
> > aarch64_be-none-linux-gnu - no issues.
>
> I've also tested this on powerpc64le-unknown-linux-gnu, no issues again.
>
> > diff --git a/gcc/combine.c b/gcc/combine.c
> > index
> 6476812a21268e28219d1e302ee1c979d528a6ca..0ff6ca87e4432cfeff1cae1dd219ea81ea0b73e4
> 100644
> > --- a/gcc/combine.c
> > +++ b/gcc/combine.c
> > @@ -6276,6 +6276,26 @@ combine_simplify_rtx (rtx x, machine_mode
> op0_mode, int in_dest,
> > - 1,
> > 0));
> >break;
> > +case VEC_SELECT:
> > +  {
> > + rtx trueop0 = XEXP (x, 0);
> > + mode = GET_MODE (trueop0);
> > + rtx trueop1 = XEXP (x, 1);
> > + int nunits;
> > + /* If we select a low-part subreg, return that.  */
> > + if (GET_MODE_NUNITS (mode).is_constant ()
> > + && targetm.can_change_mode_class (mode, GET_MODE (x),
> ALL_REGS))
> > +   {
> > + int offset = BYTES_BIG_ENDIAN ? nunits - XVECLEN (trueop1, 0)
> : 0;
> > +
> > + if (rtx_vec_series_p (trueop1, offset))
> > +   {
> > + rtx new_rtx = lowpart_subreg (GET_MODE (x), trueop0, mode);
> > + if (new_rtx != NULL_RTX)
> > +   return new_rtx;
> > +   }
> > +   }
> > +  }
>
> Since this occurs three times, I think it would be worth having
> a new predicate:
>
> /* Return true if, for all OP of mode OP_MODE:
>
>  (vec_select:RESULT_MODE OP SEL)
>
>is equivalent to the lowpart RESULT_MODE of OP.  */
>
> bool
> vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx
> sel)
>
> containing the GET_MODE_NUNITS (…).is_constant, can_change_mode_class
> and rtx_vec_series_p tests.
>
> I think the function belongs in rtlanal.[hc], even though subreg_lowpart_p
> is in emit-rtl.c.
>
> > diff --git a/gcc/config/aarch64/aarch64.md
> b/gcc/config/aarch64/aarch64.md
> > index
> aef6da9732d45b3586bad5ba57dafa438374ac3c..f12a0bebd3d6dd3381ac8248cd3fa3f519115105
> 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -1884,15 +1884,16 @@
> >  )
> >
> >  (define_insn "*zero_extend2_aarch64"
> > -  [(set (match_operand:GPI 0 "register_operand" "=r,r,w")
> > -(zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand"
> "r,m,m")))]
> > +  [(set (match_operand:GPI 0 "register_operand" "=r,r,w,r")
> > +(zero_extend:GPI (match_operand:SHORT 1 "nonimmediate_operand"
> "r,m,m,w")))]
> >""
> >"@
> > and\t%0, %1, 
> > ldr\t%w0, %1
> > -   ldr\t%0, %1"
> > -  [(set_attr "type" "logic_imm,load_4,f_loads")
> > -   (set_attr "arch" "*,*,fp")]
> > +   ldr\t%0, %1
> > +   umov\t%w0, %1.[0]"
> > +  [(set_attr "type" "logic_imm,load_4,f_loads,neon_to_gp")
> > +   (set_attr "arch" "*,*,fp,fp")]
>
> FTR (just to show I thought about it): I don't know whether the umov
> can really be considered an fp operation rather than a simd operation,
> but since we 

Re: [ARM] PR98435: Missed optimization in expanding vector constructor

2021-08-03 Thread Christophe Lyon via Gcc-patches
On Tue, Jul 6, 2021 at 11:26 AM Prathamesh Kulkarni via Gcc-patches <
gcc-patches@gcc.gnu.org> wrote:

> On Tue, 6 Jul 2021 at 13:33, Kyrylo Tkachov 
> wrote:
> >
> >
> >
> > > -Original Message-
> > > From: Prathamesh Kulkarni 
> > > Sent: 06 July 2021 08:06
> > > To: Christophe LYON 
> > > Cc: Kyrylo Tkachov ; gcc Patches  > > patc...@gcc.gnu.org>
> > > Subject: Re: [ARM] PR98435: Missed optimization in expanding vector
> > > constructor
> > >
> > > On Thu, 1 Jul 2021 at 16:26, Prathamesh Kulkarni
> > >  wrote:
> > > >
> > > > On Wed, 30 Jun 2021 at 20:51, Christophe LYON
> > > >  wrote:
> > > > >
> > > > >
> > > > > On 29/06/2021 12:46, Prathamesh Kulkarni wrote:
> > > > > > On Mon, 28 Jun 2021 at 14:48, Christophe LYON
> > > > > >  wrote:
> > > > > >>
> > > > > >> On 28/06/2021 10:40, Kyrylo Tkachov via Gcc-patches wrote:
> > > > >  -Original Message-
> > > > >  From: Prathamesh Kulkarni 
> > > > >  Sent: 28 June 2021 09:38
> > > > >  To: Kyrylo Tkachov 
> > > > >  Cc: Christophe Lyon ; gcc Patches
> > >  > > > >  patc...@gcc.gnu.org>
> > > > >  Subject: Re: [ARM] PR98435: Missed optimization in expanding
> > > vector
> > > > >  constructor
> > > > > 
> > > > >  On Thu, 24 Jun 2021 at 22:01, Kyrylo Tkachov
> > > 
> > > > >  wrote:
> > > > > >
> > > > > >> -Original Message-
> > > > > >> From: Prathamesh Kulkarni 
> > > > > >> Sent: 14 June 2021 09:02
> > > > > >> To: Christophe Lyon 
> > > > > >> Cc: gcc Patches ; Kyrylo Tkachov
> > > > > >> 
> > > > > >> Subject: Re: [ARM] PR98435: Missed optimization in expanding
> > > vector
> > > > > >> constructor
> > > > > >>
> > > > > >> On Wed, 9 Jun 2021 at 15:58, Prathamesh Kulkarni
> > > > > >>  wrote:
> > > > > >>> On Fri, 4 Jun 2021 at 13:15, Christophe Lyon
> > > > >  
> > > > > >> wrote:
> > > > >  On Fri, 4 Jun 2021 at 09:27, Prathamesh Kulkarni via Gcc-
> > > patches
> > > > >   wrote:
> > > > > > Hi,
> > > > > > As mentioned in PR, for the following test-case:
> > > > > >
> > > > > > #include 
> > > > > >
> > > > > > bfloat16x4_t f1 (bfloat16_t a)
> > > > > > {
> > > > > > return vdup_n_bf16 (a);
> > > > > > }
> > > > > >
> > > > > > bfloat16x4_t f2 (bfloat16_t a)
> > > > > > {
> > > > > > return (bfloat16x4_t) {a, a, a, a};
> > > > > > }
> > > > > >
> > > > > > Compiling with arm-linux-gnueabi -O3 -mfpu=neon -mfloat-
> > > > >  abi=softfp
> > > > > > -march=armv8.2-a+bf16+fp16 results in f2 not being
> > > vectorized:
> > > > > >
> > > > > > f1:
> > > > > >   vdup.16 d16, r0
> > > > > >   vmovr0, r1, d16  @ v4bf
> > > > > >   bx  lr
> > > > > >
> > > > > > f2:
> > > > > >   mov r3, r0  @ __bf16
> > > > > >   adr r1, .L4
> > > > > >   ldrdr0, [r1]
> > > > > >   mov r2, r3  @ __bf16
> > > > > >   mov ip, r3  @ __bf16
> > > > > >   bfi r1, r2, #0, #16
> > > > > >   bfi r0, ip, #0, #16
> > > > > >   bfi r1, r3, #16, #16
> > > > > >   bfi r0, r2, #16, #16
> > > > > >   bx  lr
> > > > > >
> > > > > > This seems to happen because vec_init pattern in neon.md
> > > has VDQ
> > > > > >> mode
> > > > > > iterator, which doesn't include V4BF. In attached patch,
> I
> > > changed
> > > > > > mode
> > > > > > to VDQX which seems to work for the test-case, and the
> > > compiler
> > > > >  now
> > > > > >> generates:
> > > > > > f2:
> > > > > >   vdup.16 d16, r0
> > > > > >   vmovr0, r1, d16  @ v4bf
> > > > > >   bx  lr
> > > > > >
> > > > > > However, the pattern is also gated on TARGET_HAVE_MVE
> > > and I am
> > > > > >> not
> > > > > > sure if either VDQ or VDQX are correct modes for MVE
> since
> > > MVE
> > > > >  has
> > > > > > only 128-bit vectors ?
> > > > > >
> > > > >  I think patterns common to both Neon and MVE should be
> > > moved to
> > > > >  vec-common.md, I don't know why such patterns were left in
> > > > >  neon.md.
> > > > > >>> Since we end up calling neon_expand_vector_init for both
> > > NEON and
> > > > >  MVE,
> > > > > >>> I am not sure if we should separate the pattern ?
> > > > > >>> Would it make sense to FAIL if the mode size isn't 16
> bytes for
> > > MVE as
> > > > > >>> in attached patch so
> > > > > >>> it will call neon_expand_vector_init only for 128-bit
> vectors ?
> > > > > >>> Altho hard-coding 16 in the pattern doesn't seem a good
> idea to
> > 

Re: Re: [PATCH] Fix ICE when mixing VLAs and statement expressions [PR91038]

2021-08-03 Thread Richard Biener via Gcc-patches
On Tue, Aug 3, 2021 at 10:28 AM Martin Uecker  wrote:
>
>
> Hi
> Am Dienstag, den 03.08.2021, 10:10 +0200 schrieb Richard Biener:
> > On Tue, Aug 3, 2021 at 7:32 AM Martin Uecker  wrote:
> > >
> > >
> > > (resending from a different account, as emails seem to do not
> > > go out from my other account at this time)
> > >
> > > Am Montag, den 02.08.2021, 16:05 +0200 schrieb Martin Uecker:
> > > > > On Sun, Aug 1, 2021 at 7:37 PM Uecker, Martin
> > > > >  wrote:
> > > > > >
> > > > > > Here is an attempt to fix some old and annoying bugs related
> > > > > > to VLAs and statement expressions. In particulary, this seems
> > > > > > to fix the issues with variably-modified types which are
> > > > > > returned from statement expressions (which works on clang),
> > > > > > but there are still bugs remaining related to structs
> > > > > > with VLA members (which seems to be a FE bug).
> > > > > >
> > > > > > Of course, I might be doing something stupid...
> > > > >
> > > > > How's evaluation order of (f())[g()] defined (with f returning
> > > > > a
> > > > > pointer)?
> > > > > Isn't that just f() + g()*sizeof(int) and thus undefined?
> > > >
> > > > Yes, in C it is
> > > >
> > > > f() + g()
> > > >
> > > > and it is unsequenced. But the order of 'f' and 'g'
> > > > is not relevant here and also the patch does not change
> > > > it (the base expression is gimplified before the index).
> > > >
> > > > Essentially, we have
> > > >
> > > > ({ ... }) + g() * sizeof(X)
> > > >
> > > > where X refers to a declaration in the statement expression.
> > > > Without the patch the size expressions are gimplified before
> > > > the base expression and also before the index expression.
> > > > With the patch the ({ ... }) is gimplified also before the
> > > > size expression.
> > > >
> > > > > If it's undefined then I think the incoming GENERIC is ill-
> > > > > defined.
> > > >
> > > > I think it is OK because the arguments are evaluated
> > > > before the operation.  Without the patch, parts of the
> > > > operation (the size expressions) are gimplified before
> > > > the arguments and this seems wrong to me.
> >
> > But you said the evaluation order is undefined.
>
> The evaluation order of the two arguments (base
> and index) is undefined.  But the operation itself has
> to happen after the arguments are evaluated like
> the call to a is sequenced before f and g:
>
> a(f(), g())
>
>
> Computing the correct step size in the pointer
> arithmetic is part of the operation itself and not
> part of the evaluation of the arguments.
>
> The problem here is that this part of the operation
> is done before the arguments are evaluated, which
> is a compiler bug.

Yes, but the bug is IMHO in the C frontend which inserts the DECL_EXPR
at a wrong spot, not making sure it is evaluated before it is used.  Working
around this deficiency in the gimplifier sounds incorrect.

Does the same issue arise with writing the testcases as

 ({ ... }) + i;

?  How can we fix it then if you also need to support

 i + ({ ...});

?

> > So IMHO the GENERIC is undefined in evaluating the size of sth
> > that's not live?
> >
> >  That said, given the statement expression
> > result undergoes array to pointer decay doesn't this pointer
> > refer to an object that ended its lifetime?
>
> > "In a statement expression, any temporaries created within a
> > statement are destroyed at that statement's end."
>
> > That is, don't the testcases all invoke undefined behavior at
> > runtime?
>
> This is true for one of the test cases (where not having
> an ICE is then
> a QoI issue), but not for the others
> where the object is allocated by
> malloc and a pointer
> to the object is returned from the statement
> expression.
> This is supposed to work.
>
>
> Martin
>
>
>


Re: [PATCH] Add the member integer_to_sse to processor_cost as a cost simulation for movd/pinsrd. It will be used to calculate the cost of vec_construct.

2021-08-03 Thread Richard Biener via Gcc-patches
On Wed, Jul 28, 2021 at 4:51 AM Hongtao Liu via Gcc-patches
 wrote:
>
> On Tue, Jul 27, 2021 at 9:54 AM Hongtao Liu  wrote:
> >
> > On Mon, Jul 26, 2021 at 4:49 PM Hongtao Liu  wrote:
> > >
> > > Correct mail list, please reply under this email.
> > >
> > > On Mon, Jul 26, 2021 at 4:47 PM liuhongt  wrote:
> > > >
> > > > Hi:
> > > >   As described in PR, the pinsr instruction has poor throughput in SKX
> > > > and CLX, which leads to worse performance in vectorization in some 
> > > > cases.
> > > > This patch adds a cost member named integer_to_sse to simulate 
> > > > pinsr/movd
> > > > which is used by vector construction, the cost is same as sse_op on 
> > > > other
> > > >  targets, but twice much as sse_op on CLX/SKX.
> > > >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > >   Ok for trunk?
> > > >
> > I'm going to check in this patch if there's no objection.
> Pushed to trunk.

  /* N element inserts into SSE vectors.  */
- int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
+ int cost
+   = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
+   ix86_cost->sse_op
+   : ix86_cost->integer_to_sse);
+

so that's costing movd and pinsr the same, shouldn't we try to separate this
by doing

 /* N element inserts into SSE vectors.  */
 int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
 /* Account for int->SSE reg moves.  */
 if (!fp)
   cost += TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->integer_to_sse;

?  pinsr is only supported with SSE4+ IIRC.  Note we also have

  case vec_to_scalar:
  case scalar_to_vec:
return ix86_vec_cost (mode, ix86_cost->sse_op);

where scalar_to_vec is used to cost splats and vec_to_scalar is used
to cost element extracts.  Both lack costing of the move part.

I realize we have GPR to XMM inserts which cover both the "move" and
the insert but then calling this 'integer_to_sse' is a bit odd.  The extract
cost also depends on the element number for AVX2/AVX512F.  The
vectorizer usually decomposes a vector fully and never does single
element extracts so the vextract128 cost amortizes.

That said, the change leaves all targets besides skylake_cost with
not so great defaults I think.  For skylake you effectively add another
sse_op for the int->SSE move plus '1' (for whatever reason).  I think
that's reasonable for all targets.

It does look a bit odd to have

   8,   /* cost of moving SSE register to intege
r.  */
  COSTS_N_INSNS (1),   /* cost of moving integer to sse registe
r.  */

where sse_to_integer is used by the STV pass which mixes
CONST_N_INSNS scaled costs and unscaled costs (ick).

Richard.

> > > > gcc/ChangeLog:
> > > >
> > > > PR target/99881
> > > > * config/i386/i386.h (processor_costs): Add new member
> > > > integer_to_sse.
> > > > * config/i386/x86-tune-costs.h (ix86_size_cost, i386_cost,
> > > > i486_cost, pentium_cost, lakemont_cost, pentiumpro_cost,
> > > > geode_cost, k6_cost, athlon_cost, k8_cost, amdfam10_cost,
> > > > bdver_cost, znver1_cost, znver2_cost, znver3_cost,
> > > > btver1_cost, btver2_cost, btver3_cost, pentium4_cost,
> > > > nocona_cost, atom_cost, atom_cost, slm_cost, intel_cost,
> > > > generic_cost, core_cost): Initialize integer_to_sse same value
> > > > as sse_op.
> > > > (skylake_cost): Initialize integer_to_sse twice as much as 
> > > > sse_op.
> > > > * config/i386/i386.c (ix86_builtin_vectorization_cost):
> > > > Use integer_to_sse instead of sse_op to calculate the cost of
> > > > vec_construct.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > > PR target/99881
> > > > * gcc.target/i386/pr99881.c: New test.
> > > > ---
> > > >  gcc/config/i386/i386.c  |  6 ++-
> > > >  gcc/config/i386/i386.h  |  1 +
> > > >  gcc/config/i386/x86-tune-costs.h| 26 +
> > > >  gcc/testsuite/gcc.target/i386/pr99881.c | 49 +
> > > >  4 files changed, 81 insertions(+), 1 deletion(-)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr99881.c
> > > >
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > index ff96134fb37..fbebd2d8f9a 100644
> > > > --- a/gcc/config/i386/i386.c
> > > > +++ b/gcc/config/i386/i386.c
> > > > @@ -22051,7 +22051,11 @@ ix86_builtin_vectorization_cost (enum 
> > > > vect_cost_for_stmt type_of_cost,
> > > >case vec_construct:
> > > > {
> > > >   /* N element inserts into SSE vectors.  */
> > > > - int cost = TYPE_VECTOR_SUBPARTS (vectype) * ix86_cost->sse_op;
> > > > + int cost
> > > > +   = TYPE_VECTOR_SUBPARTS (vectype) * (fp ?
> > > > +   

[PATCH] Fix loop split incorrect count and probability

2021-08-03 Thread Xionghu Luo via Gcc-patches
loop split condition is moved between loop1 and loop2, the split bb's
count and probability should also be duplicated instead of (100% vs INV),
secondly, the original loop1 and loop2 counts need to be proportional to the
original loop.

Regression tested pass, OK for master?

diff base/loop-cond-split-1.c.151t.lsplit  
patched/loop-cond-split-1.c.151t.lsplit:
...
   int prephitmp_16;
   int prephitmp_25;

[local count: 118111600]:
   if (n_7(D) > 0)
 goto ; [89.00%]
   else
 goto ; [11.00%]

[local count: 118111600]:
   return;

[local count: 105119324]:
   pretmp_3 = ga;

-   [local count: 955630225]:
+   [local count: 315357973]:
   # i_13 = PHI 
   # prephitmp_12 = PHI 
   if (prephitmp_12 != 0)
 goto ; [33.00%]
   else
 goto ; [67.00%]

-   [local count: 315357972]:
+   [local count: 104068130]:
   _2 = do_something ();
   ga = _2;

-   [local count: 955630225]:
+   [local count: 315357973]:
   # prephitmp_5 = PHI 
   i_10 = inc (i_13);
   if (n_7(D) > i_10)
 goto ; [89.00%]
   else
 goto ; [11.00%]

[local count: 105119324]:
   goto ; [100.00%]

-   [local count: 850510901]:
+   [local count: 280668596]:
   if (prephitmp_12 != 0)
-goto ; [100.00%]
+goto ; [33.00%]
   else
-goto ; [INV]
+goto ; [67.00%]

-   [local count: 850510901]:
+   [local count: 280668596]:
   goto ; [100.00%]

-   [count: 0]:
+   [local count: 70429947]:
   # i_23 = PHI 
   # prephitmp_25 = PHI 

-   [local count: 955630225]:
+   [local count: 640272252]:
   # i_15 = PHI 
   # prephitmp_16 = PHI 
   i_22 = inc (i_15);
   if (n_7(D) > i_22)
 goto ; [89.00%]
   else
 goto ; [11.00%]

-   [local count: 850510901]:
+   [local count: 569842305]:
   goto ; [100.00%]

 }

gcc/ChangeLog:

* tree-ssa-loop-split.c (split_loop): Fix incorrect probability.
(do_split_loop_on_cond): Likewise.
---
 gcc/tree-ssa-loop-split.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c
index 3a09bbc39e5..8e5a7ded0f7 100644
--- a/gcc/tree-ssa-loop-split.c
+++ b/gcc/tree-ssa-loop-split.c
@@ -583,10 +583,10 @@ split_loop (class loop *loop1)
basic_block cond_bb;
 
class loop *loop2 = loop_version (loop1, cond, _bb,
-  profile_probability::always (),
-  profile_probability::always (),
-  profile_probability::always (),
-  profile_probability::always (),
+  true_edge->probability,
+  true_edge->probability.invert (),
+  true_edge->probability,
+  true_edge->probability.invert (),
   true);
gcc_assert (loop2);
 
@@ -1486,10 +1486,10 @@ do_split_loop_on_cond (struct loop *loop1, edge 
invar_branch)
   initialize_original_copy_tables ();
 
   struct loop *loop2 = loop_version (loop1, boolean_true_node, NULL,
-profile_probability::always (),
-profile_probability::never (),
-profile_probability::always (),
-profile_probability::always (),
+invar_branch->probability.invert (),
+invar_branch->probability,
+invar_branch->probability.invert (),
+invar_branch->probability,
 true);
   if (!loop2)
 {
-- 
2.25.1



[PATCH][www] Move mipsisa64elf from primary to secondary as mips64-linux-gnu

2021-08-03 Thread Richard Biener
This removes mipsisa64elf from the list of primary targets, adding
mips64-linux-gnu as secondary target for GCC 12.  Test results for
the latter are regularly posted by Matthias Klose.
---
 htdocs/gcc-12/criteria.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htdocs/gcc-12/criteria.html b/htdocs/gcc-12/criteria.html
index 58e82021..f407d909 100644
--- a/htdocs/gcc-12/criteria.html
+++ b/htdocs/gcc-12/criteria.html
@@ -109,7 +109,6 @@ application testing.
 arm-linux-gnueabi
 i586-unknown-freebsd
 i686-pc-linux-gnu
-mipsisa64-elf
 powerpc64-unknown-linux-gnu
 powerpc64le-unknown-linux-gnu
 sparc-sun-solaris2.11
@@ -126,6 +125,7 @@ application testing.
 i686-mingw32
 powerpc-ibm-aix7.1.0.0
 s390x-linux-gnu
+mips64-linux-gnu
 
 
 Code Quality and Compilation Time
-- 
2.31.1


Re: [PATCH] x86: Use XMM31 for scratch SSE register

2021-08-03 Thread Uros Bizjak via Gcc-patches
On Tue, Aug 3, 2021 at 10:15 AM Hongtao Liu  wrote:
>
> On Tue, Aug 3, 2021 at 4:03 PM Uros Bizjak via Gcc-patches
>  wrote:
> >
> > On Mon, Aug 2, 2021 at 7:47 PM H.J. Lu  wrote:
> > >
> > > In 64-bit mode, use XMM31 for scratch SSE register to avoid vzeroupper
> > > if possible.
> > >
> > > gcc/
> > >
> > > * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode,
> > > try XMM31 to avoid vzeroupper.
> > >
> > > gcc/testsuite/
> > >
> > > * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to
> > > disable XMM31.
> > > * gcc.target/i386/avx-vzeroupper-15.c: Likewise.
> > > * gcc.target/i386/pr82941-1.c: Updated.  Check for vzeroupper.
> > > * gcc.target/i386/pr82942-1.c: Likewise.
> > > * gcc.target/i386/pr82990-1.c: Likewise.
> > > * gcc.target/i386/pr82990-3.c: Likewise.
> > > * gcc.target/i386/pr82990-5.c: Likewise.
> > > * gcc.target/i386/pr100865-4b.c: Likewise.
> > > * gcc.target/i386/pr100865-6b.c: Likewise.
> > > * gcc.target/i386/pr100865-7b.c: Likewise.
> > > * gcc.target/i386/pr100865-10b.c: Likewise.
> > > * gcc.target/i386/pr100865-8b.c: Updated.
> > > * gcc.target/i386/pr100865-9b.c: Likewise.
> > > * gcc.target/i386/pr100865-11b.c: Likewise.
> > > * gcc.target/i386/pr100865-12b.c: Likewise.
> > > ---
> > >  gcc/config/i386/i386.c | 18 +++---
> > >  .../gcc.target/i386/avx-vzeroupper-14.c|  2 +-
> > >  .../gcc.target/i386/avx-vzeroupper-15.c|  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-10b.c   |  1 +
> > >  gcc/testsuite/gcc.target/i386/pr100865-11b.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-12b.c   |  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-4b.c|  2 ++
> > >  gcc/testsuite/gcc.target/i386/pr100865-6b.c|  5 -
> > >  gcc/testsuite/gcc.target/i386/pr100865-7b.c|  5 -
> > >  gcc/testsuite/gcc.target/i386/pr100865-8b.c|  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr100865-9b.c|  2 +-
> > >  gcc/testsuite/gcc.target/i386/pr82941-1.c  |  3 ++-
> > >  gcc/testsuite/gcc.target/i386/pr82942-1.c  |  3 ++-
> > >  gcc/testsuite/gcc.target/i386/pr82990-1.c  |  3 ++-
> > >  gcc/testsuite/gcc.target/i386/pr82990-3.c  |  3 ++-
> > >  gcc/testsuite/gcc.target/i386/pr82990-5.c  |  3 ++-
> > >  16 files changed, 42 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index 842eb0e6786..ec0690876b7 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -23335,9 +23335,21 @@ rtx
> > >  ix86_gen_scratch_sse_rtx (machine_mode mode)
> > >  {
> > >if (TARGET_SSE && !lra_in_progress)
> > > -return gen_rtx_REG (mode, (TARGET_64BIT
> > > -  ? LAST_REX_SSE_REG
> > > -  : LAST_SSE_REG));
> > > +{
> > > +  unsigned int regno;
> > > +  if (TARGET_64BIT)
> > > +   {
> > > + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always
> > > +use XMM31 for CSE.  */
> > > + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
> > > +   regno = LAST_EXT_REX_SSE_REG;
> > > + else
> > > +   regno = LAST_REX_SSE_REG;
> > > +   }
> > > +  else
> > > +   regno = LAST_SSE_REG;
> >
> > Assuming that ix86_hard_regno_mode_ok always returns false for XMM31
> > in 64bit mode, we can do:
> >
> > /* Use XMM31 if available to avoid vzeroupper.  */
> > if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
> >   regno = LAST_EXST_REX_SSE_REG;
> > else if (TARGET_64BIT)
> >   regno = LAST_EXT_REX_SSE_REG;

> why? w/o avx512 xmm31 is not available.

Oh, a typo, this should read LAST_REX_SSE_REG.

Uros.


Re: Re: [PATCH] Fix ICE when mixing VLAs and statement expressions [PR91038]

2021-08-03 Thread Martin Uecker


Hi 
Am Dienstag, den 03.08.2021, 10:10 +0200 schrieb Richard Biener:
> On Tue, Aug 3, 2021 at 7:32 AM Martin Uecker  wrote:
> > 
> > 
> > (resending from a different account, as emails seem to do not
> > go out from my other account at this time)
> > 
> > Am Montag, den 02.08.2021, 16:05 +0200 schrieb Martin Uecker:
> > > > On Sun, Aug 1, 2021 at 7:37 PM Uecker, Martin
> > > >  wrote:
> > > > > 
> > > > > Here is an attempt to fix some old and annoying bugs related
> > > > > to VLAs and statement expressions. In particulary, this seems
> > > > > to fix the issues with variably-modified types which are
> > > > > returned from statement expressions (which works on clang),
> > > > > but there are still bugs remaining related to structs
> > > > > with VLA members (which seems to be a FE bug).
> > > > > 
> > > > > Of course, I might be doing something stupid...
> > > > 
> > > > How's evaluation order of (f())[g()] defined (with f returning
> > > > a
> > > > pointer)?
> > > > Isn't that just f() + g()*sizeof(int) and thus undefined?
> > > 
> > > Yes, in C it is
> > > 
> > > f() + g()
> > > 
> > > and it is unsequenced. But the order of 'f' and 'g'
> > > is not relevant here and also the patch does not change
> > > it (the base expression is gimplified before the index).
> > > 
> > > Essentially, we have
> > > 
> > > ({ ... }) + g() * sizeof(X)
> > > 
> > > where X refers to a declaration in the statement expression.
> > > Without the patch the size expressions are gimplified before
> > > the base expression and also before the index expression.
> > > With the patch the ({ ... }) is gimplified also before the
> > > size expression.
> > > 
> > > > If it's undefined then I think the incoming GENERIC is ill-
> > > > defined.
> > > 
> > > I think it is OK because the arguments are evaluated
> > > before the operation.  Without the patch, parts of the
> > > operation (the size expressions) are gimplified before
> > > the arguments and this seems wrong to me.
> 
> But you said the evaluation order is undefined.

The evaluation order of the two arguments (base
and index) is undefined.  But the operation itself has
to happen after the arguments are evaluated like
the call to a is sequenced before f and g:

a(f(), g())


Computing the correct step size in the pointer
arithmetic is part of the operation itself and not
part of the evaluation of the arguments.

The problem here is that this part of the operation
is done before the arguments are evaluated, which
is a compiler bug.

> So IMHO the GENERIC is undefined in evaluating the size of sth
> that's not live? 
>
>  That said, given the statement expression
> result undergoes array to pointer decay doesn't this pointer
> refer to an object that ended its lifetime?

> "In a statement expression, any temporaries created within a
> statement are destroyed at that statement's end."

> That is, don't the testcases all invoke undefined behavior at
> runtime?

This is true for one of the test cases (where not having an ICE is then
a QoI issue), but not for the others where the object is allocated by
malloc and a pointer to the object is returned from the statement
expression.  This is supposed to work.


Martin





Re: [PATCH] x86: Use XMM31 for scratch SSE register

2021-08-03 Thread Hongtao Liu via Gcc-patches
On Tue, Aug 3, 2021 at 4:03 PM Uros Bizjak via Gcc-patches
 wrote:
>
> On Mon, Aug 2, 2021 at 7:47 PM H.J. Lu  wrote:
> >
> > In 64-bit mode, use XMM31 for scratch SSE register to avoid vzeroupper
> > if possible.
> >
> > gcc/
> >
> > * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode,
> > try XMM31 to avoid vzeroupper.
> >
> > gcc/testsuite/
> >
> > * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to
> > disable XMM31.
> > * gcc.target/i386/avx-vzeroupper-15.c: Likewise.
> > * gcc.target/i386/pr82941-1.c: Updated.  Check for vzeroupper.
> > * gcc.target/i386/pr82942-1.c: Likewise.
> > * gcc.target/i386/pr82990-1.c: Likewise.
> > * gcc.target/i386/pr82990-3.c: Likewise.
> > * gcc.target/i386/pr82990-5.c: Likewise.
> > * gcc.target/i386/pr100865-4b.c: Likewise.
> > * gcc.target/i386/pr100865-6b.c: Likewise.
> > * gcc.target/i386/pr100865-7b.c: Likewise.
> > * gcc.target/i386/pr100865-10b.c: Likewise.
> > * gcc.target/i386/pr100865-8b.c: Updated.
> > * gcc.target/i386/pr100865-9b.c: Likewise.
> > * gcc.target/i386/pr100865-11b.c: Likewise.
> > * gcc.target/i386/pr100865-12b.c: Likewise.
> > ---
> >  gcc/config/i386/i386.c | 18 +++---
> >  .../gcc.target/i386/avx-vzeroupper-14.c|  2 +-
> >  .../gcc.target/i386/avx-vzeroupper-15.c|  2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-10b.c   |  1 +
> >  gcc/testsuite/gcc.target/i386/pr100865-11b.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-12b.c   |  2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-4b.c|  2 ++
> >  gcc/testsuite/gcc.target/i386/pr100865-6b.c|  5 -
> >  gcc/testsuite/gcc.target/i386/pr100865-7b.c|  5 -
> >  gcc/testsuite/gcc.target/i386/pr100865-8b.c|  2 +-
> >  gcc/testsuite/gcc.target/i386/pr100865-9b.c|  2 +-
> >  gcc/testsuite/gcc.target/i386/pr82941-1.c  |  3 ++-
> >  gcc/testsuite/gcc.target/i386/pr82942-1.c  |  3 ++-
> >  gcc/testsuite/gcc.target/i386/pr82990-1.c  |  3 ++-
> >  gcc/testsuite/gcc.target/i386/pr82990-3.c  |  3 ++-
> >  gcc/testsuite/gcc.target/i386/pr82990-5.c  |  3 ++-
> >  16 files changed, 42 insertions(+), 16 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 842eb0e6786..ec0690876b7 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -23335,9 +23335,21 @@ rtx
> >  ix86_gen_scratch_sse_rtx (machine_mode mode)
> >  {
> >if (TARGET_SSE && !lra_in_progress)
> > -return gen_rtx_REG (mode, (TARGET_64BIT
> > -  ? LAST_REX_SSE_REG
> > -  : LAST_SSE_REG));
> > +{
> > +  unsigned int regno;
> > +  if (TARGET_64BIT)
> > +   {
> > + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always
> > +use XMM31 for CSE.  */
> > + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
> > +   regno = LAST_EXT_REX_SSE_REG;
> > + else
> > +   regno = LAST_REX_SSE_REG;
> > +   }
> > +  else
> > +   regno = LAST_SSE_REG;
>
> Assuming that ix86_hard_regno_mode_ok always returns false for XMM31
> in 64bit mode, we can do:
>
> /* Use XMM31 if available to avoid vzeroupper.  */
> if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
>   regno = LAST_EXST_REX_SSE_REG;
> else if (TARGET_64BIT)
>   regno = LAST_EXT_REX_SSE_REG;
why? w/o avx512 xmm31 is not available.
> else
>   regno = LAST_SSE_REG;
>
> Uros.
>
> > +  return gen_rtx_REG (mode, regno);
> > +}
> >else
> >  return gen_reg_rtx (mode);
> >  }
> > diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c 
> > b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
> > index a31b4a2a63a..9590f25da22 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */
> > +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */
> >
> >  #include 
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c 
> > b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c
> > index 803936eef01..36dcf7367f1 100644
> > --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c
> > +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */
> > +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */
> >
> >  #include 
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c 
> > b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > index e5616d8d258..77ace86ffe8 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> > @@ -5,3 +5,4 @@
> >
> >  

Re: [PATCH] analyzer: Fix ICE on MD builtin [PR101721]

2021-08-03 Thread Richard Biener via Gcc-patches
On Tue, Aug 3, 2021 at 9:11 AM Jakub Jelinek via Gcc-patches
 wrote:
>
> Hi!
>
> The following testcase ICEs because DECL_FUNCTION_CODE asserts the builtin
> is BUILT_IN_NORMAL, but it sees a backend (MD) builtin instead.
> The FE, normal and MD builtin numbers overlap, so one should always
> check what kind of builtin it is before looking at specific codes.
>
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> trunk?

OK.

> On the other side, region-model.cc has:
>   if (fndecl_built_in_p (callee_fndecl, BUILT_IN_NORMAL)
>   && gimple_builtin_call_types_compatible_p (call, callee_fndecl))
> switch (DECL_UNCHECKED_FUNCTION_CODE (callee_fndecl))
> which IMO should use DECL_FUNCTION_CODE instead, it checked first it is
> a normal builtin...
>
> 2021-08-03  Jakub Jelinek  
>
> PR analyzer/101721
> * sm-malloc.cc (known_allocator_p): Only check DECL_FUNCTION_CODE on
> BUILT_IN_NORMAL builtins.
>
> * gcc.dg/analyzer/pr101721.c: New test.
>
> --- gcc/analyzer/sm-malloc.cc.jj2021-07-29 13:24:42.664013344 +0200
> +++ gcc/analyzer/sm-malloc.cc   2021-08-02 17:42:17.312821855 +0200
> @@ -1543,7 +1543,7 @@ known_allocator_p (const_tree fndecl, co
>
>/* ... or it is a builtin allocator that allocates objects freed with
>   __builtin_free.  */
> -  if (fndecl_built_in_p (fndecl))
> +  if (fndecl_built_in_p (fndecl, BUILT_IN_NORMAL))
>  switch (DECL_FUNCTION_CODE (fndecl))
>{
>case BUILT_IN_MALLOC:
> --- gcc/testsuite/gcc.dg/analyzer/pr101721.c.jj 2021-08-02 17:48:50.375370371 
> +0200
> +++ gcc/testsuite/gcc.dg/analyzer/pr101721.c2021-08-02 17:49:38.967696432 
> +0200
> @@ -0,0 +1,8 @@
> +/* PR analyzer/101721 */
> +/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
> +
> +void
> +foo ()
> +{
> +  __builtin_ia32_pause ();
> +}
>
> Jakub
>


Re: Re: [PATCH] Fix ICE when mixing VLAs and statement expressions [PR91038]

2021-08-03 Thread Richard Biener via Gcc-patches
On Tue, Aug 3, 2021 at 7:32 AM Martin Uecker  wrote:
>
>
>
> (resending from a different account, as emails seem to do not
> go out from my other account at this time)
>
> Am Montag, den 02.08.2021, 16:05 +0200 schrieb Martin Uecker:
> > > On Sun, Aug 1, 2021 at 7:37 PM Uecker, Martin
> > >  wrote:
> > > >
> > > >
> > > > Here is an attempt to fix some old and annoying bugs related
> > > > to VLAs and statement expressions. In particulary, this seems
> > > > to fix the issues with variably-modified types which are
> > > > returned from statement expressions (which works on clang),
> > > > but there are still bugs remaining related to structs
> > > > with VLA members (which seems to be a FE bug).
> > > >
> > > > Of course, I might be doing something stupid...
> > >
> > > How's evaluation order of (f())[g()] defined (with f returning a
> > > pointer)?
> > > Isn't that just f() + g()*sizeof(int) and thus undefined?
> >
> > Yes, in C it is
> >
> > f() + g()
> >
> > and it is unsequenced. But the order of 'f' and 'g'
> > is not relevant here and also the patch does not change
> > it (the base expression is gimplified before the index).
> >
> > Essentially, we have
> >
> > ({ ... }) + g() * sizeof(X)
> >
> > where X refers to a declaration in the statement expression.
> > Without the patch the size expressions are gimplified before
> > the base expression and also before the index expression.
> > With the patch the ({ ... }) is gimplified also before the
> > size expression.
> >
> > > If it's undefined then I think the incoming GENERIC is ill-defined.
> >
> > I think it is OK because the arguments are evaluated
> > before the operation.  Without the patch, parts of the
> > operation (the size expressions) are gimplified before
> > the arguments and this seems wrong to me.

But you said the evaluation order is undefined.  So IMHO
the GENERIC is undefined in evaluating the size of sth
that's not live?  That said, given the statement expression
result undergoes array to pointer decay doesn't this pointer
refer to an object that ended its lifetime?

"In a statement expression, any temporaries created within a statement
are destroyed at that statement's end."

That is, don't the testcases all invoke undefined behavior at runtime?

Richard.

> >
> > Martin
> >
> >
> >
> >
>


Re: [PATCH] x86: Use XMM31 for scratch SSE register

2021-08-03 Thread Uros Bizjak via Gcc-patches
On Mon, Aug 2, 2021 at 7:47 PM H.J. Lu  wrote:
>
> In 64-bit mode, use XMM31 for scratch SSE register to avoid vzeroupper
> if possible.
>
> gcc/
>
> * config/i386/i386.c (ix86_gen_scratch_sse_rtx): In 64-bit mode,
> try XMM31 to avoid vzeroupper.
>
> gcc/testsuite/
>
> * gcc.target/i386/avx-vzeroupper-14.c: Pass -mno-avx512f to
> disable XMM31.
> * gcc.target/i386/avx-vzeroupper-15.c: Likewise.
> * gcc.target/i386/pr82941-1.c: Updated.  Check for vzeroupper.
> * gcc.target/i386/pr82942-1.c: Likewise.
> * gcc.target/i386/pr82990-1.c: Likewise.
> * gcc.target/i386/pr82990-3.c: Likewise.
> * gcc.target/i386/pr82990-5.c: Likewise.
> * gcc.target/i386/pr100865-4b.c: Likewise.
> * gcc.target/i386/pr100865-6b.c: Likewise.
> * gcc.target/i386/pr100865-7b.c: Likewise.
> * gcc.target/i386/pr100865-10b.c: Likewise.
> * gcc.target/i386/pr100865-8b.c: Updated.
> * gcc.target/i386/pr100865-9b.c: Likewise.
> * gcc.target/i386/pr100865-11b.c: Likewise.
> * gcc.target/i386/pr100865-12b.c: Likewise.
> ---
>  gcc/config/i386/i386.c | 18 +++---
>  .../gcc.target/i386/avx-vzeroupper-14.c|  2 +-
>  .../gcc.target/i386/avx-vzeroupper-15.c|  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-10b.c   |  1 +
>  gcc/testsuite/gcc.target/i386/pr100865-11b.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-12b.c   |  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-4b.c|  2 ++
>  gcc/testsuite/gcc.target/i386/pr100865-6b.c|  5 -
>  gcc/testsuite/gcc.target/i386/pr100865-7b.c|  5 -
>  gcc/testsuite/gcc.target/i386/pr100865-8b.c|  2 +-
>  gcc/testsuite/gcc.target/i386/pr100865-9b.c|  2 +-
>  gcc/testsuite/gcc.target/i386/pr82941-1.c  |  3 ++-
>  gcc/testsuite/gcc.target/i386/pr82942-1.c  |  3 ++-
>  gcc/testsuite/gcc.target/i386/pr82990-1.c  |  3 ++-
>  gcc/testsuite/gcc.target/i386/pr82990-3.c  |  3 ++-
>  gcc/testsuite/gcc.target/i386/pr82990-5.c  |  3 ++-
>  16 files changed, 42 insertions(+), 16 deletions(-)
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 842eb0e6786..ec0690876b7 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -23335,9 +23335,21 @@ rtx
>  ix86_gen_scratch_sse_rtx (machine_mode mode)
>  {
>if (TARGET_SSE && !lra_in_progress)
> -return gen_rtx_REG (mode, (TARGET_64BIT
> -  ? LAST_REX_SSE_REG
> -  : LAST_SSE_REG));
> +{
> +  unsigned int regno;
> +  if (TARGET_64BIT)
> +   {
> + /* In 64-bit mode, use XMM31 to avoid vzeroupper and always
> +use XMM31 for CSE.  */
> + if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
> +   regno = LAST_EXT_REX_SSE_REG;
> + else
> +   regno = LAST_REX_SSE_REG;
> +   }
> +  else
> +   regno = LAST_SSE_REG;

Assuming that ix86_hard_regno_mode_ok always returns false for XMM31
in 64bit mode, we can do:

/* Use XMM31 if available to avoid vzeroupper.  */
if (ix86_hard_regno_mode_ok (LAST_EXT_REX_SSE_REG, mode))
  regno = LAST_EXST_REX_SSE_REG;
else if (TARGET_64BIT)
  regno = LAST_EXT_REX_SSE_REG;
else
  regno = LAST_SSE_REG;

Uros.

> +  return gen_rtx_REG (mode, regno);
> +}
>else
>  return gen_reg_rtx (mode);
>  }
> diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c 
> b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
> index a31b4a2a63a..9590f25da22 100644
> --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
> +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-14.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */
> +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */
>
>  #include 
>
> diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c 
> b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c
> index 803936eef01..36dcf7367f1 100644
> --- a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c
> +++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-15.c
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -mavx -mtune=generic -dp" } */
> +/* { dg-options "-O2 -mavx -mno-avx512f -mtune=generic -dp" } */
>
>  #include 
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-10b.c 
> b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> index e5616d8d258..77ace86ffe8 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-10b.c
> @@ -5,3 +5,4 @@
>
>  /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, 
> %ymm\[0-9\]+" 1 } } */
>  /* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%ymm\[0-9\]+, " 8 } } 
> */
> +/* { dg-final { scan-assembler-not "vzeroupper" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-11b.c 
> 

[PATCH] c: Fix ICE caused by get_parm_array_spec [PR101702]

2021-08-03 Thread Jakub Jelinek via Gcc-patches
Hi!

The following testcase ICEs, because nelts is NOP_EXPR around INTEGER_CST
- it is a VLA whose extent folds into a constant - and get_parm_array_spec
has specific INTEGER_CST handling and otherwise strips nops from nelts
and stores it into a TREE_LIST that is later asserted to be a DECL_P
or EXPR_P, where the INTEGER_CST is neither of that.

So, either we can strip nops earlier (needs moving the integral type
check first as STRIP_NOPS can alter that e.g. to pointer or from
pointer to integer) and thus handle as INTEGER_CST even the case
of INTEGER_CST wrapped into casts as this patch does, or we need
to change handle_argspec_attribute's assertion to allow INTEGER_CSTs
as well there.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
Or do you prefer to change handle_argspec_attribute ?

2021-08-03  Jakub Jelinek  

PR c/101702
* c-decl.c (get_parm_array_spec): Check for non-integral
type first, then STRIP_NOPS and only afterwards check for
INTEGER_CST.

* gcc.dg/pr101702.c: New test.

--- gcc/c/c-decl.c.jj   2021-07-15 18:50:52.0 +0200
+++ gcc/c/c-decl.c  2021-08-02 18:56:35.532045128 +0200
@@ -5842,6 +5842,11 @@ get_parm_array_spec (const struct c_parm
   if (pd->u.array.static_p)
spec += 's';
 
+  if (!INTEGRAL_TYPE_P (TREE_TYPE (nelts)))
+   /* Avoid invalid NELTS.  */
+   return attrs;
+
+  STRIP_NOPS (nelts);
   if (TREE_CODE (nelts) == INTEGER_CST)
{
  /* Skip all constant bounds except the most significant one.
@@ -5859,13 +5864,9 @@ get_parm_array_spec (const struct c_parm
  spec += buf;
  break;
}
-  else if (!INTEGRAL_TYPE_P (TREE_TYPE (nelts)))
-   /* Avoid invalid NELTS.  */
-   return attrs;
 
   /* Each variable VLA bound is represented by a dollar sign.  */
   spec += "$";
-  STRIP_NOPS (nelts);
   vbchain = tree_cons (NULL_TREE, nelts, vbchain);
 }
 
--- gcc/testsuite/gcc.dg/pr101702.c.jj  2021-08-02 18:58:24.614534975 +0200
+++ gcc/testsuite/gcc.dg/pr101702.c 2021-08-02 18:57:52.611978024 +0200
@@ -0,0 +1,11 @@
+/* PR c/101702 */
+/* { dg-do compile } */
+/* { dg-options "" } */
+
+double foo (double x[!__builtin_copysignf (~2, 3)]);
+
+double
+bar (double *x)
+{
+  return foo (x);
+}

Jakub



[PATCH] analyzer: Fix ICE on MD builtin [PR101721]

2021-08-03 Thread Jakub Jelinek via Gcc-patches
Hi!

The following testcase ICEs because DECL_FUNCTION_CODE asserts the builtin
is BUILT_IN_NORMAL, but it sees a backend (MD) builtin instead.
The FE, normal and MD builtin numbers overlap, so one should always
check what kind of builtin it is before looking at specific codes.

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?

On the other side, region-model.cc has:
  if (fndecl_built_in_p (callee_fndecl, BUILT_IN_NORMAL)
  && gimple_builtin_call_types_compatible_p (call, callee_fndecl))
switch (DECL_UNCHECKED_FUNCTION_CODE (callee_fndecl))
which IMO should use DECL_FUNCTION_CODE instead, it checked first it is
a normal builtin...

2021-08-03  Jakub Jelinek  

PR analyzer/101721
* sm-malloc.cc (known_allocator_p): Only check DECL_FUNCTION_CODE on
BUILT_IN_NORMAL builtins.

* gcc.dg/analyzer/pr101721.c: New test.

--- gcc/analyzer/sm-malloc.cc.jj2021-07-29 13:24:42.664013344 +0200
+++ gcc/analyzer/sm-malloc.cc   2021-08-02 17:42:17.312821855 +0200
@@ -1543,7 +1543,7 @@ known_allocator_p (const_tree fndecl, co
 
   /* ... or it is a builtin allocator that allocates objects freed with
  __builtin_free.  */
-  if (fndecl_built_in_p (fndecl))
+  if (fndecl_built_in_p (fndecl, BUILT_IN_NORMAL))
 switch (DECL_FUNCTION_CODE (fndecl))
   {
   case BUILT_IN_MALLOC:
--- gcc/testsuite/gcc.dg/analyzer/pr101721.c.jj 2021-08-02 17:48:50.375370371 
+0200
+++ gcc/testsuite/gcc.dg/analyzer/pr101721.c2021-08-02 17:49:38.967696432 
+0200
@@ -0,0 +1,8 @@
+/* PR analyzer/101721 */
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+
+void
+foo ()
+{
+  __builtin_ia32_pause ();
+}

Jakub



[PATCH] c++: Implement P0466R5 __cpp_lib_is_layout_compatible compiler helpers [PR101539]

2021-08-03 Thread Jakub Jelinek via Gcc-patches
Hi!

The following patch implements __is_layout_compatible trait and
__builtin_is_corresponding_member helper function for the
std::is_corresponding_member template function.
For now it implements the IMHO buggy but
standard definition of layout-compatible and std::is_layout_compatible
requirements (that Jonathan was discussing to change),
including ignoring of alignment differences, mishandling of bitfields in unions
and [[no_unique_address]] issues with empty classes.
Until we know what exactly is decided in a CWG that seems better than trying
to guess what the standard will say, but of course if you have different
ideas, the patch can change.

For __builtin_is_corresponding_member, it will sorry if corresponding members
could have different offsets (doesn't do so during constant evaluation but
unless one uses the builtin directly, even using std::is_corresponding_member
in constant expressions only will result in instantiation of the template and
the code in the template doesn't have constant arguments and so can emit sorry).
For anonymous structs (GCC extension) it will recurse into the anonymous
structs.  For anonymous unions it will emit another sorry if it can't prove such
member types can't appear in the anonymous unions or anonymous aggregates in
that union, because corresponding member is defined only using common initial
sequence which is only defined for std-layout non-union class types and so I
have no idea what to do otherwise in that case.

Bootstrapped/regtested on x86_64-linux and i686-linux.

2021-08-03  Jakub Jelinek  

PR c++/101539
gcc/c-family/
* c-common.h (enum rid): Add RID_IS_LAYOUT_COMPATIBLE.
* c-common.c (c_common_reswords): Add __is_layout_compatible.
gcc/cp/
* cp-tree.h (enum cp_trait_kind): Add CPTK_IS_LAYOUT_COMPATIBLE.
(enum cp_built_in_function): Add CP_BUILT_IN_IS_CORRESPONDING_MEMBER.
(fold_builtin_is_corresponding_member, layout_compatible_type_p):
Declare.
* parser.c (cp_parser_primary_expression): Handle
RID_IS_LAYOUT_COMPATIBLE.
(cp_parser_trait_expr): Likewise.
* cp-objcp-common.c (names_builtin_p): Likewise.
* constraint.cc (diagnose_trait_expr): Handle
CPTK_IS_LAYOUT_COMPATIBLE.
* decl.c (cxx_init_decl_processing): Register
__builtin_is_corresponding_member builtin.
* constexpr.c (cxx_eval_builtin_function_call): Handle
CP_BUILT_IN_IS_CORRESPONDING_MEMBER builtin.
* semantics.c (is_corresponding_member_union,
is_corresponding_member_aggr, fold_builtin_is_corresponding_member):
New functions.
(trait_expr_value): Handle CPTK_IS_LAYOUT_COMPATIBLE.
(finish_trait_expr): Likewise.
* typeck.c (layout_compatible_type_p): New function.
* cp-gimplify.c (cp_gimplify_expr): Fold
CP_BUILT_IN_IS_CORRESPONDING_MEMBER.
(cp_fold): Likewise.
* tree.c (builtin_valid_in_constant_expr_p): Handle
CP_BUILT_IN_IS_CORRESPONDING_MEMBER.
* cxx-pretty-print.c (pp_cxx_trait_expression): Handle
CPTK_IS_LAYOUT_COMPATIBLE.
* class.c (remove_zero_width_bit_fields): Remove.
(layout_class_type): Don't call it.
gcc/testsuite/
* g++.dg/cpp2a/is-corresponding-member1.C: New test.
* g++.dg/cpp2a/is-corresponding-member2.C: New test.
* g++.dg/cpp2a/is-corresponding-member3.C: New test.
* g++.dg/cpp2a/is-corresponding-member4.C: New test.
* g++.dg/cpp2a/is-corresponding-member5.C: New test.
* g++.dg/cpp2a/is-corresponding-member6.C: New test.
* g++.dg/cpp2a/is-corresponding-member7.C: New test.
* g++.dg/cpp2a/is-corresponding-member8.C: New test.
* g++.dg/cpp2a/is-layout-compatible1.C: New test.
* g++.dg/cpp2a/is-layout-compatible2.C: New test.
* g++.dg/cpp2a/is-layout-compatible3.C: New test.

--- gcc/c-family/c-common.h.jj  2021-07-31 18:35:23.879983218 +0200
+++ gcc/c-family/c-common.h 2021-07-31 18:37:07.038600605 +0200
@@ -173,7 +173,8 @@ enum rid
   RID_IS_ABSTRACT, RID_IS_AGGREGATE,
   RID_IS_BASE_OF,  RID_IS_CLASS,
   RID_IS_EMPTY,RID_IS_ENUM,
-  RID_IS_FINAL,RID_IS_LITERAL_TYPE,
+  RID_IS_FINAL,RID_IS_LAYOUT_COMPATIBLE,
+  RID_IS_LITERAL_TYPE,
   RID_IS_POINTER_INTERCONVERTIBLE_BASE_OF,
   RID_IS_POD,  RID_IS_POLYMORPHIC,
   RID_IS_SAME_AS,
--- gcc/c-family/c-common.c.jj  2021-07-31 09:17:09.190343988 +0200
+++ gcc/c-family/c-common.c 2021-07-31 18:35:23.881983192 +0200
@@ -420,6 +420,7 @@ const struct c_common_resword c_common_r
   { "__is_empty",  RID_IS_EMPTY,   D_CXXONLY },
   { "__is_enum",   RID_IS_ENUM,D_CXXONLY },
   { "__is_final",  RID_IS_FINAL,   D_CXXONLY },
+  { "__is_layout_compatible", RID_IS_LAYOUT_COMPATIBLE, D_CXXONLY },
   { "__is_literal_type", RID_IS_LITERAL_TYPE, D_CXXONLY },
   { "__is_pointer_interconvertible_base_of",
  

[PATCH] Add cond_add/sub/mul for vector integer modes.

2021-08-03 Thread liuhongt via Gcc-patches
Hi:
  This is a follow up of [1].
  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Pushed to trunk.
[1] https://gcc.gnu.org/pipermail/gcc-patches/2021-August/576514.html

gcc/ChangeLog:

* config/i386/sse.md (cond_): New expander.
(cond_mul): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cond_op_addsubmul_d-1.c: New test.
* gcc.target/i386/cond_op_addsubmul_d-2.c: New test.
* gcc.target/i386/cond_op_addsubmul_q-1.c: New test.
* gcc.target/i386/cond_op_addsubmul_q-2.c: New test.
* gcc.target/i386/cond_op_addsubmul_w-1.c: New test.
* gcc.target/i386/cond_op_addsubmul_w-2.c: New test.
---
 gcc/config/i386/sse.md| 88 +--
 .../gcc.target/i386/cond_op_addsubmul_d-1.c   | 32 +++
 .../gcc.target/i386/cond_op_addsubmul_d-2.c   | 76 
 .../gcc.target/i386/cond_op_addsubmul_q-1.c   |  7 ++
 .../gcc.target/i386/cond_op_addsubmul_q-2.c   |  4 +
 .../gcc.target/i386/cond_op_addsubmul_w-1.c   |  6 ++
 .../gcc.target/i386/cond_op_addsubmul_w-2.c   |  5 ++
 7 files changed, 210 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_d-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_q-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/cond_op_addsubmul_w-2.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8bf1764d3d5..52b2b4214d7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -333,6 +333,14 @@ (define_mode_iterator VI48_AVX512VL
   [V16SI (V8SI  "TARGET_AVX512VL") (V4SI  "TARGET_AVX512VL")
V8DI  (V4DI  "TARGET_AVX512VL") (V2DI  "TARGET_AVX512VL")])
 
+(define_mode_iterator VI1248_AVX512VLBW
+  [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V16QI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   (V8HI "TARGET_AVX512VL && TARGET_AVX512BW")
+   V16SI (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
+   V8DI (V4DI "TARGET_AVX512VL") (V2DI "TARGET_AVX512VL")])
+
 (define_mode_iterator VF_AVX512VL
   [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")
V8DF (V4DF "TARGET_AVX512VL") (V2DF "TARGET_AVX512VL")])
@@ -11803,6 +11811,24 @@ (define_expand "<plusminus_insn><mode>3"
   "TARGET_SSE2"
   "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
 
+(define_expand "cond_<insn><mode>"
+  [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
+   (vec_merge:VI1248_AVX512VLBW
+ (plusminus:VI1248_AVX512VLBW
+   (match_operand:VI1248_AVX512VLBW 2 "nonimmediate_operand")
+   (match_operand:VI1248_AVX512VLBW 3 "nonimmediate_operand"))
+ (match_operand:VI1248_AVX512VLBW 4 "nonimm_or_0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+  "TARGET_AVX512F"
+{
+  emit_insn (gen_<insn><mode>3_mask (operands[0],
+operands[2],
+operands[3],
+operands[4],
+operands[1]));
+  DONE;
+})
+
 (define_expand "<plusminus_insn><mode>3_mask"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -11929,6 +11955,24 @@ (define_expand "mul<mode>3"
   DONE;
 })
 
+(define_expand "cond_mul<mode>"
+  [(set (match_operand:VI2_AVX512VL 0 "register_operand")
+   (vec_merge:VI2_AVX512VL
+ (mult:VI2_AVX512VL
+   (match_operand:VI2_AVX512VL 2 "vector_operand")
+   (match_operand:VI2_AVX512VL 3 "vector_operand"))
+ (match_operand:VI2_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+  "TARGET_AVX512BW"
+{
+  emit_insn (gen_mul<mode>3_mask (operands[0],
+ operands[2],
+ operands[3],
+ operands[4],
+ operands[1]));
+  DONE;
+})
+
 (define_expand "mul<mode>3"
   [(set (match_operand:VI2_AVX2 0 "register_operand")
(mult:VI2_AVX2 (match_operand:VI2_AVX2 1 "vector_operand")
@@ -12363,6 +12407,24 @@ (define_insn "*sse2_pmaddwd"
(set_attr "prefix" "orig,vex")
(set_attr "mode" "TI")])
 
+(define_expand "cond_mul<mode>"
+  [(set (match_operand:VI8_AVX512VL 0 "register_operand")
+   (vec_merge:VI8_AVX512VL
+ (mult:VI8_AVX512VL
+   (match_operand:VI8_AVX512VL 2 "vector_operand")
+   (match_operand:VI8_AVX512VL 3 "vector_operand"))
+ (match_operand:VI8_AVX512VL 4 "nonimm_or_0_operand")
+ (match_operand:<avx512fmaskmode> 1 "register_operand")))]
+  "TARGET_AVX512DQ"
+{
+  emit_insn (gen_avx512dq_mul<mode>3_mask (operands[0],
+  operands[2],
+  operands[3],
+