[Bug tree-optimization/115120] Bad interaction between ivcanon and early break vectorization

2024-06-25 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115120

--- Comment #5 from Tamar Christina  ---
Considering ivopts bails out on doloop prediction for multiple exits anyway,
what do you think about:

diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index 5ef24a91917..d1b25ad7dea 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -1319,7 +1319,8 @@ canonicalize_loop_induction_variables (class loop *loop,

   if (create_iv
   && niter && !chrec_contains_undetermined (niter)
-  && exit && just_once_each_iteration_p (loop, exit->src))
+  && exit && just_once_each_iteration_p (loop, exit->src)
+  && (single_dom_exit (loop) || targetm.predict_doloop_p (loop)))
 {
   tree iv_niter = niter;
   if (may_be_zero)

richi?

[Bug tree-optimization/115629] New: Inefficient if-convert of masked conditionals

2024-06-24 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629

Bug ID: 115629
   Summary: Inefficient if-convert of masked conditionals
   Product: gcc
   Version: 15.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

With cases such as:

void foo1 (int *restrict a, int *restrict b, int *restrict c,
  int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
  res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

we generate:

foo1:
cmp w5, 0
ble .L1
mov x6, 0
whilelo p7.s, wzr, w5
ptrue   p3.b, all
.L3:
ld1w    z31.s, p7/z, [x0, x6, lsl 2]
cmpeq   p6.s, p7/z, z31.s, #0
cmpne   p5.s, p7/z, z31.s, #0
ld1w    z0.s, p6/z, [x2, x6, lsl 2]
ld1w    z30.s, p5/z, [x1, x6, lsl 2]
cmpne   p15.s, p3/z, z0.s, #0
orr z0.d, z31.d, z0.d
and p6.b, p15/z, p6.b, p6.b
cmpeq   p4.s, p7/z, z0.s, #0
ld1w    z28.s, p6/z, [x1, x6, lsl 2]
ld1w    z29.s, p4/z, [x3, x6, lsl 2]
sel z29.s, p15, z28.s, z29.s
sel z29.s, p5, z30.s, z29.s
st1w    z29.s, p7, [x4, x6, lsl 2]
incw    x6
whilelo p7.s, w6, w5
b.any   .L3

where b is loaded twice, once with the mask a[i] != 0 and once with the mask
a[i] == 0 && c[i] != 0.
Clearly we don't need the second compare nor the second load.  This loop can be
optimally handled as:

foo1:
cmp w5, 0
ble .L1
mov x6, 0
cntw    x7
whilelo p7.s, wzr, w5
.p2align 5,,15
.L3:
ld1w    z1.s, p7/z, [x0, x6, lsl 2]
ld1w    z0.s, p7/z, [x2, x6, lsl 2]
orr z0.d, z1.d, z0.d
cmpne   p6.s, p7/z, z0.s, #0
cmpeq   p5.s, p7/z, z0.s, #0
ld1w    z31.s, p6/z, [x1, x6, lsl 2]
ld1w    z30.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p6, z31.s, z30.s
st1w    z30.s, p7, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p7.s, w6, w5
b.any   .L3
.L1:
ret

i.e. transform a ? b : (c ? b : d) into (a || c) ? b : d.
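
For reference, the folded form at the C level would be (just an illustration
of the transform, not compiler output):

void foo1_folded (int *restrict a, int *restrict b, int *restrict c,
  int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
  res[i] = (a[i] || c[i]) ? b[i] : d[i];
}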

This transform is actually also beneficial for scalar code, but that's not the
case when one of the conditions has to be inverted, i.e. cases 2 to 4 are
beneficial for vector masked operations but not for scalar:

/* Convert a ? b : (c ? b : d) into (a || c) ? b : d.  */
void foo1 (int *restrict a, int *restrict b, int *restrict c,
  int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
  res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}

/* Convert a ? (c ? b : d) : b into (-a || c) ? b : d.  */
void foo2 (int *restrict a, int *restrict b, int *restrict c,
  int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
  res[i] = a[i] ? (c[i] ? b[i] : d[i]) : b[i];
}

/* Convert a ? (c ? d : b) : b into (-a || -c) ? b : d.  */
void foo3 (int *restrict a, int *restrict b, int *restrict c,
  int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
  res[i] = a[i] ? (c[i] ? d[i] : b[i]) : b[i];
}

/* Convert a ? b : (c ? d : b) into (a || -c) ? b : d.  */
void foo4 (int *restrict a, int *restrict b, int *restrict c,
  int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
  res[i] = a[i] ? b[i] : (c[i] ? d[i] : b[i]);
}
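
As an illustration, the folded scalar form of case 2 would be the following
(a sketch only); the logical-not on a[i] is the inversion that makes the
scalar form unprofitable, while for masked vector code the inverted predicate
is essentially free (a cmpeq instead of a cmpne):

void foo2_folded (int *restrict a, int *restrict b, int *restrict c,
  int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
  res[i] = (!a[i] || c[i]) ? b[i] : d[i];
}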

For instance, case 3 is currently vectorized as:

foo3:
cmp w5, 0
ble .L10
mov x6, 0
whilelo p7.s, wzr, w5
ptrue   p3.b, all
.L12:
ld1w    z1.s, p7/z, [x0, x6, lsl 2]
cmpeq   p5.s, p7/z, z1.s, #0
cmpne   p6.s, p7/z, z1.s, #0
ld1w    z29.s, p5/z, [x1, x6, lsl 2]
ld1w    z0.s, p6/z, [x2, x6, lsl 2]
cmpne   p15.s, p3/z, z0.s, #0
cmpeq   p4.s, p6/z, z0.s, #0
and p5.b, p15/z, p6.b, p6.b
ld1w    z30.s, p4/z, [x1, x6, lsl 2]
ld1w    z31.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p15, z31.s, z30.s
sel z30.s, p6, z30.s, z29.s
st1w    z30.s, p7, [x4, x6, lsl 2]
incw    x6
whilelo p7.s, w6, w5
b.any   .L12

but can be:

foo3:
cmp w5, 0
ble .L10
mov x6, 0
cntw    x7
whilelo p6.s, wzr, w5
ptrue   p5.b, all
.p2align 5,,15
.L12:
ld1w    z29.s, p6/z, [x0, x6, lsl 2]
ld1w    z28.s, p6/z, [x2, x6, lsl 2]
cmpeq   p15.s, p5/z, z29.s, #0
cmpeq   p14.s, p5/z, z28.s, #0
orr p15.b, p5/z, p15.b, p14.b
and p4.b, p6/z, p15.b, p15.b
bic p7.b, p5/z, p6.b, p15.b
ld1w    z31.s, p4/z, [x1, x6, lsl 2]
ld1w    z30.s, p7/z, [x3, x6, lsl 2]
sel z30.s, p4, z31.s, z30.s
  

[Bug tree-optimization/115531] vectorizer generates inefficient code for masked conditional update loops

2024-06-24 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115531

Tamar Christina  changed:

   What|Removed |Added

   Assignee|unassigned at gcc dot gnu.org  |tnfchris at gcc dot gnu.org
 Status|NEW |ASSIGNED

[Bug c++/115623] ICE: Segmentation fault in finish_for_cond with novector and almost infinite loop

2024-06-24 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115623

--- Comment #4 from Tamar Christina  ---
novect3.c: In function 'void f(char*, int)':
novect3.c:4:9: error: missing loop condition in loop with 'GCC novector' pragma
before ';' token
4 |   for (;;i++)
  | 

That should do it; will send a patch later today.

[Bug c++/115623] ICE: Segmentation fault in finish_for_cond with novector and almost infinite loop

2024-06-24 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115623

Tamar Christina  changed:

   What|Removed |Added

 Status|NEW |ASSIGNED
   Assignee|unassigned at gcc dot gnu.org  |tnfchris at gcc dot gnu.org

--- Comment #3 from Tamar Christina  ---
It looks like cp_parser_c_for is missing the handling for novector.

Mine.

[Bug tree-optimization/115120] Bad interaction between ivcanon and early break vectorization

2024-06-24 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115120

--- Comment #4 from Tamar Christina  ---
You asked why this doesn't happen with a normal vector loop, Richi.

For a normal loop, when ivcanon adds the downward counting IV there are two
main differences.

1. For a single exit loop, the downward IV is the main IV, which we ignore as
the vectorizer replaces the loop exit condition with a bound iteration check.

2.  When we peel, the main loop has a known iteration count.  So the starting
downward IV for the scalar loop is a known constant.  That means we statically
compute the start of the IV.  As such there's no data-flow for this
downwards counting IV from the main loop into the scalar loop.

i.e. in this loop:

   [local count: 1063004408]:
  # i_8 = PHI 
  # ivtmp_2 = PHI 
  res[i_8] = i_8;
  i_5 = i_8 + 1;
  ivtmp_1 = ivtmp_2 - 1;
  if (ivtmp_1 != 0)
goto ; [98.99%]
  else
goto ; [1.01%]

   [local count: 1052266995]:
  goto ; [100.00%]

when we vectorize, the final loop looks like:

   [local count: 1063004408]:
  # i_8 = PHI 
  # ivtmp_2 = PHI 
  # vect_vec_iv_.6_19 = PHI <_20(5), { 0, 1, 2, 3 }(2)>
  # vectp_res.7_21 = PHI 
  # ivtmp_24 = PHI 
  _20 = vect_vec_iv_.6_19 + { 4, 4, 4, 4 };
  MEM  [(int *)vectp_res.7_21] = vect_vec_iv_.6_19;
  i_5 = i_8 + 1;
  ivtmp_1 = ivtmp_2 - 1;
  vectp_res.7_22 = vectp_res.7_21 + 16;
  ivtmp_25 = ivtmp_24 + 1;
  if (ivtmp_25 < 271)
goto ; [98.99%]
  else
goto ; [1.01%]

   [local count: 1052266995]:
  goto ; [100.00%]

   [local count: 10737416]:
  # i_16 = PHI 
  # ivtmp_17 = PHI 

   [local count: 32212248]:
  # i_7 = PHI 
  # ivtmp_11 = PHI 
  res[i_7] = i_7;
  i_13 = i_7 + 1;
  ivtmp_14 = ivtmp_11 - 1;
  if (ivtmp_14 != 0)
goto ; [66.67%]
  else
goto ; [33.33%]

   [local count: 21474835]:
  goto ; [100.00%]

For vector code neither assumption holds any longer.

1.  The vectorizer may pick an exit other than the one with the downwards
counting IV, in particular if the early exit has a known iteration count lower
than the main exit's.

2.  Because we don't know which exit the loop takes, we can't tell how many
iterations you have to do at a minimum for the scalar loop.  We only know the
maximum.  As such the reduction into the second loop is:

   [local count: 58465242]:
  # vect_vec_iv_.6_30 = PHI 
  # vect_vec_iv_.7_35 = PHI 
  _36 = BIT_FIELD_REF ;
  ivtmp_26 = _36;
  _31 = BIT_FIELD_REF ;
  i_25 = _31;
  goto ; [100.00%]

   [local count: 214528238]:
  # i_3 = PHI 
  # ivtmp_17 = PHI 

Since we don't know the iteration count we require both IVs to be live.  The
downcounting IV is live because the scalar loop needs a starting point, and the
incrementing IV is live due to addressing mode usages.

This means neither can be removed.

In the single exit case, the downward IV is only used for loop control:

   [local count: 32212248]:
  # i_7 = PHI 
  # ivtmp_11 = PHI 
  res[i_7] = i_7;
  i_13 = i_7 + 1;
  ivtmp_14 = ivtmp_11 - 1;
  if (ivtmp_14 != 0)
goto ; [66.67%]
  else
goto ; [33.33%]

   [local count: 21474835]:
  goto ; [100.00%]

and so IVopts rewrites the addressing mode usages of `i` into

   [local count: 32212248]:
  # ivtmp.12_2 = PHI 
  _5 = (unsigned int) ivtmp.12_2;
  i_7 = (int) _5;
  MEM[(int *) + ivtmp.12_2 * 4] = i_7;
  ivtmp.12_8 = ivtmp.12_2 + 1;
  if (ivtmp.12_8 != 1087)
goto ; [66.67%]
  else
goto ; [33.33%]

   [local count: 21474835]:
  goto ; [100.00%]

and rewrites the loop back into an incrementing loop.  This also happens for
the early exit loop, which is why the scalar code doesn't have the double IVs.

But for the vector loop we have this issue due to needing the second IV live.

We might be able to rewrite the vector IVs in IVopts as you say; however, not
only does IVopts not rewrite vector IVs, it also doesn't rewrite multiple exit
loops in general.

It has two checks:

  /* Make sure that the loop iterates till the loop bound is hit, as otherwise
 the calculation of the BOUND could overflow, making the comparison
 invalid.  */
  if (!data->loop_single_exit_p)
return false;

and seems to lose a lot of information when niter_for_single_dom_exit (..) is
null.  It seems that in order for this to work correctly IVopts needs to know
which exit we've chosen in the vectorizer, i.e. I think it would have issues
with a PEELED loop.

We also have the problem where both IVs are required:

int arr[1024];
int f()
{
int i;
for (i = 0; i < 1024; i++)
  if (arr[i] == 42)
return i;
return *(arr + i);
}

but with the downward counting IV enabled, we get a much more complicated
latch.

> Note this isn't really because of IVCANON but because the IV is live.  
> IVCANON adds a downward counting IV historically to enable RTL doloop 
> transforms.

IVopts currently has:

  /* Similar to doloop_optimize, check iteration description to know it's
 suitable or not.  Keep it as simple as possible, feel free to extend it
 if you find any multiple exits cases matter.  */
  edge exit 

[Bug middle-end/115597] [15 Regression] vectorizer takes 20+ h compiling 510.parest in SPECCPU2017 since g:46bb4ce4d30ab749d40f6f4cef6f1fb7c7813452

2024-06-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115597

--- Comment #4 from Tamar Christina  ---
(In reply to Richard Biener from comment #2)
> Ah, I feared this would happen - this case seems to be because of a lot of
> VEC_PERM nodes(?) which are not handled by the CSE process as well as the
> two-operator nodes which lack SLP_TREE_SCALAR_STMTS (we'd need NULL elements
> there, something I need to add anyway).
> 
> The bst_map deals as "visited" map, but nodes not handled there would need
> a "visited" set (but as said above, the plan is to reduce that set to zero).
> 

Ah I see, that makes sense.

> I'll see to reproduce to confirm.  Usually a two-operator node shouldn't
> be too bad since the next non-two-operator one will serve as 'visited' point
> but in this graph we have several adjacent two-operator nodes without any
> intermediate node handled by the bst-map processing code.  I can't reproduce
> with -Ofast -march=znver2 though.
> 

Yeah, I forgot to mention I could only reproduce it with LTO and a recent glibc.

Thanks for the fix!

[Bug middle-end/115597] [15 Regression] vectorizer takes 20+ h compiling 510.parest in SPECCPU2017 since g:46bb4ce4d30ab749d40f6f4cef6f1fb7c7813452

2024-06-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115597

--- Comment #3 from Tamar Christina  ---
> 
> Can you check whether that fixes the issue?
> 
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 9465d94de1a..212d5f97f7d 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -6085,7 +6085,6 @@ static void
>  vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
>  {
>if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
> -  && SLP_TREE_CODE (node) != VEC_PERM_EXPR
>/* Besides some VEC_PERM_EXPR, two-operator nodes also
>  lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
>  we'd have sth that works for all internal and external nodes.  */

Yeah, that seems to do it; I can compile SPECFP again.

[Bug middle-end/115597] New: [15 Regression] vectorizer takes 20+ h compiling 510.parest in SPECCPU2017 since g:46bb4ce4d30ab749d40f6f4cef6f1fb7c7813452

2024-06-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115597

Bug ID: 115597
   Summary: [15 Regression] vectorizer takes 20+ h compiling
510.parest in SPECCPU2017 since
g:46bb4ce4d30ab749d40f6f4cef6f1fb7c7813452
   Product: gcc
   Version: 15.0
Status: UNCONFIRMED
  Keywords: compile-time-hog
  Severity: normal
  Priority: P3
 Component: middle-end
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
CC: rguenth at gcc dot gnu.org
  Target Milestone: ---
Target: aarch64*

Created attachment 58496
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=58496&action=edit
slp dump graph

Since:

commit 46bb4ce4d30ab749d40f6f4cef6f1fb7c7813452 (HEAD)
Author: Richard Biener 
Date:   Wed Jun 19 12:57:27 2024 +0200

tree-optimization/114413 - SLP CSE after permute optimization

We currently fail to re-CSE SLP nodes after optimizing permutes
which results in off cost estimates.  For gcc.dg/vect/bb-slp-32.c
this shows in not re-using the SLP node with the load and arithmetic
for both the store and the reduction.  The following implements
CSE by re-bst-mapping nodes as finalization part of vect_optimize_slp.

I've tried to make the CSE part of permute materialization but it
isn't a very good fit there.  I've not bothered to implement something
more complete, also handling external defs or defs without
SLP_TREE_SCALAR_STMTS.

I realize this might result in more BB SLP which in turn might slow
down code given costing for BB SLP is difficult (even that we now
vectorize gcc.dg/vect/bb-slp-32.c on x86_64 might be not a good idea).
This is nevertheless feeding more accurate info to costing which is
good.

PR tree-optimization/114413
* tree-vect-slp.cc (release_scalar_stmts_to_slp_tree_map):
New function, split out from ...
(vect_analyze_slp): ... here.  Call it.
(vect_cse_slp_nodes): New function.
(vect_optimize_slp): Call it.

* gcc.dg/vect/bb-slp-32.c: Expect CSE and vectorization on x86.

Compilation takes an extremely long time in 510.parest_r.

The problem seems to be that vect_cse_slp_nodes visits the same nodes twice.
It looks like the function has no visited set, and the hot loop in parest (when
vectorizable thanks to libmvec) has many TWO_OPERANDS nodes and one of them is
rooted at the top level.

vect_cse_slp_nodes seems to skip VEC_PERM_EXPR but not its children; as such
it ends up visiting the same subgraphs multiple times.  The graph in parest has
so many TWO_OPERAND nodes that essentially compilation never finishes.

I believe this function needs a visited node set.
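
As a sketch, that would amount to something like the following (hypothetical
and untested, not a submitted patch):

static void
vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node,
                    hash_set<slp_tree> &visited)
{
  /* Walk each node only once so shared subgraphs under VEC_PERM_EXPR and
     two-operator nodes aren't re-visited; hash_set::add returns true if
     the node was already present.  */
  if (visited.add (node))
    return;
  ...
}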

example backtrace:

#334 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x4132f40: 0x3df2ec0) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#335 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x41321a0: 0x3df2b90) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#336 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x4130b00: 0x3df2860) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#337 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x41348a0: 0x3df2530) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#338 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x3b8b0d0: 0x3df2310) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#339 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x41348f0: 0x3dee928) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#340 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x4134500: 0x3dee460) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#341 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x3c14600: 0x3ded690) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#342 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x3ca75f0: 0x3de7910) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#343 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x3e28590: 0x3de8768) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#344 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x3c2e4b8: 0x3de7778) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#345 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x3da5e58: 0x3de7dd8) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#346 0x018a1e14 in vect_cse_slp_nodes (bst_map=0x41627a0,
node=@0x41d0770: 0x3de7f70) at
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6111
#347 0x018a1f20 in vect_optimize_slp 

[Bug middle-end/115534] intermediate stack use not eliminated

2024-06-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115534

--- Comment #5 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #4)
> This might be improved by
> https://gcc.gnu.org/pipermail/gcc-patches/2024-June/654819.html . Or it
> might be the case the vectorizer case needs to be improved afterwards. But I
> think that is the infrastructure for fixing this issue.

Yeah, Richard pointed me to this today as well.  The vectorizer case is a bit
unique because the vectorizer has packed scalar values in two vector registers.

So yeah, I think it's likely some work will be needed afterwards, but we'll see
after the fsra patch lands :)

[Bug tree-optimization/115537] [15 Regression] vectorizable_reduction ICEs after g:d66b820f392aa9a7c34d3cddaf3d7c73bf23f82d

2024-06-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115537

--- Comment #5 from Tamar Christina  ---
Thanks for the fix!

I think the testcase needs SVE enabled to ICE, no?
Shouldn't that be -mcpu=neoverse-v1 and not -mcpu=neoverse-n1?

[Bug middle-end/115534] intermediate stack use not eliminated

2024-06-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115534

--- Comment #2 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #1)
> I suspect there is a dup of this already. See the bug which I made this one
> blocking for a list of related bugs.

Most of the other bugs relate to the argument expansions; this one, however,
regardless of the expansion itself, shouldn't need the intermediate stack.

I think there are various other ways the operation could have been kept in a
gimple register.

[Bug tree-optimization/115537] New: [15 Regression] vectorizable_reduction ICEs after g:d66b820f392aa9a7c34d3cddaf3d7c73bf23f82d

2024-06-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115537

Bug ID: 115537
   Summary: [15 Regression] vectorizable_reduction ICEs after
g:d66b820f392aa9a7c34d3cddaf3d7c73bf23f82d
   Product: gcc
   Version: 15.0
Status: UNCONFIRMED
  Keywords: ice-on-valid-code
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
CC: rguenth at gcc dot gnu.org
  Target Milestone: ---

testcase:

---
char *a;
int b;
void c() {
  int d = 0, e = 0, f;
  for (; f; ++f)
if (a[f] == 5)
  ;
else if (a[f])
  e = 1;
else
  d = 1;
  if (d)
if (e)
  b = 0;
}
---

compiled with -mcpu=neoverse-v1 -O3 produces the following ICE:

during GIMPLE pass: vect
pngrtran.i: In function 'c':
pngrtran.i:3:6: internal compiler error: in vectorizable_reduction, at
tree-vect-loop.cc:8335
3 | void c() {
  |  ^
0xff74ff vectorizable_reduction(_loop_vec_info*, _stmt_vec_info*, _slp_tree*,
_slp_instance*, vec*)
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-loop.cc:8335
0x1b598f7 vect_analyze_stmt(vec_info*, _stmt_vec_info*, bool*, _slp_tree*,
_slp_instance*, vec*)
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-stmts.cc:13353
0x10225df vect_slp_analyze_node_operations_1
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6457
0x10225df vect_slp_analyze_node_operations
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6656
0x102253f vect_slp_analyze_node_operations
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:6635
0x1023ec3 vect_slp_analyze_operations(vec_info*)
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-slp.cc:7053
0xff816f vect_analyze_loop_2
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-loop.cc:2953
0xff9fb7 vect_analyze_loop_1
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-loop.cc:3484
0xffa6f7 vect_analyze_loop(loop*, vec_info_shared*)
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vect-loop.cc:3642
0x1035547 try_vectorize_loop_1
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vectorizer.cc:1067
0x1035547 try_vectorize_loop
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vectorizer.cc:1183
0x1035a5b execute
/opt/buildAgent/work/5c94c4ced6ebfcd0/gcc/tree-vectorizer.cc:1299
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
Please include the complete backtrace with any bug report.
See  for instructions.

after:

commit d66b820f392aa9a7c34d3cddaf3d7c73bf23f82d
Author: Richard Biener 
Date:   Thu Jun 13 14:42:25 2024 +0200

Support single def-use cycle optimization for SLP reduction vectorization

We can at least mimic single def-use cycle optimization when doing
single-lane SLP reductions and that's required to avoid regressing
compared to non-SLP.

* tree-vect-loop.cc (vectorizable_reduction): Allow
single-def-use cycles with SLP.
(vect_transform_reduction): Handle SLP single def-use cycles.
(vect_transform_cycle_phi): Likewise.

* gcc.dg/vect/slp-reduc-12.c: New testcase.

 gcc/testsuite/gcc.dg/vect/slp-reduc-12.c | 18 +
 gcc/tree-vect-loop.cc| 45 +++-
 2 files changed, 45 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/slp-reduc-12.c
bisect run success

looks like it's hitting the assert:

  gcc_assert (op.code != COND_EXPR);

[Bug tree-optimization/115534] New: intermediate stack use not eliminated

2024-06-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115534

Bug ID: 115534
   Summary: intermediate stack use not eliminated
   Product: gcc
   Version: 15.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

Consider the following example:

#include <stdint.h>

typedef struct _pixel_t
{
  double red, green, blue, opacity;
} pixel_t;

typedef struct _PixelPacket
{
  unsigned short blue, green, red, opacity;
} PixelPacket;

pixel_t f (unsigned height, unsigned width, unsigned virt_width,
   uint8_t *restrict k, const PixelPacket *restrict k_pixels)
{
pixel_t result = {};
for (unsigned u=0; u < (width & -4); u++, k--) {
result.red += (*k)*k_pixels[u].red;
result.green   += (*k)*k_pixels[u].green;
result.blue+= (*k)*k_pixels[u].blue;
result.opacity += (*k)*k_pixels[u].opacity;
k_pixels += virt_width;
}
return result;
}

---

compiled with -O3 vectorizes well, but the epilogue code is very
inefficient:

fadd    v29.2d, v29.2d, v30.2d
fadd    v28.2d, v28.2d, v31.2d
cmp w5, w1
bhi .L3
mov v31.16b, v28.16b
ins v31.d[1], v29.d[1]
ins v29.d[1], v28.d[1]
stp q31, q29, [sp, 32]
ldp d0, d1, [sp, 32]
ldp d2, d3, [sp, 48]
add sp, sp, 64
ret
.L4:
movi    v29.2d, 0
mov v31.16b, v29.16b
stp q31, q29, [sp, 32]
ldp d0, d1, [sp, 32]
ldp d2, d3, [sp, 48]
add sp, sp, 64
ret

as in, it goes through the stack to create the return registers.  It looks
like at the gimple level we still have the store:

   [local count: 105119324]:
  _33 = VEC_PERM_EXPR ;
  _31 = VEC_PERM_EXPR ;

   [local count: 118111600]:
  # vect_result_red_64.18_28 = PHI <_33(5), { 0.0, 0.0 }(2)>
  # vect_result_red_64.18_105 = PHI <_31(5), { 0.0, 0.0 }(2)>
  MEM  [(double *)] = vect_result_red_64.18_28;
  MEM  [(double *) + 16B] = vect_result_red_64.18_105;
  return D.4535;

clang is able to generate much better code here:

fadd    v0.2d, v0.2d, v1.2d
fadd    v2.2d, v2.2d, v3.2d
b.ne.LBB0_2
.LBB0_3:
mov d1, v2.d[1]
mov d3, v0.d[1]
ret

The vectorized code gets reg-alloc'ed so that d0 and d2 are already in the right
registers at the end of the vector loop, and the epilogue only has to split the
registers up to get d1 and d3.

I think we would generate the same if we were to elide the intermediate stack
store.

See https://godbolt.org/z/ocqchWWs5

[Bug tree-optimization/115531] vectorizer generates inefficient code for masked conditional update loops

2024-06-17 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115531

--- Comment #3 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #1)
> I suspect PR 20999 would fix this ...
> but we have to be careful since without masked stores, you could still
> vectorize this unlike the transformed version.
> 
> Maybe ifcvt can produce a masked store version if this pattern ...

Doing so during ifcvt forces you to commit to a masked operation.  So you lose
the ability to not vectorize for non-fully-masked architectures.

So it's too early.  A vector pattern doesn't have this problem.  This question
was mostly about to what degree the vectorizer has support for MASK_STORE as an
input.  vect_get_vector_types_for_stmt seems to have support for it, so it looks
like it may work.

[Bug tree-optimization/115531] New: vectorizer generates inefficient code for masked conditional update loops

2024-06-17 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115531

Bug ID: 115531
   Summary: vectorizer generates inefficient code for masked
conditional update loops
   Product: gcc
   Version: 15.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

The following code:

void __attribute__((noipa))
foo (char *restrict a, int *restrict b, int *restrict c, int n, int stride)
{
  if (stride <= 1)
return;

  for (int i = 0; i < n; i++)
{
  int res = c[i];
  int t = b[i+stride];
  if (a[i] != 0)
res = t;
  c[i] = res;
}
}

generates at -O3 -g0 -mcpu=generic+sve:

.L3:
ld1b    z29.s, p7/z, [x0, x5]
ld1w    z31.s, p7/z, [x2, x5, lsl 2]
ld1w    z30.s, p7/z, [x1, x5, lsl 2]
cmpne   p15.b, p6/z, z29.b, #0
sel z30.s, p15, z30.s, z31.s
st1w    z30.s, p7, [x2, x5, lsl 2]
add x5, x5, x4
whilelo p7.s, w5, w3
b.any   .L3
.L1:

and makes vectorization unprofitable until very high iteration counts of n.
This is because the vector code has more instructions than needed.

Since it's a masked store, whenever a value is being conditionally set we don't
need the intermediate VEC_COND_EXPR.  This loop can be vectorized as:

.L3:
ld1b    z29.s, p7/z, [x0, x5]
ld1w    z31.s, p7/z, [x2, x5, lsl 2]
cmpne   p4.b, p6/z, z29.b, #0
st1w    z31.s, p4, [x2, x5, lsl 2]
add x5, x5, x4
whilelo p7.s, w5, w3
b.any   .L3
.L1:
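
At the GIMPLE level the idea is roughly the following (a pseudo-GIMPLE sketch,
not actual vectorizer output): fold the a[i] != 0 predicate into the load/store
masks so both the VEC_COND_EXPR and the load of c[i] disappear.

  /* current: */
  vec_t   = .MASK_LOAD (&b[i + stride], align, loop_mask);
  vec_old = .MASK_LOAD (&c[i], align, loop_mask);
  vec_res = VEC_COND_EXPR <cond, vec_t, vec_old>;
  .MASK_STORE (&c[i], align, loop_mask, vec_res);

  /* desired: */
  new_mask = cond & loop_mask;
  vec_t = .MASK_LOAD (&b[i + stride], align, new_mask);
  .MASK_STORE (&c[i], align, new_mask, vec_t);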

I have currently prototyped a load-to-store forwarding optimization in forwprop,
but am looking to move it into the vectorizer to cost it properly; however, I'm
not entirely sure what the best way to do so is.

I can certainly fix it up during codegen, but to cost it I need to do so during
analysis.  I could detect it during vectorizable_condition, but then the dead
load is still costed.  Or I could maybe use a pattern, but I'm unsure how to
represent the mask in the load.

Is it valid to produce a pattern with .IFN_MASK_STORE?

[Bug target/115464] [14 Backport] ICE when building libaom on arm64 (neon sve bridge usage with tbl/perm)

2024-06-13 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115464

--- Comment #10 from Tamar Christina  ---
Thanks for the fix, but I don't think it's sufficient.

What I meant by the earlier comment was that the subregs are broken in
general, not just the one generated by the undef fast path.

i.e.

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

svuint16_t
convolve4_4_x (uint16x8x2_t permute_tbl, svuint16_t a)
{
return svset_neonq_u16 (a, permute_tbl.val[1]);
}

seems to still ICE for me because it goes into the general expander which
produces the same subreg.

[Bug target/115464] ICE when building libaom on arm64 (neon sve bridge usage with tbl/perm)

2024-06-12 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115464

--- Comment #7 from Tamar Christina  ---
(In reply to Tamar Christina from comment #6)
> (In reply to Richard Sandiford from comment #5)
> > In this kind of situation, we should go through a fresh pseudo rather than
> > try to take the subreg directly.
> 
> I did try that but fwprop pushed it back in.

Ahh no, I used force_reg, doh.. Fair.

[Bug target/115464] ICE when building libaom on arm64 (neon sve bridge usage with tbl/perm)

2024-06-12 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115464

--- Comment #6 from Tamar Christina  ---
(In reply to Richard Sandiford from comment #5)
> In this kind of situation, we should go through a fresh pseudo rather than
> try to take the subreg directly.

I did try that but fwprop pushed it back in.

[Bug target/115464] ICE when building libaom on arm64 (neon sve bridge usage with tbl/perm)

2024-06-12 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115464

Tamar Christina  changed:

   What|Removed |Added

 CC||rsandifo at gcc dot gnu.org

--- Comment #4 from Tamar Christina  ---
Looks like the Neon lowpart optimization doesn't take into account aggregate
vectors.

That means the simplest reproduction is:

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

svuint16_t
convolve4_4_x (uint16x8x2_t permute_tbl)
{
return svset_neonq_u16 (svundef_u16 (), permute_tbl.val[1]);
}

This generates a subreg from E_V2x8HImode to E_VNx8HImode.

This subreg is an invalid paradoxical subreg as there's no strict ordered
relationship between the modes.

This fails because ordered_p does not have a relationship defined between a
poly vector and an aggregate non-poly vector.

I think one should probably be provided; essentially the NEON<->SVE bridge is
broken for aggregate types in general at the moment.  I don't know the exact
semantics for poly-ints.

I tried patching ordered_p but the ICE just moves.  Any thoughts, Richard?

[Bug target/115464] ICE when building libaom on arm64 (neon sve bridge usage with tbl/perm)

2024-06-12 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115464

Tamar Christina  changed:

   What|Removed |Added

   Last reconfirmed||2024-06-12
 CC||tnfchris at gcc dot gnu.org
 Status|UNCONFIRMED |NEW
 Ever confirmed|0   |1

--- Comment #3 from Tamar Christina  ---
Confirmed, slightly more cleaned up example:

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

int16x8_t
convolve4_4_x (uint16x8x2_t permute_tbl, svint16_t res)
{
return svget_neonq_s16 (
svtbl_s16 (res,
   svset_neonq_u16 (svundef_u16 (), permute_tbl.val[0])));
}

[Bug tree-optimization/114932] IVopts inefficient handling of signed IV used for addressing.

2024-06-06 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #15 from Tamar Christina  ---
(In reply to rguent...@suse.de from comment #14)
> On Thu, 6 Jun 2024, tnfchris at gcc dot gnu.org wrote:
> 
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932
> > 
> > --- Comment #13 from Tamar Christina  ---
> > (In reply to rguent...@suse.de from comment #12) 
> > > > since we don't care about overflow here, it looks like the stripping 
> > > > should
> > > > be recursive as long as it's a NOP expression between two integral 
> > > > types.
> > > > 
> > > > That would get them to hash to the same IV expression.  Trying now..
> > > 
> > > Note tree-affine is a tool that's used for this kind of "weak" equalities.
> > > Convert both to affine, subtract them and if that's zero they are equal.
> > 
> > Hmm that's useful, though in this case this becomes the actual expression 
> > that
> > IVOpts uses.
> > 
> > For instance this is done in alloc_iv and add_iv_candidate_for_use when
> > determining the uses for the IV.
> > 
> > It looks like it's trying to force a canonical representation with as 
> > minimum
> > casting as possible.
> > 
> > would the "affine"'ed tree be safe to use for this context?
> 
> No, I'd use that just for the comparison.
> 
> > What I've done currently is make a STRIP_ALL_NOPS that recursively strips 
> > NOPs
> > for PLUS/MULT/MINUS.
> 
> But the stripped expression isn't necessarily valid to use either because
> of possible undefined overflow.  It's probably safe to pick any of the
> existing expressions (all are evaluated at each iteration), but if you
> strip all NOPs from all of them you might end up with new undefined
> behavior.
> 
> At least if that stripped expression is inserted somewhere or other
> new expressions are built upon it.

Does overflow matter for addressing modes though?  If you have undefined
behavior in your address space then your program would have crashed anyway, no?

In this case IVOpts would have already stripped away the outer NOPs, so building
upon this one could also cause undefined overflow, can it not?  I.e. if the IV
was
((signed) unsigned_calculation).

[Bug tree-optimization/114932] IVopts inefficient handling of signed IV used for addressing.

2024-06-06 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #13 from Tamar Christina  ---
(In reply to rguent...@suse.de from comment #12) 
> > since we don't care about overflow here, it looks like the stripping should
> > be recursive as long as it's a NOP expression between two integral types.
> > 
> > That would get them to hash to the same IV expression.  Trying now..
> 
> Note tree-affine is a tool that's used for this kind of "weak" equalities.
> Convert both to affine, subtract them and if that's zero they are equal.

Hmm that's useful, though in this case this becomes the actual expression that
IVOpts uses.

For instance this is done in alloc_iv and add_iv_candidate_for_use when
determining the uses for the IV.

It looks like it's trying to force a canonical representation with as little
casting as possible.

would the "affine"'ed tree be safe to use for this context?

What I've done currently is make a STRIP_ALL_NOPS that recursively strips NOPs
for PLUS/MULT/MINUS.
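
A rough sketch of the idea (hypothetical, not the actual patch;
tree_strip_nop_conversions and operand_equal_p are existing helpers), used only
for comparison/hashing so no new trees with mixed signedness are built:

static bool
equal_mod_nops_p (tree a, tree b)
{
  a = tree_strip_nop_conversions (a);
  b = tree_strip_nop_conversions (b);
  if (TREE_CODE (a) == TREE_CODE (b))
    switch (TREE_CODE (a))
      {
      case PLUS_EXPR:
      case MINUS_EXPR:
      case MULT_EXPR:
        /* Recurse through the arithmetic so (unsigned long)(x * 4)
           and ((unsigned long) x) * 4 compare equal.  */
        return (equal_mod_nops_p (TREE_OPERAND (a, 0), TREE_OPERAND (b, 0))
                && equal_mod_nops_p (TREE_OPERAND (a, 1), TREE_OPERAND (b, 1)));
      default:
        break;
      }
  return operand_equal_p (a, b, 0);
}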

[Bug tree-optimization/114932] IVopts inefficient handling of signed IV used for addressing.

2024-06-05 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #11 from Tamar Christina  ---
(In reply to Richard Biener from comment #10)
> I think the question is why IVOPTs ends up using both the signed and
> unsigned variant of the same IV instead of expressing all uses of both with
> one IV?
> 
> That's where I'd look into.

It looks like this is because of a subtle difference in the expressions.

In get_loop_invariant_expr IVOPTs first tries to strip away all casts with
STRIP_NOPS.

The first expression is (unsigned long) (stride.3_27 * 4) and the second
expression is ((unsigned long) stride.3_27) * 4 (The pretty printing here is
pretty bad...)

So the first one becomes:
  (unsigned long) (stride.3_27 * 4) -> stride.3_27 * 4

and second one:
  ((unsigned long) stride.3_27) * 4 -> ((unsigned long) stride.3_27) * 4

Since we don't care about overflow here, it looks like the stripping should
be recursive as long as it's a NOP expression between two integral types.

That would get them to hash to the same IV expression.  Trying now...

[Bug tree-optimization/54013] Loop with control flow not vectorized

2024-06-05 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54013

Tamar Christina  changed:

   What|Removed |Added

 Blocks||115130

--- Comment #4 from Tamar Christina  ---
Since there's only one source here, alignment peeling should be enough to
vectorize it.

Our pending patches should support it.  Will add it to the verify list.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115130
[Bug 115130] [meta-bug] early break vectorization

[Bug tree-optimization/114932] IVopts inefficient handling of signed IV used for addressing.

2024-06-05 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #9 from Tamar Christina  ---
It's taken me a bit of time to track down all the reasons for the speedup with
the earlier patch.

This comes from two parts:

1. Signed IVs don't get simplified.  Due to possible UB with signed overflows
gimple expressions don't get simplified when the type is signed.

However for addressing modes it doesn't matter, as after simplifying the
constants any potential overflow can still happen.  Secondly, most
architectures say you can never reach the full address space range anyway.
Those that do (like those that offer baremetal variants, such as Arm and
AArch64) explicitly specify that overflow is defined as wrapping around.  That
means that IVs, for their use in IVopts, should be safe to simplify as if they
were unsigned.

I have a patch that during the creation of IV candidates folds them to unsigned
and then folds them back to their original signed types.  This maintains all
the original overflow analysis and the correct typing in gimple.
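
Roughly, the idea is (an illustration, not the patch itself):

  /* A signed IV expression like
       base + step * i        -- signed, so the folders refuse to touch it
     is simplified as if it were
       (signed) ((unsigned) base + (unsigned) step * (unsigned) i)
     and then cast back to the original signed type, so the gimple stays
     correctly typed and the overflow analysis is unchanged.  */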

2. The second problem is that due to Fortran not having unsigned types, the
front-end generates a signed IV.  Some optimizations can, as they work, convert
these to unsigned due to folding; e.g. extract_muldiv is one place where this
is done.

This can make us end up having the same IV as both signed and unsigned, as is
the case here:

  inv_expr 1:   stride.3_27 * 4
  inv_expr 2:   (unsigned long) stride.3_27 * 4

These end up being used in the same group:

Group 1:
  cand  cost    compl.  inv.expr.       inv.vars
  1     0       0       NIL;            6
  2     0       0       NIL;            6
  3     0       0       NIL;            6
  4     0       0       NIL;            6

which ends up with IVopts picking both the signed and unsigned IVs:

Improved to:
  cost: 24 (complexity 3)
  reg_cost: 9
  cand_cost: 15
  cand_group_cost: 0 (complexity 3)
  candidates: 1, 6, 8
   group:0 --> iv_cand:6, cost=(0,1)
   group:1 --> iv_cand:1, cost=(0,0)
   group:2 --> iv_cand:8, cost=(0,1)
   group:3 --> iv_cand:8, cost=(0,1)
  invariant variables: 6
  invariant expressions: 1, 2

and so generates the same IV as both signed and unsigned:

;;   basic block 21, loop depth 3, count 214748368 (estimated locally, freq 58.2545), maybe hot
;;    prev block 28, next block 31, flags: (NEW, REACHABLE, VISITED)
;;    pred:       28 [always]  count:23622320 (estimated locally, freq 6.4080) (FALLTHRU,EXECUTABLE)
;;                25 [always]  count:191126046 (estimated locally, freq 51.8465) (FALLTHRU,DFS_BACK,EXECUTABLE)
  # .MEM_66 = PHI <.MEM_34(28), .MEM_22(25)>
  # ivtmp.22_41 = PHI <0(28), ivtmp.22_82(25)>
  # ivtmp.26_51 = PHI 
  # ivtmp.28_90 = PHI 

...

;;   basic block 24, loop depth 3, count 214748366 (estimated locally, freq 58.2545), maybe hot
;;    prev block 22, 

[Bug target/114860] [14/15 regression] [aarch64] 511.povray regresses by ~5.5% with -O3 -flto -march=native -mcpu=neoverse-v2 since r14-10014-ga2f4be3dae04fa

2024-05-22 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114860

--- Comment #9 from Tamar Christina  ---
(In reply to prathamesh3492 from comment #8)
> Hi Tamar,
> Using -falign-loops=5 indeed brings back the performance.
> The adrp instruction has same address (0x4ae784) by setting -falign-loops=5
> (which reduces misalignment to 4) with/without a2f4be3dae0. So I guess this
> is really code-alignment issue ?
> 

Indeed, we don't aggressively align loops if they require too much padding, so
as not to bloat the binaries.  That's why sometimes you just get unlucky and
the hot loop gets misaligned.

[Bug tree-optimization/115130] (early-break) [meta-bug] early break vectorization

2024-05-17 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115130

Tamar Christina  changed:

   What|Removed |Added

 Ever confirmed|0   |1
   Last reconfirmed||2024-05-17
 Status|UNCONFIRMED |NEW

[Bug tree-optimization/115130] New: (early-break) [meta-bug] early break vectorization

2024-05-17 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115130

Bug ID: 115130
   Summary: (early-break) [meta-bug] early break vectorization
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: meta-bug, missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
Blocks: 53947
  Target Milestone: ---

Meta ticket about early break vectorization, to better keep track of
early-break-specific issues.


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

[Bug tree-optimization/115120] Bad interaction between ivcanon and early break vectorization

2024-05-17 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115120

--- Comment #3 from Tamar Christina  ---
That makes sense, though I also wonder how it works for scalar multi-exit
loops; IVopts has various checks on single exits.

I guess one problem is that the code in IVopts that does this uses the exit to
determine niters.

But in the case of the multiple-exit vector code the vectorizer could have
picked a different exit.

So I guess the question is how we even tell which one is used, or whether the
transformation could be driven from the PHI nodes themselves instead of an exit.

[Bug target/114860] [14/15 regression] [aarch64] 511.povray regresses by ~5.5% with -O3 -flto -march=native -mcpu=neoverse-v2 since r14-10014-ga2f4be3dae04fa

2024-05-16 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114860

--- Comment #7 from Tamar Christina  ---
Yeah, it's most likely an alignment issue, especially as there are no code
changes.

We run our benchmarking with different flags, so that may be why we don't see it.
The loop seems misaligned; you can try increasing the alignment guarantee to
check, e.g. -falign-loops=5.

But ultimately, I think it's just bad luck.  We don't align loops and labels if
they require too many padding instructions.

[Bug target/114412] [14/15 Regression] 7% slowdown of 436.cactusADM on aarch64

2024-05-16 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114412

--- Comment #5 from Tamar Christina  ---
(In reply to Filip Kastl from comment #4)
> (In reply to Tamar Christina from comment #3)
> > Hi Filip,
> > 
> > Do you generate these runs with counters based PGO or compiler
> > instrumentation?
> > 
> > Just so I know before I start trying to reproduce them.
> 
> Hi Tamar,
> 
> By counters you mean some sort of hardware counters? I didn't know there
> were multiple ways to do PGO with GCC.
> 
> I think that the answer to your question is "compiler instrumentation". I
> just do -fprofile-generate, run the instrumented binary and then
> -fprofile-use.

Yeah, with some elbow grease the perf record method works too, but it's not
very accurate on Armv8.

I'll try to reproduce and bisect these over the weekend!

[Bug target/115087] New: dead block not eliminated in SVE intrinsics code

2024-05-14 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115087

Bug ID: 115087
   Summary: dead block not eliminated in SVE intrinsics code
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

The testcase in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114151 had another
"regression" in that the same loop seems to have been peeled but only for 1
iteration.

However the loops are identical so the peeling is weird.

This was caused by

f5fb9ff2396fd41fdd2e6d35a412e936d2d42f75 is the first bad commit
commit f5fb9ff2396fd41fdd2e6d35a412e936d2d42f75
Author: Jan Hubicka 
Date:   Fri Jul 28 16:18:32 2023 +0200

loop-split improvements, part 3

extend tree-ssa-loop-split to understand test of the form
 if (i==0)
and
 if (i!=0)
which triggers only during the first iteration.  Naturally we should
also be able to trigger last iteration or split into 3 cases if
the test indeed can fire in the middle of the loop.

However the commit is innocent; it looks like we're not below a magic
threshold that causes the issue.

However a simpler testcase:

#include <arm_sve.h>

void test(int size, char uplo, float16_t *p_mat)
{
  int col_stride = uplo == 'u' ? 1 : size;
  auto *a = &p_mat[0];
  auto pg = svptrue_b16();
  for (int j = 0; j < size; ++j) {
    auto *a_j = &a[j];
if (j > 0) {
  int col_i = j + 1;
      auto v_a_ji_0 = svld1_vnum_f16(pg, (const float16_t *)&a_j[col_i], 0);
  v_a_ji_0 = svcmla_f16_x(pg, v_a_ji_0, v_a_ji_0, v_a_ji_0, 180);
}

int col_i = j * col_stride;
    auto v_a_ji_0 = svld1_vnum_f16(pg, (const float16_t *)&a_j[col_i], 0);
    auto v_old_a_jj_0 = svld1_vnum_f16(pg, (const float16_t *)&a_j[j], 0);
v_a_ji_0 = svmul_f16_x(pg, v_old_a_jj_0, v_a_ji_0);

    svst1_vnum_f16(pg, (float16_t *)&a_j[col_i], 0, v_a_ji_0);
  }
}

shows that the change in the patch is a positive one.

The issue seems to be that GCC does not see the if block as dead code:

if (j > 0) {
  int col_i = j + 1;
      auto v_a_ji_0 = svld1_vnum_f16(pg, (const float16_t *)&a_j[col_i], 0);
  v_a_ji_0 = svcmla_f16_x(pg, v_a_ji_0, v_a_ji_0, v_a_ji_0, 180);
}

is dead because v_a_ji_0 is overwritten before use.

  _29 = MEM <__SVFloat16_t> [(__fp16 *)_88 + ivtmp.10_52 * 2];
  svcmla_f16_x ({ -1, 0, ... }, _29, _29, _29, 180);

_29 is dead, but I guess it's not eliminated because GCC doesn't know what
svcmla_f16_x does.  But are these intrinsics not marked as CONST|PURE?

We finally eliminate it at the RTL level, but I think we should mark these
intrinsics as ECF_CONST.
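
For comparison, plain C already gets this with the const attribute (a sketch
with a hypothetical function, not the SVE intrinsic machinery itself):

__attribute__((const)) float dummy_cmla (float, float, float);

void g (float x)
{
  /* The result is unused and the function is declared const (no side
     effects, no memory reads), so DCE can delete the call entirely;
     marking svcmla_f16_x as ECF_CONST would allow the same at gimple.  */
  dummy_cmla (x, x, x);
}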

[Bug target/114412] [14/15 Regression] 7% slowdown of 436.cactusADM on aarch64

2024-05-13 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114412

Tamar Christina  changed:

   What|Removed |Added

 CC||tnfchris at gcc dot gnu.org

--- Comment #3 from Tamar Christina  ---
Hi Filip,

Do you generate these runs with counters based PGO or compiler instrumentation?

Just so I know before I start trying to reproduce them.

[Bug tree-optimization/114932] Improvement in CHREC can give large performance gains

2024-05-13 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

Tamar Christina  changed:

   What|Removed |Added

 Ever confirmed|0   |1
 Status|UNCONFIRMED |ASSIGNED
   Last reconfirmed||2024-05-13
   Assignee|unassigned at gcc dot gnu.org  |tnfchris at gcc dot gnu.org

--- Comment #8 from Tamar Christina  ---
(In reply to Richard Biener from comment #7)
> Likely
> 
>   Base: (integer(kind=4) *)  + ((sizetype) ((unsigned long) l0_19(D) *
> 324) + 36)
> 
> vs.
> 
>   Base: (integer(kind=4) *)  + ((sizetype) ((integer(kind=8)) l0_19(D)
> * 81) + 9) * 4
> 
> where we fail to optimize the outer multiply.  It's
> 
>  ((unsigned)((signed)x * 81) + 9) * 4
> 
> and likely done by extract_muldiv for the case of (unsigned)x.  The trick
> would be to promote the inner multiply to unsigned to make the otherwise
> profitable transform valid.  But best not by enhancing extract_muldiv ...

Ah, merci!

Mine then.

[Bug tree-optimization/114932] Improvement in CHREC can give large performance gains

2024-05-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #6 from Tamar Christina  ---
Created attachment 58096
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=58096&action=edit
exchange2.fppized-bad.f90.187t.ivopts

[Bug tree-optimization/114932] Improvement in CHREC can give large performance gains

2024-05-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #5 from Tamar Christina  ---
Created attachment 58095
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=58095&action=edit
exchange2.fppized-good.f90.187t.ivopts

[Bug tree-optimization/114932] Improvement in CHREC can give large performance gains

2024-05-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #4 from Tamar Christina  ---
reduced more:

---
  module brute_force
integer, parameter :: r=9
 integer  block(r, r, 0)
contains
  subroutine brute
 do
  do
  do
   do
do
 do
 do i7 = l0, 1
   select case(1 )
   case(1)
   block(:2, 7:, 1) = block(:2, 7:, i7) - 1
   end select
do i8 = 1, 1
   do i9 = 1, 1
if(1 == 1) then
call digits_20
end if
end do
  end do
end do
end do
  end do
  end do
   end do
 end do
  end do
 end
  end
---

I'll have to stop now till I'm back, but the main difference seems to be in:

good:

:
IV struct:
  SSA_NAME: _1
  Type: integer(kind=8)
  Base: (integer(kind=8)) ((unsigned long) l0_19(D) * 81)
  Step: 81
  Biv:  N
  Overflowness wrto loop niter: Overflow
IV struct:
  SSA_NAME: _20
  Type: integer(kind=8)
  Base: (integer(kind=8)) l0_19(D)
  Step: 1
  Biv:  N
  Overflowness wrto loop niter: No-overflow
IV struct:
  SSA_NAME: i7_28
  Type: integer(kind=4)
  Base: l0_19(D) + 1
  Step: 1
  Biv:  Y
  Overflowness wrto loop niter: No-overflow
IV struct:
  SSA_NAME: vectp.22_46
  Type: integer(kind=4) *
  Base: (integer(kind=4) *)  + ((sizetype) ((unsigned long) l0_19(D) *
324) + 36)
  Step: 324
  Object:   (void *) 
  Biv:  N
  Overflowness wrto loop niter: No-overflow

bad:

:
IV struct:
  SSA_NAME: _1
  Type: integer(kind=8)
  Base: (integer(kind=8)) l0_19(D) * 81
  Step: 81
  Biv:  N
  Overflowness wrto loop niter: No-overflow
IV struct:
  SSA_NAME: _20
  Type: integer(kind=8)
  Base: (integer(kind=8)) l0_19(D)
  Step: 1
  Biv:  N
  Overflowness wrto loop niter: No-overflow
IV struct:
  SSA_NAME: i7_28
  Type: integer(kind=4)
  Base: l0_19(D) + 1
  Step: 1
  Biv:  Y
  Overflowness wrto loop niter: No-overflow
IV struct:
  SSA_NAME: vectp.22_46
  Type: integer(kind=4) *
  Base: (integer(kind=4) *)  + ((sizetype) ((integer(kind=8)) l0_19(D) *
81) + 9) * 4
  Step: 324
  Object:   (void *) 
  Biv:  N
  Overflowness wrto loop niter: No-overflow

[Bug tree-optimization/114932] Improvement in CHREC can give large performance gains

2024-05-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

--- Comment #3 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #2)
> > which is harder for prefetchers to follow.
> 
> This seems like a limitation in the HW prefetcher rather than anything else.
> Maybe the cost model for addressing mode should punish base+index if so.
> Many HW prefetchers I know of are based on the final VA (or even PA) rather
> looking at the instruction to see if it increments or not ...

That was the first thing we tried, and even increasing the cost of
register_offset to something ridiculously high doesn't change a thing.

IVopts thinks it needs to use it and generates:

  _1150 = (voidD.26 *) _1148;
  _1152 = (sizetype) l0_78(D);
  _1154 = _1152 * 324;
  _1156 = _1154 + 216;
  # VUSE <.MEM_421>
  vect__349.614_1418 = MEM  [(integer(kind=4)D.9 *)_1150 + _1156 * 1 clique 2 base 0];

Hence the bug report to see what's going on.

[Bug tree-optimization/114932] New: Improvement in CHREC can give large performance gains

2024-05-02 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114932

Bug ID: 114932
   Summary: Improvement in CHREC can give large performance gains
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
CC: rguenth at gcc dot gnu.org
  Target Milestone: ---

With the original fix from PR114074 applied (e.g.
g:a0b1798042d033fd2cc2c806afbb77875dd2909b) we not only saw regressions but saw
big improvements.

The following testcase:

---
  module brute_force
integer, parameter :: r=9
 integer  block(r, r, r)
contains
  subroutine brute
  k = 1
call digits_2(k)
  end
   recursive subroutine digits_2(row)
  integer, intent(in) :: row
  logical OK
 do i1 = 0, 1
  do i2 = 1, 1
  do i3 = 1, 1
   do i4 = 0, 1
do i5 = 1, select
 do i6 = 0, 1
 do i7 = l0, u0
   select case(1 )
   case(1)
   block(:2, 7:, i7) = block(:2, 7:, i7) - 1
   end select
do i8 = 1, 1
   do i9 = 1, 1
if(row == 5) then
  elseif(OK)then
call digits_2(row + 1)
end if
end do
  end do
   block(:, 1, i7) =   select
end do
end do
  end do
  end do
   end do
block = 1
 end do
 block = 1
 block = block0 + select
  end do
 end
  end
---

compiled with: -mcpu=neoverse-v1 -Ofast -fomit-frame-pointer foo.f90

gets vectorized after sra and constprop.  But the final addressing modes are so
complicated that IVopts generates a register offset mode:

4c:   2f00041d        mvni    v29.2s, #0x0
50:   fc666842        ldr     d2, [x2, x6]
54:   fc656841        ldr     d1, [x2, x5]
58:   fc646840        ldr     d0, [x2, x4]
5c:   0ebd8442        add     v2.2s, v2.2s, v29.2s
60:   0ebd8421        add     v1.2s, v1.2s, v29.2s
64:   0ebd8400        add     v0.2s, v0.2s, v29.2s

which is harder for prefetchers to follow.  When the patch was applied it was
able to correctly lower these to the immediate offset loads that the scalar
code was using:

38:   2f00041d        mvni    v29.2s, #0x0
34:   fc594002        ldur    d2, [x0, #-108]
40:   fc5b8001        ldur    d1, [x0, #-72]
44:   fc5dc000        ldur    d0, [x0, #-36]
48:   0ebd8442        add     v2.2s, v2.2s, v29.2s
4c:   0ebd8421        add     v1.2s, v1.2s, v29.2s
50:   0ebd8400        add     v0.2s, v0.2s, v29.2s

and also removes all the additional instructions needed to keep x6, x5 and x4
up to date.

This gave 10%+ improvements on various workloads.

(PS: I'm looking at the __brute_force_MOD_digits_2.constprop.3.isra.0
specialization.)

I will try to reduce it more, but am filing this so we can keep track and
hopefully fix it.

[Bug ipa/92538] Proposal for IPA init() constant propagation

2024-05-02 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92538

Tamar Christina  changed:

   What|Removed |Added

 CC||jamborm at gcc dot gnu.org

--- Comment #7 from Tamar Christina  ---
Hi Martin,

This is the IPA pass I was talking about at the Cauldron.

I see the patch doesn't have any standalone testcases.  Do you have some, Erick?

[Bug target/114860] [14/15 regression] [aarch64] 511.povray regresses by ~5.5% with -O3 -flto -march=native -mcpu=neoverse-v2 since r14-10014-ga2f4be3dae04fa

2024-05-01 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114860

--- Comment #3 from Tamar Christina  ---
I cannot reproduce this even recompiling libc.

[Bug target/114860] [14/15 regression] [aarch64] 511.povray regresses by ~5.5% with -O3 -flto -march=native -mcpu=neoverse-v2 since r14-10014-ga2f4be3dae04fa

2024-04-26 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114860

Tamar Christina  changed:

   What|Removed |Added

 CC||tnfchris at gcc dot gnu.org

--- Comment #2 from Tamar Christina  ---
There's no change in the binaries aside from an alignment change in one of the
lesser hot functions.

I guess you recompiled libc? In that case you need to compare against GCC 13.

As mentioned in the quoted commit this reverts the general case for GCC 14.

[Bug target/114860] [14/15 regression] [aarch64] 511.povray regresses by ~5.5% with -O3 -flto -march=native -mcpu=neoverse-v2 since r14-10014-ga2f4be3dae04fa

2024-04-26 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114860

--- Comment #1 from Tamar Christina  ---
Hmm

I am unable to reproduce this with -O3 -flto -mcpu=neoverse-v2 on a
neoverse-v2 machine.

Is any other option required?

Also, that code was new in GCC 14 and was partially reverted due to a register
allocation issue.

So if there is a performance difference it's a missed optimization and not a
regression.

[Bug rtl-optimization/114766] ^ constraint modifier unexpectedly affects register class selection.

2024-04-20 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114766

--- Comment #2 from Tamar Christina  ---
(In reply to Vladimir Makarov from comment #1)
> (In reply to Tamar Christina from comment #0)
> > The documentation for ^ states:
>
> If it works for you, we could try to use the patch (although it needs some
> investigation how other targets uses the hint).  In any case, the
> documentation should be modified or made more clear depending on applying or
> not applying the patch.

Yeah, using the patch gives us the behavior we expected, we added a workaround
for now so we can investigate what other targets do in GCC 15.

But while looking at this we also got some unexpected behavior with using ?

For instance we have the pattern:

;; Equal width integer to fp conversion.
(define_insn "<optab><fcvt_target><GPF:mode>2"
  [(set (match_operand:GPF 0 "register_operand")
        (FLOATUORS:GPF (match_operand:<FCVT_TARGET> 1 "register_operand")))]
  "TARGET_FLOAT"
  {@ [ cons: =0 , 1  ; attrs: type , arch  ]
     [ w        , w  ; neon_int_to_fp_<Vetype> , simd ] <su_optab>cvtf\t%<GPF:s>0, %<s>1
     [ w        , ?r ; f_cvti2f                , fp   ] <su_optab>cvtf\t%<GPF:s>0, %<w1>1
  })

for modeling floating point conversions. We had expected ? to make the
alternative more expensive, but still possible.  However again during IRA the
entire register class is blocked:

r103: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS

I would have expected that the additional penalty should never make an
alternative impossible.
We thought maybe the move costs were an issue, but we tried various big
and small numbers and it looks like the move costs have little to no effect.
Since the original value in this case was in r:

  Choosing alt 0 in insn 7:  (0) =rk  (1) %rk  (2) I {*adddi3_aarch64}

I would have expected the cost for FP_REGS not to be 0, as there's a register
file move involved:

  r103 costs: W8_W11_REGS:2000 W12_W15_REGS:2000 TAILCALL_ADDR_REGS:2000
STUB_REGS:2000 GENERAL_REGS:2000 FP_LO8_REGS:0 FP_LO_REGS:0 FP_REGS:0
POINTER_AND_FP_REGS:7000 MEM:9000

In this particular pattern the ? isn't needed so we're removing it, but the
behavior is still unexpected.

[Bug tree-optimization/114769] [14 Regression] Suspicious code in vect_recog_sad_pattern() since r14-1832

2024-04-19 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114769

Tamar Christina  changed:

   What|Removed |Added

 Resolution|--- |FIXED
 Status|ASSIGNED|RESOLVED

--- Comment #5 from Tamar Christina  ---
(In reply to Feng Xue from comment #3)
> When half_type is null, and call chain would be:
> 
> vect_supportable_direct_optab_p (vinfo, sum_type, SAD_EXPR, NULL, ...)
>   -> get_vectype_for_scalar_type (vinfo, NULL)
>  -> get_related_vectype_for_scalar_type (vinfo, /*scalar_type= */NULL,
> ...)
>  
>if ((!INTEGRAL_TYPE_P (scalar_type)
>&& !POINTER_TYPE_P (scalar_type)
>&& !SCALAR_FLOAT_TYPE_P (scalar_type))
> 
> Here will be a segfault.

Sure, I wasn't able to trigger that even when disabling the optab.
In any case this version doesn't get there anymore.

So fixed, thanks for the report!

[Bug tree-optimization/114769] [14 Regression] Suspicious code in vect_recog_sad_pattern() since r14-1832

2024-04-19 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114769

--- Comment #2 from Tamar Christina  ---
I believe this is safe, but the interface is definitely not the cleanest.

vect_recog_absolute_difference has two callers:

1. vect_recog_sad_pattern where if you return true with unprom not set, then
*half_type will be NULL.  The call to vect_supportable_direct_optab_p will
always reject it since there's no vector mode for NULL.  Note that the
convention in the dump files has always been that we first indicate that a
pattern could possibly be recognized and then check that it's supported.

This change somewhat incorrectly makes the diagnostic message get printed for
"invalid" patterns.

2. vect_recog_abd_pattern, where if half_type is NULL, it then uses diff_stmt
to set them.

So while the note in the dump file is misleading, the code is safe.  I will
however slightly refactor it to prevent the confusion.

[Bug target/113625] Interesting behavior with and without -mcpu=generic

2024-04-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113625

Tamar Christina  changed:

   What|Removed |Added

 CC||tnfchris at gcc dot gnu.org

--- Comment #4 from Tamar Christina  ---
Generic is not the default for aarch64 and the default depends on the
architecture revision.

The documentation patch got stuck on bike shedding on the list.

I'll resubmit it tomorrow.

Generic was kept around for backwards compatibility reasons.

That said, both codegen below are suboptimal...

[Bug rtl-optimization/114766] New: ^ constraint modifier unexpectedly affects register class selection.

2024-04-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114766

Bug ID: 114766
   Summary: ^ constraint modifier unexpectedly affects register
class selection.
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: rtl-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
CC: vmakarov at gcc dot gnu.org
  Target Milestone: ---

The documentation for ^ states:

"This constraint is analogous to ‘?’ but it disparages slightly the alternative
only if the operand with the ‘^’ needs a reload."

From this we gathered that there's only a slight cost when the given operand
requires a reload.

In PR114741 we had a regression when using this modifier because it seems to
unexpectedly also change register class selection during coloring.

The pattern was:

(define_insn "<optab><mode>3"
  [(set (match_operand:GPI 0 "register_operand")
        (LOGICAL:GPI (match_operand:GPI 1 "register_operand")
                     (match_operand:GPI 2 "aarch64_logical_operand")))]
  ""
  {@ [ cons: =0 , 1  , 2        ; attrs: type , arch  ]
     [ r        , %r , r        ; logic_reg   , *    ] <logical>\t%<w>0, %<w>1, %<w>2
     [ rk       , ^r , <lconst> ; logic_imm   , *    ] <logical>\t%<w>0, %<w>1, %2
     [ w        , 0  , <lconst> ; *           , sve  ] <logical>\t%Z0.<s>, %Z0.<s>, #%2
     [ w        , w  , w        ; neon_logic  , simd ] <logical>\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
  }
)

where we wanted to prefer the r->r alternative unless a reload is needed in
which case we preferred the w->w.

But in the simple example of:

void foo(unsigned v, unsigned *p)
{
*p = v & 1;
}

where the operation can be done as both r->r and w->w (but w->w needs a mov),
it would not pick r->r.

This is because during sched1 the penalty applied to the ^ alternative made it
no longer consider GP regs:

r106: preferred FP_REGS, alternative NO_REGS, allocno FP_REGS
;;3--> b  0: i   9 r106=r105&0x1
:cortex_a53_slot_any:GENERAL_REGS+0(-1)FP_REGS+1(1)PR_LO_REGS+0(0)
 PR_HI_REGS+0(0):model 4

The penalty here seems incorrect, and removing it seems to get the constraint
to work properly.
So the question is: is this a bug, are we using it incorrectly, or is it a
documentation bug?

[Bug target/114741] [14 regression] aarch64 sve: unnecessary fmov for scalar int bit operations

2024-04-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114741

Tamar Christina  changed:

   What|Removed |Added

 Resolution|--- |FIXED
 Status|ASSIGNED|RESOLVED

--- Comment #10 from Tamar Christina  ---
Fixed, thanks for the report.

[Bug target/114513] [11/12/13/14 Regression] [aarch64] floating-point registers are used when GPRs are preferred

2024-04-18 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114513
Bug 114513 depends on bug 114741, which changed state.

Bug 114741 Summary: [14 regression] aarch64 sve: unnecessary fmov for scalar 
int bit operations
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114741

   What|Removed |Added

 Status|ASSIGNED|RESOLVED
 Resolution|--- |FIXED

[Bug target/114741] [14 regression] aarch64 sve: unnecessary fmov for scalar int bit operations

2024-04-17 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114741

Tamar Christina  changed:

   What|Removed |Added

   Assignee|unassigned at gcc dot gnu.org  |tnfchris at gcc dot 
gnu.org
 Status|NEW |ASSIGNED

--- Comment #8 from Tamar Christina  ---
Indeed, Regtesting a patch that fixes it, so mine...

It looks like ^ doesn't work well when there's a choice of multiple register
files.

[Bug target/114741] [14 regression] aarch64 sve: unnecessary fmov for scalar int bit operations

2024-04-16 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114741

--- Comment #6 from Tamar Christina  ---
and the exact armv9-a cost model you quoted, also does the right codegen.
https://godbolt.org/z/obafoT6cj

There is just an inexplicable penalty being applied to the r->r alternative.

[Bug target/114741] [14 regression] aarch64 sve: unnecessary fmov for scalar int bit operations

2024-04-16 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114741

Tamar Christina  changed:

   What|Removed |Added

 CC||tnfchris at gcc dot gnu.org,
   ||vmakarov at gcc dot gnu.org

--- Comment #5 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #4)
> (In reply to Wilco from comment #2)
> > It looks like the underlying bug is '^' being incorrectly treated like '?'
> > in record_reg_classes (which is never used during reload). Fixing that
> > results in the expected code being generated in all cases. It looks this
> > issue was introduced in the original commit
> > d1457701461d5a49ca6b5d8a6d1c83a37a6dc771
> 
> static const struct cpu_regmove_cost generic_armv9_a_regmove_cost =
> {
>   1, /* GP2GP  */
>   /* Spilling to int<->fp instead of memory is recommended so set
>  realistic costs compared to memmov_cost.  */
>   3, /* GP2FP  */
>   2, /* FP2GP  */
>   2 /* FP2FP  */
> };
> 
> 
> Note these costs are broken.
> TARGET_REGISTER_MOVE_COST  has this to say about the special value 2:
> ```
> If reload sees an insn consisting of a single set between two hard
> registers, and if TARGET_REGISTER_MOVE_COST applied to their classes returns
> a value of 2, reload does not check to ensure that the constraints of the
> insn are met. Setting a cost of other than 2 will allow reload to verify
> that the constraints are met. You should do this if the ‘movm’ pattern’s
> constraints do not allow such copying.
> ```
> 
> The way I implemented this for thunderx was have GP2GP being cost of 2 and
> then be relative from there. That gave better code generation in general and
> even less spilling. I know I wrote about this before too.

I don't think this is related to this at all:
the old generic costs, which armv8 was taken from, don't use 2 and are broken
https://github.com/gcc-mirror/gcc/blob/master/gcc/config/aarch64/tuning_models/generic.h#L42
generic-armv8-a doesn't use 2 and is broken
https://github.com/gcc-mirror/gcc/blob/master/gcc/config/aarch64/tuning_models/generic_armv8_a.h#L43
neoverse-n2 uses 2 and isn't broken
https://github.com/gcc-mirror/gcc/blob/master/gcc/config/aarch64/tuning_models/neoversen2.h

I think the issue is what Wilco mentioned before. 
The GCC docs claim that ^ shouldn't affect initial costing/IRA, but it clearly
does.

It's penalizing the r->r alternative during initial costing and not just during
LRA when a reload is needed.

So the question to Vlad is: is it correct that ^ is treated the same way as ?
during initial costing, i.e. that it penalizes the alternative?

[Bug tree-optimization/113552] [11/12/13 Regression] vectorizer generates calls to vector math routines with 1 simd lane.

2024-04-15 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113552

Tamar Christina  changed:

   What|Removed |Added

 Status|ASSIGNED|RESOLVED
 Resolution|--- |FIXED

--- Comment #15 from Tamar Christina  ---
Fixed on trunk and all open branches

[Bug tree-optimization/114403] [14 regression] LLVM miscompiled with -O3 -march=znver2 -fno-vect-cost-model since r14-6822-g01f4251b8775c8

2024-04-15 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114403

Tamar Christina  changed:

   What|Removed |Added

 Status|ASSIGNED|RESOLVED
 Resolution|--- |FIXED

--- Comment #29 from Tamar Christina  ---
Fixed, thanks for the report!

[Bug tree-optimization/114403] [14 regression] LLVM miscompiled with -O3 -march=znver2 -fno-vect-cost-model since r14-6822-g01f4251b8775c8

2024-04-12 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114403

--- Comment #26 from Tamar Christina  ---
(In reply to Richard Biener from comment #25)
> That means, when the loop takes the early exit we _must_ take that during
> the vector iterations.  Peeling for gaps means if we would take the early
> exit during one of the gap peeled iterations this is a conflicting
> requirement.
> Now - the current analysis guarantees that the early exit conditions can
> be safely evaluated even for the gap iterations, but not the following
> code when the early exit is _not_ taken.
> 
> So peeling for gaps and early exit vect are not compatible?

I don't see why not; as my email explains, for the early exits we always go
to the scalar loop, which already adheres to the condition of peeling for
gaps.

I just think that peeling for gaps should not force it to exit from the main
exit.

[Bug tree-optimization/114403] [14 regression] LLVM miscompiled with -O3 -march=znver2 -fno-vect-cost-model since r14-6822-g01f4251b8775c8

2024-04-12 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114403

--- Comment #24 from Tamar Christina  ---
(In reply to Richard Biener from comment #23)
> Maybe easier to understand testcase:
> 
> with -O3 -msse4.1 -fno-vect-cost-model we return 20 instead of 8.  Adding
> -fdisable-tree-cunroll avoids the issue.  The upper bound we set on the
> vector loop causes us to force taking the IV exit which continues
> with i == (niter - 1) / VF * VF, but 'niter' is 20 here.

Yes, indeed, that's what my patch was arguing last time, but I didn't explain it
well enough.

I'm about to send out v2 (waiting for regtest to finish) which hopefully
articulates this better.

[Bug tree-optimization/114403] [14 regression] LLVM miscompiled with -O3 -march=znver2 -fno-vect-cost-model since r14-6822-g01f4251b8775c8

2024-04-11 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114403

--- Comment #22 from Tamar Christina  ---
Note that due to the secondary exit the actual full-vector iteration count is 8
scalar elements at VF=4, i.e. 2 vector iterations.

And it's this boundary condition where we fail, since ceil(8/4) == 2; any
other value would have done the partial vector iteration.

Basically final_iter_may_be_partial ends up being ignored.

[Bug tree-optimization/114403] [14 regression] LLVM miscompiled with -O3 -march=znver2 -fno-vect-cost-model since r14-6822-g01f4251b8775c8

2024-04-11 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114403

--- Comment #21 from Tamar Christina  ---
Created attachment 57932
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57932&action=edit
loop.c

attached reduced testcase that reproduces the issue and also checks the buffer
position and copied values.

As discussed on IRC when peeling for gaps we need to either adjust the upper
bounds of the vector loop or force the vector loop to get to the scalar loop. 
However we already go to the scalar loop, just with the wrong induction value
because we were never supposed to take the main exit.

Whether we go to the scalar loop depends on
x = (((ptr2 - ptr1) - 16) / 16) + 1
x == (((x - 1) >> 2) << 2)

in this case x == 26, so we do go to the scalar code already, but through the
main exit.
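
A quick worked check of that guard (my arithmetic for the x == 26 case):

  x = 26
  (((x - 1) >> 2) << 2) = ((25 >> 2) << 2) = (6 << 2) = 24

and 26 != 24, so the guard does send us to the scalar code.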

exiting through the main exit assumes you've done all vector iterations, in
this case 6 iterations based on the main exit condition which is first != last.

In this case the inductions values will be set on niters_vector_mult.

so in this case first += 24

But that's wrong since the secondary exit has a known iteration count of 9, due
to (buffer_ptr + store_size) <= buffer_end.

Statement (exit)if (ivtmp_21 != 0)
 is executed at most 8 (bounded by 8) + 1 times in loop 1.

So we will always exit through it as 9 < 24.

That means that when we calculate the upper bound of the vector loop, we must
add a bias so that in this boundary condition we do an extra partial
vector iteration.

I think the discussion on IRC went off track for a bit; hopefully this
testcase and the explanation above show that, for all early-break and
epilogue-peeling reasons, we must bias the upper bound up to give the
secondary exits a chance to trigger.

So I really do think the correct patch is:

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 4375ebdcb49..0973b952c70 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -12144,6 +12144,9 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple
*loop_vectorized_call)
  -min_epilogue_iters to remove iterations that cannot be performed
by the vector code.  */
   int bias_for_lowest = 1 - min_epilogue_iters;
+  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
+bias_for_lowest = 1;
+
   int bias_for_assumed = bias_for_lowest;
   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))

for the reasons described above.  There's no way for us to take the main exit
(which signifies we've reached the end of all iterations we can possibly do as
vector) and still get the correct induction values in this case.

[Bug tree-optimization/114635] OpenMP reductions fail dependency analysis

2024-04-08 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114635

--- Comment #6 from Tamar Christina  ---
(In reply to Jakub Jelinek from comment #4)
> Now, with SVE/RISCV vectors the actual vectorization factor is a poly_int
> rather than constant.  One possibility would be to use VLA arrays in those
> cases, but then it will be hard to undo that later, or allow both shrinking
> and growing the arrays and even turning them into VLA-like ones.

I think they are already VLAs of some kind going into vect, unless I've
misunderstood the declaration:

  float D.28295[0:POLY_INT_CST [15, 16]];
  float D.28293[0:POLY_INT_CST [15, 16]];
  float D.28290[0:POLY_INT_CST [15, 16]];

it looks like during vectorization they are lowered though.

[Bug tree-optimization/114635] New: OpenMP reductions fail dependency analysis

2024-04-08 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114635

Bug ID: 114635
   Summary: OpenMP reductions fail dependency analysis
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

The following testcase reduced from an HPC workload:

#include <math.h>

#define RESTRICT restrict

void work(int n, float *RESTRICT x, float *RESTRICT y,
  float *RESTRICT z, float *RESTRICT mass,
  float x0, float y0, float z0,
  float *RESTRICT ax, float *RESTRICT ay,
  float *RESTRICT az) {
  float lax = 0.0f, lay = 0.0f, laz = 0.0f;

#if _OPENMP >= 201307
#pragma omp simd reduction(+:lax,lay,laz)
#endif
  for (int i = 0; i < n; ++i) {
float dx = x[i] - x0;
float dy = y[i] - y0;
float dz = z[i] - z0;
float r2 = dx + dy + dz;

if (r2 == 0.0f)
  continue;

float f = (1.0f / (r2 * sqrtf(r2))) * mass[i];

lax += f * dx;
lay += f * dy;
laz += f * dz; 
  }

  *ax += lax;
  *ay += lay;
  *az += laz;
}

when compiled with -Ofast -march=armv9-a -fopenmp-simd this vectorizes as
expected, but when the pragma is in effect, e.g. with -Ofast -march=armv9-a
-fopenmp, the main loop fails to vectorize with:

(compute_affine_dependence
  ref_a: D.5962[_33], stmt_a: _69 = D.5962[_33];
  ref_b: D.5962[_33], stmt_b: D.5962[_33] = _ifc__147;
) -> dependence analysis failed
/app/example.c:16:17: missed:  bad data dependence.
/app/example.c:16:17: note:  * Analysis  failed with vector mode VNx4SF

This doesn't seem to happen with just 2 reductions, but with 3, dependency
analysis seems to fail.

I don't know much about OpenMP but my understanding is that this pragma is
intended for architectures that don't have masking support and works by
splitting the loop and removing the reductions from the main loop, creating
OpenMP "workers" which each work on one thread.

The reduction values are turned into local arrays and these threads then write
into their own slots in these arrays.

The reduction itself is then done as a final post step.

It looks like the only thing we can vectorize is the post step.

I wonder, since the compiler is the one introducing these local arrays, can we
not mark them as safe from inter-iteration dependencies?
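
For reference, a rough sketch of what that lowering conceptually produces for
one of the reductions (all names and the width VL are hypothetical, for
illustration only):

#define VL 8   /* assumed simd width */

float sketch (int n, const float *f, const float *dx)
{
  float lax_priv[VL];                   /* the compiler-introduced array */
  for (int j = 0; j < VL; ++j)
    lax_priv[j] = 0.0f;
  for (int i = 0; i < n; ++i)
    lax_priv[i % VL] += f[i] * dx[i];   /* each lane writes only its slot */
  float lax = 0.0f;
  for (int j = 0; j < VL; ++j)          /* the final post step */
    lax += lax_priv[j];
  return lax;
}

Since slot j is only ever written by lane j, the accesses to lax_priv in the
main loop can never alias across lanes, which is exactly what dependency
analysis fails to see here.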

[Bug target/114577] New: Inefficient codegen for SVE/NEON bridge

2024-04-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114577

Bug ID: 114577
   Summary: Inefficient codegen for SVE/NEON bridge
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---
Target: aarch64*

The following sequence:

#include <arm_neon_sve_bridge.h>

svint32_t f (int *a, int *b)
{
  int32x4_t va = vld1q_s32 (a);
  svint32_t za = svset_neonq_s32 (svundef_s32 (), va);
  return za;
}

-O2 -march=armv9-a

is expected to be a simple load but generates:

f:
ldr q31, [x0]
ptrue   p3.s, vl4
sel z0.s, p3, z31.s, z0.s
ret

instead of the expected (from clang):

f:  // @f
ldr q0, [x0]
ret

it looks like GCC's implementation of svset_neonq_s32 with svundef does not
become a view_convert/subreg.

[Bug target/114510] [14 Regression] missed proping of multiply by 2 into address of load/stores

2024-04-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114510

Tamar Christina  changed:

   What|Removed |Added

 CC||tnfchris at gcc dot gnu.org

--- Comment #3 from Tamar Christina  ---
Richard's patch should fix this next year.

https://gcc.gnu.org/pipermail/gcc-patches/2023-October/634166.html

We'll then stop relying on combine or other passes to fix this.

[Bug rtl-optimization/114515] [14 Regression] Failure to use aarch64 lane forms after PR101523

2024-04-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114515

Tamar Christina  changed:

   What|Removed |Added

 CC||tnfchris at gcc dot gnu.org
   See Also||https://gcc.gnu.org/bugzill
   ||a/show_bug.cgi?id=114575

--- Comment #10 from Tamar Christina  ---
This has also broken our addressing modes
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114575

[Bug rtl-optimization/114575] New: [14 Regression] SVE addressing modes broken since g:839bc42772ba7af66af3bd16efed4a69511312ae

2024-04-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114575

Bug ID: 114575
   Summary: [14 Regression] SVE addressing modes broken since
g:839bc42772ba7af66af3bd16efed4a69511312ae
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: rtl-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---
Target: aarch64*

Created attachment 57864
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57864&action=edit
addr.cc

Since the offending commit, compiling the example attached with -O3
-march=armv8.5-a+sve2 results in:

.L14:
        lsl     x0, x1, 1
        add     x1, x1, 8
        add     x2, x3, x0
        add     x6, x0, x4
        ld1h    z2.h, p7/z, [x2]
        ld1h    z22.h, p7/z, [x6]
        add     x0, x0, x5
        fmad    z22.h, p7/m, z21.h, z2.h
        ld1h    z20.h, p7/z, [x0]
        fmad    z20.h, p7/m, z26.h, z22.h
        st1h    z20.h, p7, [x2]
        cmp     x1, 808
        bne     .L14

instead of what it was before the commit:

.L14:
        ld1h    z2.h, p7/z, [x1, x0, lsl 1]
        ld1h    z22.h, p7/z, [x2, x0, lsl 1]
        ld1h    z20.h, p7/z, [x3, x0, lsl 1]
        fmad    z22.h, p7/m, z21.h, z2.h
        fmad    z20.h, p7/m, z26.h, z22.h
        st1h    z20.h, p7, [x1, x0, lsl 1]
        add     x0, x0, 8
        cmp     x0, 808
        bne     .L14

It's now no longer pushing in the register shifts. This causes significant
performance loss as it needs to now perform the integer ALU ops before doing
the load and they're on the critical path.

[Bug rtl-optimization/113682] Branches in branchless binary search rather than cmov/csel/csinc

2024-04-03 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113682

--- Comment #9 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #8)
> This might be the path splitting running on the gimple level causing issues
> too; see PR 112402 .

Ah that's a good shout.  It looks like Richi already agrees that we should
recognize/do some ifcvt at GIMPLE.

Guess that just leaves the where.

[Bug rtl-optimization/113682] Branches in branchless binary search rather than cmov/csel/csinc

2024-04-02 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113682

Tamar Christina  changed:

   What|Removed |Added

 Status|UNCONFIRMED |NEW
 Ever confirmed|0   |1
  Component|middle-end  |rtl-optimization
   Last reconfirmed||2024-04-02

--- Comment #7 from Tamar Christina  ---
Ok, so for now I think we should separate out the issue of whether/when we
should perform the ifconversion and instead for a bit look at why we miss these
conversions:

The first problem is that GCC does not support if-converting blocks which
terminate into another branch.  In other words, the destination of one of the
arms must have a single successor, per noce_find_if_block, which requires the
final block to be a JOIN block.

In

size_t bsearch2(ITEM* user, ITEM** tree, size_t treeSize) {
auto pos = tree;
size_t low = 0;
size_t high = treeSize;
while (low < high) {
size_t i = (low + high) / 2;
int res = cmp(user, tree[i]);

// These should be cmp + csinc + csel on arm
// and lea + test + cmov + cmov on x86.
low = res > 0 ? i + 1 : low; // csinc
high = res < 0 ? i: high; // csel

if (res == 0) return i;
}
return -1;
}

this fails because of the:

if (res == 0) return i;

which means that the final block doesn't adhere to this, so the cbranch blocks
the if-conversion.

if we simplify the testcase:

typedef unsigned long size_t;
extern int cmp(size_t *);

size_t bsearch2(size_t high, size_t low, size_t i) {
while (low < high) {
        int res = cmp(&i);

low = res > 0 ? i + 1 : low;
high = res < 0 ? i: high;
}
return -1;
}

We now detect the csel but not the csinc. This has two reasons: we visit the BBs
in entry->exit order, but we probably should do exit->entry, such that when we
if-convert blocks and delete the branches, the preceding blocks become valid
to if-convert.

But the main reason the csinc is missed is code sinking. The update
of the reference is sunk into the res > 0 branch.

   [local count: 955630224]:
  # high_18 = PHI 
  # low_19 = PHI 
  res_11 = cmp ();
  if (res_11 > 0)
goto ; [59.00%]
  else
goto ; [41.00%]

   [local count: 563821836]:
  i.1_1 = i;
  iftmp.0_12 = i.1_1 + 1;
  goto ; [100.00%]

This is because the compiler (rightfully) decides that the dereference should
only happen in the branch that needs it.  Because of this, block 4 is no longer
if-convertible.  However, since the load is from the stack we are technically
allowed to move it around.  But this is only beneficial if the branch is
really unpredictable.

To show that it is the sinking, we can avoid it by using -fdisable-tree-sink2
and this example:

size_t bsearch2(size_t high, size_t low, size_t *iv, int *resv) {
while (low < high) {
int res = cmp (iv);
size_t i = *iv;
low = res > 0 ? i + 1 : low;
}
return -1;
}

which generates the csinc.

However the following example again doesn't generate the csinc for GCC but does
for LLVM:

size_t bsearch3(size_t high, size_t low, size_t i, int res) {
while (low < high) {
asm volatile ("" : "=w"(i));
asm volatile ("" : "=w"(res));
low = res > 0 ? i + 1 : low;
}
return -1;
}

which is because of the copy-header (ch) pass.  GCC optimizes this scalar code
from:

size_t bsearch3 (size_t high, size_t low, size_t i, int res)
{
  size_t iftmp.0_7;

   [local count: 59055799]:
  goto ; [100.00%]

   [local count: 1014686024]:
  __asm__ __volatile__("" : "=w" i_5);
  __asm__ __volatile__("" : "=w" res_6);
  if (res_6 > 0)
goto ; [5.50%]
  else
goto ; [94.50%]

   [local count: 55807731]:
  iftmp.0_7 = i_5 + 1;

   [local count: 114863530]:
  # low_1 = PHI 

   [local count: 1073741824]:
  if (low_1 < high_3(D))
goto ; [94.50%]
  else
goto ; [5.50%]

   [local count: 59055800]:
  return 18446744073709551615;

}

into 

size_t bsearch3 (size_t high, size_t low, size_t i, int res)
{
  size_t iftmp.0_7;

   [local count: 59055799]:
  if (low_2(D) < high_3(D))
goto ; [48.59%]
  else
goto ; [51.41%]

   [local count: 958878294]:
  __asm__ __volatile__("" : "=w" i_5);
  __asm__ __volatile__("" : "=w" res_6);
  if (res_6 > 0)
goto ; [5.50%]
  else
goto ; [94.50%]

   [local count: 55807731]:
  # i_10 = PHI 
  iftmp.0_7 = i_10 + 1;
  if (high_3(D) > iftmp.0_7)
goto ; [48.59%]
  else
goto ; [51.41%]

   [local count: 55807730]:
  __asm__ __volatile__("" : "=w" i_8);
  __asm__ __volatile__("" : "=w" res_9);
  if (res_9 > 0)
goto ; [5.50%]
  else
goto ; [94.50%]

   [local count: 59055800]:
  return 18446744073709551615;

}

which then hits the limitation of the single pred I mentioned before.

using -fdisable-tree-ch2 we get:

        add     x2, x2, 1
        csel    x1, 

[Bug tree-optimization/114403] [14 regression] LLVM miscompiled with -O3 -march=znver2 -fno-vect-cost-model since r14-6822-g01f4251b8775c8

2024-04-02 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114403

--- Comment #20 from Tamar Christina  ---
This is a bad interaction between early break and peeling for gaps.

When peeling for gaps we set bias_for_lowest to 0, which then negates the ceil
in the upper-bound calculation when the division is exact.

We end up with the following on a loop that does:

Analyzing # of iterations of loop 1
  exit condition [8, + , 18446744073709551615] != 0
  bounds on difference of bases: -8 ... -8
  result:
# of iterations 8, bounded by 8

and a VF=4 calculating:

Loop 1 iterates at most 1 times.
Loop 1 likely iterates at most 1 times.
Analyzing # of iterations of loop 1
  exit condition [1, + , 1](no_overflow) < bnd.5505_39
  bounds on difference of bases: 0 ... 4611686018427387902
Matching expression match.pd:2011, generic-match-8.cc:27
Applying pattern match.pd:2067, generic-match-1.cc:4813
  result:
# of iterations bnd.5505_39 + 18446744073709551615, bounded by
4611686018427387902
Estimating sizes for loop 1
...
   Induction variable computation will be folded away.
  size:   2 if (ivtmp_312 < bnd.5505_39)
   Exit condition will be eliminated in last copy.
size: 24-3, last_iteration: 24-5
  Loop size: 24
  Estimated size after unrolling: 26
;; Guessed iterations of loop 1 is 0.858446. New upper bound 1.

The upper bound should be 2, not 1. I have a working patch and am trying to
create a standalone testcase for it.

[Bug tree-optimization/114403] [14 regression] LLVM miscompiled with -O3 -march=znver2 -fno-vect-cost-model since r14-6822-g01f4251b8775c8

2024-04-02 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114403

Tamar Christina  changed:

   What|Removed |Added

 Status|UNCONFIRMED |ASSIGNED
   Last reconfirmed||2024-04-02
   Assignee|unassigned at gcc dot gnu.org  |tnfchris at gcc dot 
gnu.org
 Ever confirmed|0   |1

--- Comment #19 from Tamar Christina  ---
Thanks! back from holidays and looking into it now.

mine.

[Bug tree-optimization/114345] FRE missing knowledge of semantics of IFN loads

2024-03-15 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114345

--- Comment #5 from Tamar Christina  ---
(In reply to Richard Biener from comment #4)
> Well, the shuffling in .LOAD_LANES will be a bit awkward to do, but sure.  We
> basically lack "constant folding" of .LOAD_LANES and similarly of course
> we can't see through .STORE_LANES of a constant when later folding a scalar
> load from the same memory.

I guess it becomes harder with the 3- and 4-lane ones, but the 2-lane one is
just a single VEC_PERM_EXPR, no?
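
As a sketch of what such folding could look like for the 2-lane case (my
construction using GCC vector extensions, not the actual VN code):

typedef long v2di __attribute__ ((vector_size (16)));

void
load_lanes_2 (const long *p, v2di *lane0, v2di *lane1)
{
  v2di lo = { p[0], p[1] };   /* first contiguous vector  */
  v2di hi = { p[2], p[3] };   /* second contiguous vector */
  *lane0 = __builtin_shufflevector (lo, hi, 0, 2);  /* even elements */
  *lane1 = __builtin_shufflevector (lo, hi, 1, 3);  /* odd elements  */
}

i.e. per extracted lane the de-interleave is a single even/odd permute of the
two underlying contiguous loads.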

[Bug target/114350] New: missing support for SVE widening floating point conversion

2024-03-15 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114350

Bug ID: 114350
   Summary: missing support for SVE widening floating point
conversion
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---
Target: aarch64*

The following example:

#include <arm_sve.h>
svfloat64_t widening (svint32_t a)
{
svbool_t pred = svptrue_b32 ();
svint64_t cvt = svreinterpret_s64_s32 (a);
svint64_t ext = svextw_s64_x (pred, cvt);
svfloat64_t res = svcvt_f64_s64_x (pred, ext);
return res;
}

compiled with -Ofast -march=armv9-a generates:

widening(__SVInt32_t):
        ptrue   p3.b, all
        sxtw    z0.d, p3/m, z0.d
        scvtf   z0.d, p3/m, z0.d
        ret

but SVE has widening and narrowing floating point conversions, as such this
should generate:

widening(__SVInt32_t):
        ptrue   p3.b, all
        scvtf   z0.d, p3/m, z0.s
        ret

The autovec equivalent is:


void f(int n, double *data) {
for (int i=0;i
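
(The snippet above is truncated in this archive; a loop of roughly this shape
exercises the same widening int -> double conversion. My reconstruction, not
necessarily the original:)

void f (int n, double *data)
{
  for (int i = 0; i < n; i++)
    data[i] = i;   /* 32-bit int converted to double, wants widening scvtf */
}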

[Bug tree-optimization/114345] FRE missing knowledge of semantics of IFN loads

2024-03-15 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114345

--- Comment #3 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #2)
> Oh VN does have some knowledge of MASK_STORE and LEN_STORE. Just not
> LOAD_LANES .
> 
> 
> See PR 106365 for MASK_STORE and LEN_STORE implementation. Shouldn't be hard
> to add LOAD_LANES/STORE_LANES there ...

Ah!, thanks for the pointer.

[Bug tree-optimization/114346] New: vectorizer generates the same IV twice

2024-03-14 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114346

Bug ID: 114346
   Summary: vectorizer generates the same IV twice
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

The following example:

---
double f(int n, double *data, double b) {
double res = b;

for (int i=0;i
  _48 = vect_vec_iv_.7_45 + { POLY_INT_CST [2, 2], ... };
  _71 = VIEW_CONVERT_EXPR(vect_vec_iv_.7_45);
  _72 = VIEW_CONVERT_EXPR({ POLY_INT_CST [4, 4],
... });
  _73 = _71 + _72;
  _49 = VIEW_CONVERT_EXPR(_73);

So it looks like _48 and _49 are the same value, except that _48 is computed as
a 32-bit IV and _49 is calculated as a 64-bit one and truncated to 32?
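
A scalar analogue of the redundancy (my construction, mirroring the claim
above rather than the exact vector IL):

#include <stdint.h>

uint32_t iv_narrow (uint32_t iv) { return iv + 2; }   /* like _48 */

uint32_t iv_wide (uint32_t iv)                        /* like _49 */
{
  uint64_t w = (uint64_t) iv + 2;   /* the 64-bit detour */
  return (uint32_t) w;              /* truncated back to 32 bits */
}

Both compute the same value, so only one of the two IVs should survive.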

[Bug tree-optimization/114345] New: FRE missing knowledge of semantics of IFN loads

2024-03-14 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114345

Bug ID: 114345
   Summary: FRE missing knowledge of semantics of IFN loads
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

The following testcase:

---
long tdiff = 10412095;

int main() {
  struct {
long maximum;
int nonprimary_delay;
  } delays[] = {{}, {}, {}, {9223372036854775807, 36 * 60 * 60}};

  for (unsigned i = 0; i < sizeof(delays) / sizeof(delays[0]); ++i)
if (tdiff <= delays[i].maximum)
  return delays[i].nonprimary_delay;

  __builtin_abort();
}
---

compiled with -O2 -fno-vect-cost-model

generates on AArch64:

  vect_cst__45 = {tdiff.0_2, tdiff.0_2};
  vect_array.11 = .LOAD_LANES (MEM <long int[4]> [(long int *)&delays]);
  vect__1.12_40 = vect_array.11[0];
  vect_array.11 ={v} {CLOBBER};
  vect_array.14 = .LOAD_LANES (MEM <long int[4]> [(long int *)&delays + 32B]);
  vect__1.15_43 = vect_array.14[0];
  vect_array.14 ={v} {CLOBBER};
  mask_patt_15.17_46 = vect__1.12_40 >= vect_cst__45;
  mask_patt_15.17_47 = vect__1.15_43 >= vect_cst__45;
  vexit_reduc_51 = mask_patt_15.17_46 | mask_patt_15.17_47;

and on x86_64:

  vect_cst__53 = {tdiff.0_2, tdiff.0_2};
  _37 = { 0, 4294967295, 4294967294, 4294967293 };
  _32 = { 4, 5, 6, 7 };
  vect__1.11_42 = MEM <vector(2) long int> [(long int *)&delays];
  vectp_delays.9_43 = &delays + 16;
  vect__1.12_44 = MEM <vector(2) long int> [(long int *)vectp_delays.9_43];
  vect_perm_even_45 = VEC_PERM_EXPR <vect__1.11_42, vect__1.12_44, { 0, 2 }>;
  vectp_delays.9_47 = &delays + 32;
  vect__1.13_48 = MEM <vector(2) long int> [(long int *)vectp_delays.9_47];
  vectp_delays.9_49 = &delays + 48;
  vect__1.14_50 = MEM <vector(2) long int> [(long int *)vectp_delays.9_49];
  vect_perm_even_51 = VEC_PERM_EXPR <vect__1.13_48, vect__1.14_50, { 0, 2 }>;
  mask_patt_17.15_54 = vect_perm_even_45 >= vect_cst__53;
  mask_patt_17.15_55 = vect_perm_even_51 >= vect_cst__53;
  vexit_reduc_59 = mask_patt_17.15_54 | mask_patt_17.15_55;

which is eventually simplified by FRE into:

  vect_cst__53 = {tdiff.0_2, tdiff.0_2};
  mask_patt_17.15_54 = vect_cst__53 <= { 0, 0 };
  mask_patt_17.15_55 = vect_cst__53 <= { 0, 9223372036854775807 };
  vexit_reduc_59 = mask_patt_17.15_54 | mask_patt_17.15_55;

and realizing that the loads aren't needed.

It looks like the reason is that FRE doesn't understand LOAD_LANES and
MASKED_LOAD_LANES or the other load IFNs.

We thus end up with a spill to the stack and a load of the constants.

[Bug tree-optimization/114339] [14 regression] Tor miscompiled with -O2 -mavx -fno-vect-cost-model since r14-6822

2024-03-14 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114339

--- Comment #6 from Tamar Christina  ---
vectorizer generates:

  mask_patt_21.19_58 = vect_perm_even_49 >= vect_cst__57;
  mask_patt_21.19_59 = vect_perm_even_55 >= vect_cst__57;
  vexit_reduc_63 = mask_patt_21.19_58 | mask_patt_21.19_59;
  if (vexit_reduc_63 != { 0, 0 })
goto ; [20.00%]
  else
goto ; [80.00%]

This is changed at loopdone into:

  delays[3].nonprimary_delay = 129600;
  vect_cst__57 = {tdiff_6, tdiff_6};
  mask_patt_21.19_58 = vect_cst__57 <= { 0, 0 };
  mask_patt_21.19_59 = vect_cst__57 <= { 0, 0x7FFFFFFFFFFFFFFF };
  vexit_reduc_63 = mask_patt_21.19_58 | mask_patt_21.19_59;
  if (vexit_reduc_63 != { 0, 0 })
goto ; [20.00%]
  else
goto ; [80.00%]

or in other words, if there's any value where the compare succeeds, find it and
return.
This looks correct to me.

It could be that my AVX is rusty, but this generates:

   vmovdqa 0xf9c(%rip),%xmm1   # 0x402010
   mov     $0x1,%eax
   vmovq   %rcx,%xmm3
   vmovdqa %xmm0,(%rsp)
   vpunpcklqdq %xmm3,%xmm3,%xmm2
   vmovdqa %xmm0,0x10(%rsp)
   vpcmpgtq %xmm2,%xmm1,%xmm1
   vmovdqa %xmm0,0x20(%rsp)
   vmovq   %rax,%xmm0
   vpunpcklqdq %xmm0,%xmm0,%xmm0
   movl    $0x1fa40,0x38(%rsp)
   vpcmpgtq %xmm2,%xmm0,%xmm0
   vpor    %xmm1,%xmm0,%xmm0
   vptest  %xmm0,%xmm0

which looks off; particularly for the second compare it looks like it doesn't do
a load but instead just duplicates the constant 1.
gdb seems to confirm this. At the first compare:

(gdb) p $xmm2.v2_int64
$4 = {10412095, 10412095}
(gdb) p $xmm0.v2_int64
$5 = {0, 0}

which is what's expected, but at the second compare:

(gdb) p $xmm2.v2_int64
$7 = {10412095, 10412095}
(gdb) p $xmm0.v2_int64
$6 = {1, 1}

At the second it's comparing {1, 1} instead of {0, 0x7FFFFFFFFFFFFFFF}.
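
A scalar cross-check of what the two compares must compute (worked
arithmetic; tdiff and the lane values are taken from the dumps above):

#include <assert.h>

int main (void)
{
  long tdiff = 10412095;
  long m0[2] = { 0, 0 };                      /* first compare's lanes  */
  long m1[2] = { 0, 9223372036854775807L };   /* second compare's lanes */
  int any = 0;
  for (int j = 0; j < 2; j++)
    any |= (tdiff <= m0[j]) | (tdiff <= m1[j]);
  assert (any);   /* lane 1 of the second compare must succeed */
  return 0;
}

With {1, 1} in xmm0 instead, no lane succeeds and the early exit is missed.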

on AArch64 where it doesn't fail the comparison is:

        movi    v29.4s, 0
        add     x1, sp, 16
        ldr     x5, [x0, 8]
        mov     w0, 64064
        movk    w0, 0x1, lsl 16
        add     x3, sp, 48
        str     q29, [sp, 64]
        mov     x2, 57407
        mov     x4, 9223372036854775807
        str     x4, [sp, 64]
        movk    x2, 0x9e, lsl 16
        str     w0, [sp, 72]
        sub     x2, x2, x5
        stp     q29, q29, [x1]
        dup     v27.2d, x2
        ld2     {v30.2d - v31.2d}, [x1]
        str     q29, [sp, 48]
        ld2     {v28.2d - v29.2d}, [x3]
        cmge    v30.2d, v30.2d, v27.2d
        cmge    v28.2d, v28.2d, v27.2d
        orr     v30.16b, v30.16b, v28.16b
        umaxp   v30.4s, v30.4s, v30.4s
        fmov    x0, d30
        cbnz    x0, .L12

which has v30.2d being {0, 0} and v28.2d being {0, 0x7FFFFFFFFFFFFFFF} as
expected...

On AArch64 we don't inline the constants because whatever is propagating the
constants can't understand the LOAD_LANES:

  mask_patt_19.21_50 = vect__2.16_44 >= vect_cst__49;
  mask_patt_19.21_51 = vect__2.19_47 >= vect_cst__49;
  vexit_reduc_55 = mask_patt_19.21_50 | mask_patt_19.21_51;
  if (vexit_reduc_55 != { 0, 0 })
goto ; [20.00%]
  else
goto ; [80.00%]

so could this be another expansion bug?

Note that a simpler reproducer is this:

---
long tdiff = 10412095;

int main() {
  struct {
long maximum;
int nonprimary_delay;
  } delays[] = {{}, {}, {}, {9223372036854775807, 36 * 60 * 60}};

  for (unsigned i = 0; i < sizeof(delays) / sizeof(delays[0]); ++i)
if (tdiff <= delays[i].maximum)
  return delays[i].nonprimary_delay;

  __builtin_abort();
}
---

The key point is that we're not allowed to constprop tdiff at GIMPLE. If we do,
e.g.:

int main() {
  struct {
long maximum;
int nonprimary_delay;
  } delays[] = {{}, {}, {}, {9223372036854775807, 36 * 60 * 60}};
  long tdiff = 10412095;

  for (unsigned i = 0; i < sizeof(delays) / sizeof(delays[0]); ++i)
if (tdiff <= delays[i].maximum)
  return delays[i].nonprimary_delay;

  __builtin_abort();
}

then after vectorization constprop evaluates the entire expression at GIMPLE
and gets the right result.

This makes me believe it's a target expansion bug.

[Bug tree-optimization/114151] [14 Regression] weird and inefficient codegen and addressing modes since r14-9193

2024-03-08 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114151

--- Comment #17 from Tamar Christina  ---
> So doing in the vectorizer sth like the following should get us the best
> possible ranges?  Ah, probably only global ranges since the SCEV query
> itself would still lack context sensitive info (but as said we don't have
> a good context we can easily use).
> 

which reminds me.. I have been playing around with using ranger directly in the
vectorizer.

It looks like we get better ranges for the overwidening detection.

It feels like using ranger.range_of_expr or similar would solve the problem
that we don't currently get ranges for structural widening, i.e.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102188
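
A hedged sketch of the kind of query I mean (GCC-internals pseudocode; the
exact signatures in gimple-range.h may differ from what I recall):

static bool
op_has_usable_range_p (gimple_ranger &ranger, tree op, gimple *stmt)
{
  int_range_max r;
  if (!ranger.range_of_expr (r, op, stmt)
      || r.undefined_p ()
      || r.varying_p ())
    return false;
  /* r now holds a context-sensitive range for OP at STMT, which the
     over-widening detection could compare against the narrower type.  */
  return true;
}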

[Bug tree-optimization/114234] [14 Regression] verify_ssa failure with early-break vectorisation

2024-03-05 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114234

Tamar Christina  changed:

   What|Removed |Added

   Last reconfirmed||2024-03-05
 Status|UNCONFIRMED |NEW
 Ever confirmed|0   |1

--- Comment #3 from Tamar Christina  ---
started with

commit g:324d2907c86f05e40dc52d226940308f53a956c2
Author: Richard Biener 
Date:   Mon Mar 4 09:46:13 2024 +0100

tree-optimization/114192 - scalar reduction kept live with early break vect

The following fixes a missing replacement of the reduction value
used in the epilog, causing the scalar reduction to be kept live
across the early break exit path.

PR tree-optimization/114192
* tree-vect-loop.cc (vect_create_epilog_for_reduction): Use the
appropriate def for the live out stmt in case of an alternate
exit.

the reduction seems to propagate out a single value for both exits:

  # a_39 = PHI 

but since the PHI nodes carry their reduction values in the exits we have
different reduction statements, so using the same reduction value for all exits
is wrong.

It should be

  # a_39 = PHI 

[Bug tree-optimization/114192] scalar code left around following early break vectorization of reduction

2024-03-01 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114192

Tamar Christina  changed:

   What|Removed |Added

 Ever confirmed|0   |1
 Status|UNCONFIRMED |NEW
   Last reconfirmed||2024-03-01

--- Comment #1 from Tamar Christina  ---
Confirmed.

It looks like DCE6 no longer thinks:

  # sum_10 = PHI 

  _1 = aD.4432[i_12];
  sum_7 = _1 + sum_11;

is dead after vectorization.

it removes the only dead consumer of sum_7,
a PHI node left over in the guard block which becomes unused after the
reduction is vectorized.

DCE says:

marking necessary through sum_11 stmt sum_11 = PHI 
processing: sum_11 = PHI 

marking necessary through sum_7 stmt sum_7 = _1 + sum_11;
processing: sum_7 = _1 + sum_11;

marking necessary through _1 stmt _1 = a[i_12];
processing: _1 = a[i_12];

So it thinks the loop-closed definition is needed?

This seems to only happen with reductions, other live operations look fine:

extern int a[1024];
int f4(int *x, int n)
{
int sum = 0;
for (int i = 0; i < n; i++)
{
sum = a[i];
if (a[i] == 42)
break;
}
return sum;
}

[Bug target/98877] [AArch64] Inefficient code generated for tbl NEON intrinsics

2024-02-28 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98877

--- Comment #12 from Tamar Christina  ---
and it's not the first time we have conditional lowering. We already do so for
e.g. shifts, where shifting by an amount => bitsize of a vector element is
defined behavior or AArch64.

[Bug target/98877] [AArch64] Inefficient code generated for tbl NEON intrinsics

2024-02-28 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98877

--- Comment #11 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #10)
> (In reply to Tamar Christina from comment #9)
> > While RA should be able to deal with this,
> > shouldn't we also just lower TBLs in gimple?
> > 
> > This no reason why this can't be a VEC_PERM_EXPR which would also get the
> > copies
> > removed at the gimple level and allows us to optimize this to something else
> > depending on the index.
> 
> Yes there is a reason, `out of range` values for VEC_PERM is undefined while
> tbl is well defined  ( If an index is out of range for the table, the result
> for that lookup is 0.).
> 

I don't think that's a good reason. The out-of-range values can be made
implementation-defined, i.e. the mid-end shouldn't care about them.

Not lowering this in gimple means we lose a heck of a lot of optimizations that
are impossible to cover in RTL.
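
A hedged illustration of the idea: once the indices are provably in range
(here forced with an explicit mask, my construction), the tbl is exactly a
VEC_PERM_EXPR, which GCC's __builtin_shuffle already maps to:

#include <arm_neon.h>

uint8x16_t
tbl_lowered (uint8x16_t tab, uint8x16_t idx)
{
  uint8x16_t masked = vandq_u8 (idx, vdupq_n_u8 (15));  /* indices < 16 */
  return __builtin_shuffle (tab, masked);               /* VEC_PERM_EXPR */
}

The out-of-range case would rely on the implementation-defined semantics
argued for above.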

[Bug tree-optimization/114151] [14 Regression] weird and inefficient codegen and addressing modes since g:a0b1798042d033fd2cc2c806afbb77875dd2909b

2024-02-28 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114151

--- Comment #3 from Tamar Christina  ---
> 
> This was a correctness fix btw, so I'm not sure we can easily recover - we
> could try using niter information for CHREC_VARIABLE but then there's
> variable niter here so I don't see a chance.
> 

It looks like it's mostly SVE suffering here. Adv. SIMD and scalar codegen seem
to have vastly improved with it; we see ~10% improvements due to better
addressing for scalar.

It also only happens at -O3 while -O2 is fine, which is weird as I expected
IVopts to be enabled at -O2 too.

> 
> OTOH the +1 could make it overflow for large size.
> 
> Can you test the above?  It should be an incremental improvement.
> 

Applying the changes does not seem to make a difference for the final codegen
:(

[Bug tree-optimization/114151] New: [14 Regression] weird and inefficient codegen and addressing modes since g:a0b1798042d033fd2cc2c806afbb77875dd2909b

2024-02-28 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114151

Bug ID: 114151
   Summary: [14 Regression] weird and inefficient codegen and
addressing modes since
g:a0b1798042d033fd2cc2c806afbb77875dd2909b
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
CC: rguenth at gcc dot gnu.org
  Target Milestone: ---
Target: aarch64*

Created attachment 57559
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57559&action=edit
testcase

The attached C++ testcase compiled with: -O3 -mcpu=neoverse-n2

used to compile a nice and simple loop.  But after
g:a0b1798042d033fd2cc2c806afbb77875dd2909b

The codegen is weird and it uses horrible addressing modes.

The first odd part is that it's decided to split the loop: the "main" loop has
a guard after it to branch to the exit if the iteration count is 1.

If not, instead of just looping again, it falls through to a copy of the main
loop, but with destroyed addressing modes.

The copy of the loop seems to have unshared the address calculations. Before we
had:

  _128 = (void *) ivtmp.11_20;
  _54 = MEM <__SVFloat16_t> [(__fp16 *)_128];
  _10 = MEM <__SVFloat16_t> [(__fp16 *)_128 + POLY_INT_CST [16B, 16B]];
  _75 = MEM <__SVFloat16_t> [(__fp16 *)_128 + POLY_INT_CST [32B, 32B]];

etc, so all as an offset from _128.  Now we have:

  col_i_61 = (int) ivtmp.11_100;
  _60 = (long unsigned int) col_i_61;
  _59 = _60 * 2;
  _58 = a_j_69 + _59;
  _54 = MEM <__SVFloat16_t> [(__fp16 *)_58];
  _53 = _59 + POLY_INT_CST [16, 16];
  _13 = a_j_69 + _53;
  _10 = MEM <__SVFloat16_t> [(__fp16 *)_13];
  _74 = _59 + POLY_INT_CST [32, 32];
  _19 = a_j_69 + _74;
  _75 = MEM <__SVFloat16_t> [(__fp16 *)_19];

and similarly for the stores as well.

it also weirdly creates some very complicated addressing computations. Before
we had:

  _144 = p_mat_16(D) + 6; 
  _64 = MEM <__SVFloat16_t> [(__fp16 *)_144 + ivtmp.10_100 * 2];
  _143 = p_mat_16(D) + 4;
  _84 = MEM <__SVFloat16_t> [(__fp16 *)_143 + ivtmp.10_100 * 2];

and after:

  ivtmp.23_130 = (unsigned long) p_mat_16(D);
  _123 = 2 - ivtmp.23_130;
  _124 = &MEM <__SVFloat16_t> [(__fp16 *)0B + _123 + ivtmp.12_109 * 2];
  _64 = MEM <__SVFloat16_t> [(__fp16 *)_124];

  _122 = -ivtmp.23_130;
  _120 = &MEM <__SVFloat16_t> [(__fp16 *)0B + _122 + ivtmp.12_109 * 2];
  _84 = MEM <__SVFloat16_t> [(__fp16 *)_120];

This results in quite the codesize increase, and a 7-10% performance loss.

[Bug target/98877] [AArch64] Inefficient code generated for tbl NEON intrinsics

2024-02-28 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98877

--- Comment #9 from Tamar Christina  ---
While RA should be able to deal with this,
shouldn't we also just lower TBLs in gimple?

There's no reason why this can't be a VEC_PERM_EXPR, which would also get the
copies removed at the gimple level and allow us to optimize this to something
else depending on the index.

[Bug target/102171] vget_low_*/vget_high_* intrinsics should become BIT_FIELD_REF during gimple

2024-02-27 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102171

--- Comment #3 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #2)
> I think I am going to implement this (or assign it interally to someone else
> to implement).

If you do, please also remove them from arm_neon.h and use the new intrinsics
framework.

We're gradually trying to get this file empty.

[Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7

2024-02-27 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Tamar Christina  changed:

   What|Removed |Added

 CC||rsandifo at gcc dot gnu.org

--- Comment #29 from Tamar Christina  ---
(In reply to rguent...@suse.de from comment #28)
> On Mon, 26 Feb 2024, tnfchris at gcc dot gnu.org wrote:
> 
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
> > 
> > --- Comment #27 from Tamar Christina  ---
> > Created attachment 57538 [details]
> >   --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57538&action=edit
> > proposed1.patch
> > 
> > proposed patch, this gets the gathers and scatters back. doing regression 
> > run.
> 
> I don't think this will fly.

Well, I don't really know what to do here I guess.

Per the discussion on IRC, we used to try gather/scatters only when SCEV fails.

Now that it succeeds we no longer use the pattern and instead try to handle it
during vectorizable_load/vectorizable_store by recognizing the gather/scatters
inline through VMAT_GATHER_SCATTER.

This works fine for normal gather and scatters but doesn't work for widening
gathers and narrowing scatters which only the pattern seems to handle.

I don't know how to get this detected through get_load_store_type since, well,
that's very late.  Among other things we've already determined the VF and the
unpacks have already been marked relevant, so
vectorizable_load/vectorizable_store would have to actively change the IL.

So I don't know how widening and narrowing operations are supposed to work
here.  Given that, I will leave it up to the maintainers I guess.
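
For reference, an example (my construction) of the kind of widening gather
that only the pattern path handled: 16-bit elements gathered through 32-bit
indices and extended, i.e. an extending ld1sh gather on SVE:

void
widening_gather (int *restrict out, short *restrict base,
                 int *restrict idx, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = base[idx[i]];   /* load short, widen to int */
}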

[Bug tree-optimization/86530] Vectorization failure for a simple loop

2024-02-26 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86530

--- Comment #8 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #6)
> With my patch for V4QI, we still don't get the best code:
>   vect_perm_even_271 = VEC_PERM_EXPR <_264, _264, { 0, 2, 4, 6 }>;
>   vect_perm_even_273 = VEC_PERM_EXPR <_268, _268, { 0, 2, 4, 6 }>;
>   vect_perm_even_275 = VEC_PERM_EXPR <vect_perm_even_271, vect_perm_even_273, { 0, 2, 4, 6 }>;
> 
> _275={_264[0], _264[2], _268[0], _268[2]} or
> VEC_PERM<_264, _268, {0, 2, 4, 6}>
> 
> but for some reason we don't reduce it to that perm
> 
> And there is still a lot of extra PERMS than there should be.

Because this loop is not something that can be fixed by using V4QI (we tried
before).

This loop requires improvements to SCEV and SLP. It's loading 16 sequential
bytes, as there's no gap between the p1 and p2 values across iterations,
so this loop should be vectorized with V16QI and widening additions. So I don't
think this is related to the other example.

So I'll take it back, as it requires actual vectorizer work and is part of the
things we're trying to address in GCC 15.

[Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7

2024-02-26 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #27 from Tamar Christina  ---
Created attachment 57538
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57538&action=edit
proposed1.patch

proposed patch, this gets the gathers and scatters back. doing regression run.

[Bug tree-optimization/114099] [14 regression] ICE in find_uses_to_rename_use when building darktable-4.6.1

2024-02-25 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114099

--- Comment #8 from Tamar Christina  ---
Created attachment 57537
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57537&action=edit
uses.patch

new code seems sensitive to visitation order as get_virtual_phi returns NULL
for blocks which don't define or use a virtual PHI.

testing patch.

[Bug middle-end/114081] [14 regression] ICE in verify_dominators when building php-8.3.3 (error: dominator of 16 should be 111, not 3) since r14-6822

2024-02-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114081

Tamar Christina  changed:

   What|Removed |Added

   Assignee|unassigned at gcc dot gnu.org  |tnfchris at gcc dot 
gnu.org

--- Comment #6 from Tamar Christina  ---
slightly cleaned up testcase

---
typedef struct filter_list_entry {
  const char *name;
  int id;
  void (*function)();
} filter_list_entry;

static const filter_list_entry filter_list[9] = {0};

void php_zval_filter(int filter, int id1) {
  filter_list_entry filter_func;

  int size = 9;
  for (int i = 0; i < size; ++i) {
if (filter_list[i].id == filter) {
  filter_func = filter_list[i];
  goto done;
}
  }

#pragma GCC novector
  for (int i = 0; i < size; ++i) {
if (filter_list[i].id == 0x0204) {
  filter_func = filter_list[i];
  goto done;
}
  }
done:
  if (!filter_func.id)
filter_func.function();
}
---

Seems to happen because it's doing inner-loop vectorization with two loops
that have early exits to the same destination.

  if (multiple_exits_p)
{
  update_loop = new_loop;
  doms = get_all_dominated_blocks (CDI_DOMINATORS, loop->header);
  for (unsigned i = 0; i < doms.length (); ++i)
if (flow_bb_inside_loop_p (loop, doms[i]))
  doms.unordered_remove (i);
}

This was supposed to update the dominators, but something goes wrong.

Mine, but for Monday.

[Bug tree-optimization/114068] [14 regression] ICE when building darktable-4.6.1 (error: PHI node with wrong VUSE on edge from BB 25) since r14-8768

2024-02-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114068

--- Comment #14 from Tamar Christina  ---
patch submitted
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646415.html

[Bug tree-optimization/114068] [14 regression] ICE when building darktable-4.6.1 (error: PHI node with wrong VUSE on edge from BB 25) since r14-8768

2024-02-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114068

--- Comment #13 from Tamar Christina  ---
Created attachment 57510
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57510&action=edit
candidate-patch1.patch

candidate patch being tested.

I was hoping to correct it during peeling itself when the merge block is
created, but I don't have enough information there to do so.

[Bug tree-optimization/114068] [14 regression] ICE when building darktable-4.6.1 (error: PHI node with wrong VUSE on edge from BB 25) since r14-8768

2024-02-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114068

--- Comment #12 from Tamar Christina  ---
Looks like the moving of the store didn't update a stray out-of-block use of
the MEM.
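
I.e. something of this shape (made-up SSA names and block numbers, just to
illustrate the verifier complaint):

  # .MEM_5 = VDEF <.MEM_4>                  <- the moved store's new VDEF
  ...
  # .MEM_7 = PHI <.MEM_4(25), .MEM_6(24)>   <- the edge from bb 25 still
                                               carries the pre-move VUSE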

working on patch.

[Bug tree-optimization/114068] [14 regression] ICE when building darktable-4.6.1 (error: PHI node with wrong VUSE on edge from BB 25) since r14-8768

2024-02-23 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114068

Tamar Christina  changed:

   What|Removed |Added

   Assignee|unassigned at gcc dot gnu.org  |tnfchris at gcc dot 
gnu.org
 Status|NEW |ASSIGNED

--- Comment #11 from Tamar Christina  ---
reduced to

---
struct h {
  int b;
  int f;
} k;

void n(int m) {
  struct h a = k;
  for (int o = m; o; ++o) {
if (a.f)
  __builtin_unreachable();
if (o > 1)
  __builtin_unreachable();
*(&a.b + o) = 1;
  }
}
---

which fails on aarch64 too

[Bug target/114063] New: Use IFN_CHECK_RAW_PTRS/IFN_CHECK_WAR_PTRS for Advanced SIMD

2024-02-22 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114063

Bug ID: 114063
   Summary: Use IFN_CHECK_RAW_PTRS/IFN_CHECK_WAR_PTRS for
Advanced SIMD
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: target
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---
Target: aarch64*

The following example:

void fn (short *a, short *b, short *c, int n)
{
for (int i = 0; i < n; i += 2)
{
short b0 = b[i + 0];
short b1 = b[i + 1];
a[i + 0] = b0 + 2;
a[i + 1] = b1 + 3;
}
}

when compiled with -O3 -march=armv9-a

will generate an alias check using whilewr, which is the instruction
corresponding to the CHECK_WAR_PTRS alias check.

But when compiling for a known CPU, such as -mcpu=neoverse-v2, the vectorizer
chooses to use Adv. SIMD for the main loop, and so the check for supporting
IFN_CHECK_WAR_PTRS fails, as we only support SVE modes.

Since all that really matters is the element size, I believe we can still use
IFN_CHECK_WAR_PTRS for Adv. SIMD using the SVE instruction if SVE is allowed.

That is, we should implement the pattern for Advanced SIMD modes as well gated
on TARGET_SVE.
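
For reference, a scalar sketch of the property the WAR check establishes for
a loop that reads b[i] and writes a[i].  This is my formulation for
illustration only (war_check_ok is a made-up name; the optab documentation is
authoritative):

---
#include <stddef.h>

/* Processing vf lanes at a time is safe when the write pointer does not
   land inside the vf-1 elements still to be read in the same vector,
   i.e. when the element distance a - b is not in (0, vf).  Only the
   element size matters here, not the vector mode, which is why the SVE
   instruction can also serve an Advanced SIMD loop of the same element
   size.  */
static inline int
war_check_ok (short *a, short *b, ptrdiff_t vf)
{
  ptrdiff_t d = a - b;
  return d <= 0 || d >= vf;
}
---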

[Bug tree-optimization/114061] GCC fails vectorization when using __builtin_prefetch

2024-02-22 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114061

--- Comment #4 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #3)
> Confirmed.
> 
> Though maybe we should drop them in the vectorized version of the loop. HW
> prefetchers usually do a decent job and sometimes (maybe most) SW hinted
> prefetches interfere with the HW prefetchers.

I definitely agree; I'm not sure how useful they are, but some customers
definitely seem to want them.

[Bug tree-optimization/114061] GCC fails vectorization when using __builtin_prefetch

2024-02-22 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114061

--- Comment #2 from Tamar Christina  ---
(In reply to Andrew Pinski from comment #1)
> I thought there was already one recorded about this.

I could only find https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103938 about an
ICE when prefetching a vector address.

[Bug tree-optimization/114061] New: GCC fails vectorization when using __builtin_prefetch

2024-02-22 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114061

Bug ID: 114061
   Summary: GCC fails vectorization when using __builtin_prefetch
   Product: gcc
   Version: 14.0
Status: UNCONFIRMED
  Keywords: missed-optimization
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

The following example:

void foo(double * restrict a, double * restrict b, int n){
  int i;
  for(i=0; i<n; i++){
    a[i] = a[i] + b[i];
    __builtin_prefetch(&(b[i+8]));
  }
}

[Bug target/113257] -march=native or -mcpu=native are ineffective, but -march=native -mcpu=native works on arm64 M2 Ultra

2024-02-22 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113257

--- Comment #5 from Tamar Christina  ---
(In reply to Sam James from comment #3)
> (In reply to Richard Earnshaw from comment #2)
> I'm missing why the combination then works though?

So we've made several changes here over time.

-mcpu=native does attempt to find the core id to know what it is, but since
GCC-12 we also enable extensions that don't have a requirement on an
architecture.

So for instance on an SVE-capable core we would enable SVE, but we of course
can't enable a tuning model.

But between GCC 13 and GCC 14 many things have changed.  It looks like GCC-14
works as expected:

> ./install/bin/gcc -xc -S -o - - -march=native < /dev/null
.arch armv8-a+flagm+dotprod+rdma+lse+crc+aes+sha3+fp16fml+rcpc+i8mm+bf16+sb+ssbs+pauth
.file   ""
.text
.ident  "GCC: (GNU) 14.0.1 20240129 (experimental)"
.section        .note.GNU-stack,"",@progbits

> ./install/bin/gcc -xc -S -o - - -mcpu=native < /dev/null
.arch armv8-a+flagm+dotprod+rdma+lse+crc+aes+sha3+fp16fml+rcpc+i8mm+bf16+sb+ssbs+pauth
.file   ""
.text
.ident  "GCC: (GNU) 14.0.1 20240129 (experimental)"
.section        .note.GNU-stack,"",@progbits

> ./install/bin/gcc -xc -S -o - - -mcpu=native -march=native < /dev/null
.arch armv8-a+flagm+dotprod+rdma+lse+crc+aes+sha3+fp16fml+rcpc+i8mm+bf16+sb+ssbs+pauth
.file   ""
.text
.ident  "GCC: (GNU) 14.0.1 20240129 (experimental)"
.section        .note.GNU-stack,"",@progbits

So the first question is: can you confirm it works for GCC-14 for you too?

In the meantime I'll try GCC-13

[Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7

2024-02-22 Thread tnfchris at gcc dot gnu.org via Gcc-bugs
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Tamar Christina  changed:

   What|Removed |Added

 Ever confirmed|0   |1
Summary|[14 Regression] Fail to |[14 Regression] Fail to
   |fold the last element with  |fold the last element with
   |multiple loop   |multiple loop since
   ||g:2efe3a7de0107618397264017
   ||fb045f237764cc7
   Last reconfirmed||2024-02-22
 Status|UNCONFIRMED |NEW
   Keywords|needs-bisection |

--- Comment #26 from Tamar Christina  ---
(In reply to Richard Biener from comment #18)
> (In reply to Tamar Christina from comment #17)
> > Ok, bisected to
> > 
> > g:2efe3a7de0107618397264017fb045f237764cc7 is the first bad commit
> > commit 2efe3a7de0107618397264017fb045f237764cc7
> > Author: Hao Liu 
> > Date:   Wed Dec 6 14:52:19 2023 +0800
> > 
> > tree-optimization/112774: extend the SCEV CHREC tree with a nonwrapping
> > flag
> > 
> > Before this commit we were unable to analyse the stride of the access.
> > After this niters seems to estimate the loop trip count at 4 and after that
> > the logs diverge enormously.
> 
> Hum, but that's backward and would match to what I said in comment#2 - we
> should get better code with that.
> 

Ok, so I've dug more into this today.  It's definitely this commit that's
causing it.  The reason is we no longer consider masked gather/scatters.

Before this commit the gather pattern would trigger:

tresg.i:3:275: note:   gather/scatter pattern: detected: a[_2] = b.3_3;
tresg.i:3:275: note:   gather_scatter pattern recognized: .SCATTER_STORE ((sizetype) &a, _2, 4, b.3_3);

and the use of the masked scatter is what's causing the epilogue to not be
required and why it generates better code.  It's not the loads.

The issue is that vect_analyze_data_refs only considers gather/scatters IF DR
analysis fails, which it did before:

tresg.c:31:29: missed:  failed: evolution of offset is not affine.
base_address:
offset from base address:
constant offset from base address:
step:
base alignment: 0
base misalignment: 0
offset alignment: 0
step alignment: 0
base_object: array1
Access function 0: {{m_112 * 2, +, 24}_3, +, 2}_4
Access function 1: 0
Creating dr for array1[0][_8]

this now succeeds after the quoted commit:

success.
base_address: &array1
offset from base address: (ssizetype) ((sizetype) (m_111 * 2) * 2)
constant offset from base address: 0
step: 4
base alignment: 8
base misalignment: 0
offset alignment: 4
step alignment: 4
base_object: array1
Access function 0: {{m_112 * 2, +, 24}_3, +, 2}_4
Access function 1: 0
Creating dr for array1[0][_8]

so we never enter

  /* Check that analysis of the data-ref succeeded.  */
  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
  || !DR_STEP (dr))
{

and without the IFN scatters it tries converting the single-element
interleaving scalar stores into scatters:

tresg.c:29:22: note:   Detected single element interleaving array1[0][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[1][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[2][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[3][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[0][_1] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[1][_1] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[2][_1] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[3][_1] step 4
tresg.c:29:22: missed:   not consecutive access array2[_4][_8] = _70;
tresg.c:29:22: note:   using strided accesses
tresg.c:29:22: missed:   not consecutive access array2[_4][_1] = _68;
tresg.c:29:22: note:   using strided accesses

...

tresg.c:29:22: note:   using gather/scatter for strided/grouped access, scale = 2

but without the SCATTER_STORE IFN it never tries masking the scatter, so we
lose MASK_SCATTER_STORE and hence generate worse code, because the whole loop
can no longer be predicated.
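
For illustration (made-up operands and mask name, not from the dump), the
predicated form we lose looks like:

  .MASK_SCATTER_STORE ((sizetype) &a, _2, 4, b.3_3, loop_mask_42);

where the trailing loop mask is what lets the tail iterations be handled
in-loop instead of needing a scalar epilogue.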

However, trying to force it generates an ICE, so I guess it's not that simple.
