| Issue |
185224
|
| Summary |
[AArch64] Suboptimal code for testing if all elements of a NEON vector are equal
|
| Labels |
backend:AArch64,
missed-optimization
|
| Assignees |
|
| Reporter |
Kmeakin
|
# C++ code
https://godbolt.org/z/Wa61E4dTf
```c++
#include <arm_neon.h>
// Reproducer: returns true when every lane of `v` equals lane 0,
// i.e. when all four 32-bit elements of the vector are equal.
bool src1(uint32x4_t v) {
bool ret = true;
// Compare each lane against lane 0 and AND the per-lane results together
// (the i == 0 iteration compares lane 0 with itself).
for (int i = 0; i < 4; i++) ret &= (v[i] == v[0]);
return ret;
}
// Proposed form: returns true when the unsigned maximum across all lanes of
// `v` equals the unsigned minimum across all lanes, which holds exactly when
// all four elements are equal.
bool tgt1(uint32x4_t v) {
auto max = vmaxvq_u32(v);  // across-vector unsigned maximum
auto min = vminvq_u32(v);  // across-vector unsigned minimum
return max == min;
}
```
# Generated assembly
```asm
src1:
dup v1.4s, v0.s[0]
mov w8, #1
cmeq v0.4s, v0.4s, v1.4s
mvn v0.16b, v0.16b
umaxv s0, v0.4s
fmov w9, s0
bic w0, w8, w9
ret
tgt1:
umaxv s1, v0.4s
uminv s0, v0.4s
fmov w8, s1
fmov w9, s0
cmp w8, w9
cset w0, eq
ret
```
# MCA timings
The first report below is for `src1`, the second for `tgt1`.
```asm
Iterations: 100
Instructions: 800
Total Cycles: 1602
Total uOps: 800
Dispatch Width: 3
uOps Per Cycle: 0.50
IPC: 0.50
Block RThroughput: 2.7
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 3 0.50 dup v1.4s, v0.s[0]
1 1 0.33 mov w8, #1
1 3 0.50 cmeq v0.4s, v0.4s, v1.4s
1 3 0.50 mvn v0.16b, v0.16b
1 4 0.50 umaxv s0, v0.4s
1 3 0.50 fmov w9, s0
1 1 0.33 bic w0, w8, w9
1 1 1.00 U ret
```
```asm
Iterations: 100
Instructions: 700
Total Cycles: 901
Total uOps: 700
Dispatch Width: 3
uOps Per Cycle: 0.78
IPC: 0.78
Block RThroughput: 2.3
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 4 0.50 umaxv s1, v0.4s
1 4 0.50 uminv s0, v0.4s
1 3 0.50 fmov w8, s1
1 3 0.50 fmov w9, s0
1 1 0.33 cmp w8, w9
1 1 0.33 cset w0, eq
1 1 1.00 U ret
```
# Alive proof
https://alive2.llvm.org/ce/z/Q9ghYB
```llvm-ir
----------------------------------------
define i1 @src1(<4 x i32> noundef %v) noundef {
entry:
%#0 = shufflevector <4 x i32> noundef %v, <4 x i32> poison, 0, 0, 0, 0
%#1 = icmp ne <4 x i32> noundef %v, %#0
%#2 = bitcast <4 x i1> %#1 to i4
%#3 = icmp eq i4 %#2, 0
ret i1 %#3
}
=>
define i1 @tgt1(<4 x i32> noundef %v) noundef {
entry:
%vmaxvq_u32.i = reduce_umax <4 x i32> noundef %v
%vminvq_u32.i = reduce_umin <4 x i32> noundef %v
%cmp = icmp eq i32 %vmaxvq_u32.i, %vminvq_u32.i
ret i1 %cmp
}
Transformation seems to be correct!
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs