Issue 185224
Summary [AArch64] Suboptimal code for testing if all elements of a NEON vector are equal
Labels backend:AArch64, missed-optimization
Assignees
Reporter Kmeakin
# C code
https://godbolt.org/z/Wa61E4dTf
```c++
#include <arm_neon.h>

// Reference form of the predicate: returns true iff every lane of `v`
// compares equal to lane 0, i.e. all four 32-bit elements are identical.
bool src1(uint32x4_t v) {
    bool ret = true;
    // Lane 0 is compared against itself, so only lanes 1..3 can clear `ret`.
    for (int i = 0; i < 4; i++) ret &= (v[i] == v[0]);
    return ret;
}

// Proposed target form of the same predicate: all lanes are equal exactly
// when the unsigned maximum and the unsigned minimum across the vector match.
bool tgt1(uint32x4_t v) {
    auto max = vmaxvq_u32(v);  // largest lane value (unsigned reduction)
    auto min = vminvq_u32(v);  // smallest lane value (unsigned reduction)
    return max == min;
}
```

# Generated assembly
```asm
// src1: compiler output for the lane-by-lane comparison loop.
// In:  v0 = input vector (four 32-bit lanes)
// Out: w0 = 1 if every lane equals lane 0, otherwise 0
src1:
        dup     v1.4s, v0.s[0]          // v1 = lane 0 of v0 copied into every lane
        mov     w8, #1                  // constant 1, consumed by the final bic
        cmeq    v0.4s, v0.4s, v1.4s     // per lane: all-ones if equal to lane 0, else 0
        mvn     v0.16b, v0.16b          // invert: all-ones now marks a lane that differs
 umaxv   s0, v0.4s                      // across-vector max: nonzero iff some lane differs
        fmov    w9, s0                  // move the reduction result to a general register
        bic     w0, w8, w9              // w0 = 1 & ~w9: 1 when no lane differed, else 0
 ret

// tgt1: compiler output for the max/min reduction form.
// In:  v0 = input vector (four 32-bit lanes)
// Out: w0 = 1 if the unsigned max and min across the lanes are equal, otherwise 0
tgt1:
        umaxv   s1, v0.4s               // s1 = unsigned maximum across the four lanes
        uminv   s0, v0.4s               // s0 = unsigned minimum across the four lanes
 fmov    w8, s1                         // move max to a general register
        fmov    w9, s0                  // move min to a general register
        cmp     w8, w9                  // max == min holds only when all lanes are equal
 cset    w0, eq                         // w0 = 1 on equality, else 0
        ret
```

# MCA timings
```asm
Iterations:        100
Instructions:      800
Total Cycles:      1602
Total uOps:        800

Dispatch Width:    3
uOps Per Cycle:    0.50
IPC:               0.50
Block RThroughput: 2.7


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      3     0.50                        dup	v1.4s, v0.s[0]
 1      1     0.33                        mov	w8, #1
 1      3     0.50                        cmeq	v0.4s, v0.4s, v1.4s
 1      3     0.50                        mvn	v0.16b, v0.16b
 1      4     0.50                        umaxv	s0, v0.4s
 1      3     0.50                        fmov	w9, s0
 1      1     0.33                        bic	w0, w8, w9
 1      1     1.00                  U     ret
```

```asm
Iterations:        100
Instructions:      700
Total Cycles:      901
Total uOps:        700

Dispatch Width:    3
uOps Per Cycle:    0.78
IPC:               0.78
Block RThroughput: 2.3


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      4     0.50                        umaxv	s1, v0.4s
 1      4     0.50                        uminv	s0, v0.4s
 1      3     0.50                        fmov	w8, s1
 1      3     0.50                        fmov	w9, s0
 1      1     0.33                        cmp	w8, w9
 1      1     0.33                        cset	w0, eq
 1      1     1.00                  U     ret
```

# Alive proof
https://alive2.llvm.org/ce/z/Q9ghYB
```llvm-ir

----------------------------------------
define i1 @src1(<4 x i32> noundef %v) noundef {
entry:
  %#0 = shufflevector <4 x i32> noundef %v, <4 x i32> poison, 0, 0, 0, 0
  %#1 = icmp ne <4 x i32> noundef %v, %#0
  %#2 = bitcast <4 x i1> %#1 to i4
  %#3 = icmp eq i4 %#2, 0
  ret i1 %#3
}
=>
define i1 @tgt1(<4 x i32> noundef %v) noundef {
entry:
  %vmaxvq_u32.i = reduce_umax <4 x i32> noundef %v
  %vminvq_u32.i = reduce_umin <4 x i32> noundef %v
  %cmp = icmp eq i32 %vmaxvq_u32.i, %vminvq_u32.i
  ret i1 %cmp
}
Transformation seems to be correct!
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to