| | |
|---|---|
| Issue | 174500 |
| Summary | [AArch64] Use `@llvm.vector.reduce.umax.v16i8` instead of `@llvm.vector.reduce.or.v16i8` where possible |
| Labels | backend:AArch64, missed-optimization |
| Assignees | |
| Reporter | Kmeakin |
https://godbolt.org/z/rcEGWdc1z
Consider this code for checking if 16 bytes are ASCII:
```c++
#include <cstdint>
using u8 = uint8_t;
using u8x16 = u8 __attribute__((vector_size(16)));
using u64 = uint64_t;
using usize = unsigned long;
auto src16(u8* p) {
    auto ret = true;
    for (usize i = 0; i < 16; i++) {
        ret &= p[i] < 0x80;
    }
    return ret;
}
```
This is auto-vectorised to
```llvm
define dso_local noundef i1 @src16(unsigned char*)(ptr noundef readonly captures(none) %0) local_unnamed_addr #1 {
  %2 = load <16 x i8>, ptr %0, align 1
  %3 = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %2)
  %4 = icmp sgt i8 %3, -1
  ret i1 %4
}
```
but since AArch64 has no instruction for a horizontal OR reduction, the backend lowers it to a halving fold of ORs and shifts:
```asm
src16(unsigned char*):
        ldr     q0, [x0]
        ext     v1.16b, v0.16b, v0.16b, #8
        orr     v0.8b, v0.8b, v1.8b
        fmov    x8, d0
        orr     x8, x8, x8, lsr #32
        lsr     x9, x8, #16
        orr     w8, w8, w9
        orr     w8, w8, w8, lsr #8
        ubfx    w8, w8, #7, #1
        eor     w0, w8, #0x1
        ret
```
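For reference, the scalar tail of that listing is the generic log2 halving fold. A rough C++ sketch of what it computes (hand-written to mirror the instructions above; the helper name is made up for illustration):

```c++
#include <cstdint>

// OR-reduce 16 bytes, taking the two 64-bit halves of the vector,
// mirroring the lowering above: one vector ORR, then scalar halving.
static uint8_t or_reduce_halves(uint64_t lo, uint64_t hi) {
    uint64_t x = lo | hi; // orr v0.8b, v0.8b, v1.8b (after the ext)
    x |= x >> 32;         // orr x8, x8, x8, lsr #32
    x |= x >> 16;         // lsr x9, x8, #16 ; orr w8, w8, w9
    x |= x >> 8;          // orr w8, w8, w8, lsr #8
    return static_cast<uint8_t>(x);
}
```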
A horizontal unsigned max gives the same answer for this predicate: the OR of all lanes has its top bit set exactly when some lane does, which is exactly when the unsigned max is at least 0x80. AArch64 has a native reduction instruction for this (`umaxv`), so the resulting assembly is much better:
```c++
auto tgt16(u8x16* p) {
    u8x16 v0 = p[0];
    return __builtin_reduce_max(v0) < 0x80;
}
```
```llvm
define dso_local noundef i1 @tgt16(unsigned char vector[16]*)(ptr noundef readonly captures(none) %0) local_unnamed_addr #0 {
  %2 = load <16 x i8>, ptr %0, align 16
  %3 = tail call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %2)
  %4 = icmp sgt i8 %3, -1
  ret i1 %4
}
```
```asm
tgt16(unsigned char vector[16]*):
        ldr     q0, [x0]
        umaxv   b0, v0.16b
        fmov    w8, s0
        ubfx    w8, w8, #7, #1
        eor     w0, w8, #0x1
        ret
```
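For completeness, the two reductions can be cross-checked for this predicate with Clang's reduce builtins (`__builtin_reduce_or` / `__builtin_reduce_max`). A minimal standalone sketch, random testing only rather than a proof:

```c++
#include <cstdint>
#include <cstdio>
#include <random>

using u8 = uint8_t;
using u8x16 = u8 __attribute__((vector_size(16)));

// "All 16 bytes are ASCII", once via an OR reduction, once via a max reduction.
static bool ascii_via_or(u8x16 v) { return (__builtin_reduce_or(v) & 0x80) == 0; }
static bool ascii_via_max(u8x16 v) { return __builtin_reduce_max(v) < 0x80; }

int main() {
    std::mt19937 rng(0);
    for (int iter = 0; iter < 1000000; iter++) {
        u8x16 v;
        for (int i = 0; i < 16; i++)
            v[i] = static_cast<u8>(rng());
        if (ascii_via_or(v) != ascii_via_max(v)) {
            std::printf("mismatch\n");
            return 1;
        }
    }
    std::printf("or-reduction and umax-reduction agree on the sign-bit test\n");
    return 0;
}
```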