| Issue |
174504
|
| Summary |
Use `pmovmskb` instead of `@llvm.vector.reduce.or.v16i8` where possible
|
| Labels |
backend:X86,
missed-optimization
|
| Assignees |
|
| Reporter |
Kmeakin
|
Same as https://github.com/llvm/llvm-project/issues/174500, but x86 doesn't have a horizontal `umax` instruction either. However, we can use `pmovmskb` to extract the sign bit of each byte and then compare the resulting mask against zero:
```c++
// Scalar reproducer: returns true iff every one of the 16 bytes starting at `p`
// is below 0x80, i.e. has its most significant bit clear.
auto src16(u8* p) {
auto ret = true;
// Fixed trip count of 16 with no early exit, so all 16 bytes are always read.
for (usize i = 0; i < 16; i++) {
// AND-accumulate the per-byte result; any byte >= 0x80 makes `ret` false.
ret &= p[i] < 0x80;
}
return ret;
}
// Hand-written target: the same "no byte has its top bit set" predicate,
// evaluated on one 16-byte vector with a byte movemask.
auto tgt16(i8x16* p) {
// Load the vector at p[0].
i8x16 v0 = p[0];
// _mm_movemask_epi8 packs the most significant bit of each byte into an
// integer mask, so a zero mask means no byte has its top bit set.
return _mm_movemask_epi8(v0) == 0;
}
```
```llvm
; IR for src16: the 16-iteration loop has become a single vector OR-reduction.
define dso_local noundef zeroext i1 @src16(unsigned char*)(ptr noundef readonly captures(none) %0) local_unnamed_addr #0 {
; Load all 16 bytes as one vector, with byte alignment only (align 1).
%2 = load <16 x i8>, ptr %0, align 1
; OR the 16 lanes together into a single i8.
%3 = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %2)
; Signed "greater than -1" is true exactly when bit 7 of the OR result is clear.
%4 = icmp sgt i8 %3, -1
ret i1 %4
}
; IR for tgt16: per-lane sign test, packed into a 16-bit mask and compared to 0.
define dso_local noundef zeroext i1 @tgt16(signed char vector[16]*)(ptr noundef readonly captures(none) %0) local_unnamed_addr #1 {
; Load the 16-byte vector; this load is marked 16-byte aligned.
%2 = load <16 x i8>, ptr %0, align 16
; Per-lane "is negative" test, i.e. one i1 per byte holding its sign bit.
%3 = icmp slt <16 x i8> %2, zeroinitializer
; Reinterpret the 16 i1 lanes as a single 16-bit mask.
%4 = bitcast <16 x i1> %3 to i16
; The result is true when no lane had its sign bit set.
%5 = icmp eq i16 %4, 0
ret i1 %5
}
```
```asm
# Current codegen for src16: a shuffle/shift + OR tree that folds all 16 bytes
# into the low byte of xmm0, followed by a test of that byte's top bit.
src16(unsigned char*):
# Unaligned 16-byte load from the pointer in rdi.
movdqu xmm0, xmmword ptr [rdi]
# Immediate 238 (0xEE) selects dwords {2,3,2,3}: upper 64 bits moved into the lower half.
pshufd xmm1, xmm0, 238
# The low 64 bits of xmm1 now hold (low half | high half).
por xmm1, xmm0
# Immediate 85 (0x55) broadcasts dword 1 to every dword position.
pshufd xmm0, xmm1, 85
# Dword 0 of xmm0 now holds the OR of all four original dwords.
por xmm0, xmm1
movdqa xmm1, xmm0
# Shift each dword right by 16 so the upper word lines up with the lower word.
psrld xmm1, 16
# The low 16 bits now hold the OR of all eight original words.
por xmm1, xmm0
movdqa xmm0, xmm1
# Shift each word right by 8 so the upper byte lines up with the lower byte.
psrlw xmm0, 8
# The low byte now holds the OR of all 16 original bytes.
por xmm0, xmm1
movd eax, xmm0
# Invert and shift so that al = 1 when bit 7 of the OR result was clear, else 0.
not al
shr al, 7
ret
# Codegen for tgt16: a single pmovmskb followed by a zero test.
tgt16(signed char vector[16]*):
# Aligned 16-byte load from the pointer in rdi.
movdqa xmm0, xmmword ptr [rdi]
# Gather the most significant bit of each of the 16 bytes into the low bits of eax.
pmovmskb eax, xmm0
# al = 1 when the mask is zero, i.e. no byte had its top bit set.
test eax, eax
sete al
ret
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs