Issue 174504
Summary Use `pmovmskb` instead of `@llvm.vector.reduce.or.v16i8` where possible
Labels backend:X86, missed-optimization
Assignees
Reporter Kmeakin
    Same as https://github.com/llvm/llvm-project/issues/174500, but x86 doesn't have a horizontal umax either. However, we can use `pmovmskb` to extract the sign bit of each byte:

```c++
// Scalar reference: returns true when every one of the 16 bytes at p is
// below 0x80, i.e. when no byte has its most significant bit set.
// (u8 / usize are typedefs not shown here; the IR listing names the
// parameter type as `unsigned char*`.)
auto src16(u8* p) {
    auto ret = true;
    for (usize i = 0; i < 16; i++) {
        // `&=` does not short-circuit, so all 16 bytes are always read.
        ret &= p[i] < 0x80;
    }
    return ret;
}

// Desired codegen, written by hand: the same "no byte has its top bit set"
// test expressed with the movemask intrinsic.
// _mm_movemask_epi8 collects the most significant bit of each of the 16
// bytes into an integer mask, so the mask is zero exactly when every byte
// is below 0x80.
// NOTE(review): the load goes through an i8x16* (a 16 x signed char vector
// per the IR listing), which the compiler treats as 16-byte aligned
// (`align 16` / `movdqa`), whereas src16 only assumes byte alignment.
auto tgt16(i8x16* p) {
    i8x16 v0 = p[0];
    return _mm_movemask_epi8(v0) == 0;
}
```

```llvm
define dso_local noundef zeroext i1 @src16(unsigned char*)(ptr noundef readonly captures(none) %0) local_unnamed_addr #0 {
  %2 = load <16 x i8>, ptr %0, align 1
  %3 = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %2)
  %4 = icmp sgt i8 %3, -1
  ret i1 %4
}

define dso_local noundef zeroext i1 @tgt16(signed char vector[16]*)(ptr noundef readonly captures(none) %0) local_unnamed_addr #1 {
  %2 = load <16 x i8>, ptr %0, align 16
  %3 = icmp slt <16 x i8> %2, zeroinitializer
  %4 = bitcast <16 x i1> %3 to i16
  %5 = icmp eq i16 %4, 0
  ret i1 %5
}
```

```asm
src16(unsigned char*):
        movdqu xmm0, xmmword ptr [rdi]
        pshufd  xmm1, xmm0, 238
        por     xmm1, xmm0
        pshufd  xmm0, xmm1, 85
        por     xmm0, xmm1
        movdqa  xmm1, xmm0
        psrld   xmm1, 16
        por     xmm1, xmm0
        movdqa  xmm0, xmm1
        psrlw   xmm0, 8
        por     xmm0, xmm1
        movd    eax, xmm0
        not     al
        shr     al, 7
        ret

tgt16(signed char vector[16]*):
        movdqa  xmm0, xmmword ptr [rdi]
        pmovmskb        eax, xmm0
        test    eax, eax
        sete    al
        ret
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to