| Issue |
61683
|
| Summary |
x86 AVX2: Inefficient code when extracting vector values as a bitmask (through boolean vector, aiming for `movmsk`)
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
He3lixxx
|
Consider this piece of code with `as_bitmask` being a function that should simply extract the truthiness of each vector element and return them as a bitmask.
```c++
#include <cstddef>
#include <cstdint>
template<class VectorT>
uint8_t as_bitmask(VectorT vec) {
constexpr size_t NUM_MASK_BITS = sizeof(VectorT) / sizeof(VectorT{}[0]);
using MaskVecT __attribute__((ext_vector_type(NUM_MASK_BITS))) = bool;
MaskVecT mask_vector = __builtin_convertvector(vec, MaskVecT);
return reinterpret_cast<uint8_t&>(mask_vector);
}
using uint32x4_t __attribute__((vector_size(16))) = uint32_t;
using uint32x8_t __attribute__((vector_size(32))) = uint32_t;
template uint8_t as_bitmask(uint32x4_t);
template uint8_t as_bitmask(uint32x8_t);
```
When compiling with `-O3 -mavx2`, clang currently generates this code ([godbolt](https://godbolt.org/z/PMKMfx1vx))
```asm
unsigned char as_bitmask<unsigned int __vector(4)>(unsigned int __vector(4)):
vpxor xmm1, xmm1, xmm1 ; xmm1 = [0] * 4
vpcmpeqd xmm0, xmm0, xmm1 ; xmm0 = [-1 if el == 0 else 0 for el in input]
vpcmpeqd xmm1, xmm1, xmm1 ; xmm1 = [-1] * 4 (all bits set)
vpxor xmm0, xmm0, xmm1 ; invert xmm0 -> xmm0 = [0 if el == 0 else -1 for el in input]
vpackssdw xmm0, xmm0, xmm0 ; compress 4B values to 2B, repeating values
vpsllw xmm0, xmm0, 15 ; superfluous left-shift?
vpacksswb xmm0, xmm0, xmm0 ; compress 2B values to 1B, again repeating values
vpmovmskb eax, xmm0 ; extract from xmm0, giving [0 if el == 0 else 1 for el in input] * 4
ret
unsigned char as_bitmask<unsigned int __vector(8)>(unsigned int __vector(8)):
vpxor xmm1, xmm1, xmm1 ; ymm1 = [0] * 8
vpcmpeqd ymm0, ymm0, ymm1 ; ymm0 = [-1 if el == 0 else 0 for el in input]
vmovmskps eax, ymm0 ; extract from ymm0, giving [1 if el == 0 else 0 for el in input]
not eax ; bitwise-complement result -> [0 if el == 0 else 1 for el in input]
vzeroupper
ret
```
It looks to me as if the `uint32x4_t`/`xmm`-variant could be simplified here, using the same approach that is used in the `uint32x8_t`/`ymm`-variant, saving 4 instructions:
```asm
unsigned char as_bitmask<unsigned int __vector(4)>(unsigned int __vector(4)):
vpxor xmm1, xmm1, xmm1 ; xmm1 = [0] * 4
vpcmpeqd xmm0, xmm0, xmm1 ; xmm0 = [-1 if el == 0 else 0 for el in input]
movmskps eax, xmm0 ; extract from xmm0, giving [1 if el == 0 else 0 for el in input]
not eax ; bitwise-complement result -> [0 if el == 0 else 1 for el in input]
ret
```
The problem does not occur when compiling for newer architectures supporting AVX512, as LLVM uses vector masks then.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs