Issue |
89600
|
Summary |
vpand deferral eliminates earlier vpand elimination
|
Labels |
new issue
|
Assignees |
|
Reporter |
Validark
|
In my code, I have the following function:
```zig
export fn expand8xu8To16xu4AsByteVector(vec: @Vector(8, u8)) @Vector(16, u8) {
return std.simd.interlace(.{ vec, vec >> @splat(4) }) & @as(@Vector(16, u8), @splat(0xF));
}
```
Here is the Zen 3 assembly:
```asm
.LCPI0_0:
.zero 16,15
expand8xu8To16xu4AsByteVector:
vpsrlw xmm1, xmm0, 4
vpunpcklbw xmm0, xmm0, xmm1
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
ret
```
This implementation avoids an extraneous `vpand`, which LLVM unfortunately inserts in an implementation like:
```zig
export fn expand8xu8To16xu4AsByteVectorBad(vec: @Vector(8, u8)) @Vector(16, u8) {
return std.simd.interlace(.{ vec & @as(@Vector(8, u8), @splat(0xF)), vec >> @splat(4) });
}
```
```asm
.LCPI0_0:
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.byte 15
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.zero 1
.LCPI0_1:
.zero 16,15
expand8xu8To16xu4AsByteVectorBad:
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
vpsrlw xmm0, xmm0, 4
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI0_1]
vpunpcklbw xmm0, xmm1, xmm0
ret
```
The optimization in my first version is helpful because x86 does not have a `vpsrlb` instruction — it only has `vpsrlw`. That means the lower byte in each 2-byte pair will have 4 bits shifted in from the upper byte, requiring us to use a `vpand` to zero out the bits we don't want. But in this case, because we have to `vpand` to get the lowest 4 bits of the other vector we are interleaving anyway, we may as well interleave first, and then isolate the lowest 4 bits of all the bytes simultaneously.
Next, I define this function:
```zig
fn foo(vec: @Vector(16, u8)) [2]@Vector(16, u8) {
const vec2 = vec + vec;
const vec3 = vec2 | @as(@Vector(16, u8), @splat(1));
return @bitCast(std.simd.interlace(.{ vec2, vec3 }));
}
```
Zen 3 emit:
```asm
.LCPI1_0:
.zero 16,1
foo:
vpaddb xmm0, xmm0, xmm0; equivalent to multiply by 2 or shift left by 1
vpor xmm1, xmm0, xmmword ptr [rip + .LCPI1_0]
vpunpckhbw xmm2, xmm0, xmm1
vpunpcklbw xmm0, xmm0, xmm1
```
The problem comes when I compose the two aforementioned functions:
```zig
export fn baz(x: u64) [2]@Vector(16, u8) {
return foo(expand8xu8To16xu4AsByteVector(@bitCast(x)));
}
```
```asm
.LCPI2_0:
.zero 16,15
.LCPI2_1:
.zero 16,30
.LCPI2_2:
.zero 16,1
baz:
vmovq xmm0, rdi
vpsrlw xmm1, xmm0, 4
vpand xmm1, xmm1, xmmword ptr [rip + .LCPI2_0]; Unnecessary! We wanted to avoid this instruction!
vpunpcklbw xmm0, xmm0, xmm1
vpaddb xmm0, xmm0, xmm0
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI2_1]; LLVM tries to be cool here and use `0xF << 1` after `xmm0 + xmm0`
vpor xmm1, xmm0, xmmword ptr [rip + .LCPI2_2]
vpunpckhbw xmm2, xmm0, xmm1
vpunpcklbw xmm0, xmm0, xmm1
```
Expected emit:
```asm
.LCPI2_1:
.zero 16,30
.LCPI2_2:
.zero 16,1
baz:
vmovq xmm0, rdi
vpsrlw xmm1, xmm0, 4
vpunpcklbw xmm0, xmm0, xmm1
vpaddb xmm0, xmm0, xmm0
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI2_1]
vpor xmm1, xmm0, xmmword ptr [rip + .LCPI2_2]
vpunpckhbw xmm2, xmm0, xmm1
vpunpcklbw xmm0, xmm0, xmm1
```
Alternatively (a less cool, but straightforward concatenation of the implementations of `expand8xu8To16xu4AsByteVector` and `foo`, where we `vpand` with `0xF` before the `vpaddb` rather than `vpand` with `0xF << 1` after the `vpaddb`):
```asm
.LCPI2_1:
.zero 16,15
.LCPI2_2:
.zero 16,1
baz:
vmovq xmm0, rdi
vpsrlw xmm1, xmm0, 4
vpunpcklbw xmm0, xmm0, xmm1
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI2_1]
vpaddb xmm0, xmm0, xmm0
vpor xmm1, xmm0, xmmword ptr [rip + .LCPI2_2]
vpunpckhbw xmm2, xmm0, xmm1
vpunpcklbw xmm0, xmm0, xmm1
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs