Issue 89600
Summary: vpand deferral eliminates earlier vpand elimination
Labels: new issue
Assignees:
Reporter: Validark
In my code, I have the following function:

```zig
const std = @import("std");

export fn expand8xu8To16xu4AsByteVector(vec: @Vector(8, u8)) @Vector(16, u8) {
    return std.simd.interlace(.{ vec, vec >> @splat(4) }) & @as(@Vector(16, u8), @splat(0xF));
}
```
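
For concreteness, here is a minimal test sketch of the intended behavior (the input byte is arbitrary), assuming the usual `std.simd.interlace` element order of `a[0], b[0], a[1], b[1], ...`:

```zig
test "expand8xu8To16xu4AsByteVector splits bytes into nibbles" {
    const in: @Vector(8, u8) = .{ 0xAB, 0, 0, 0, 0, 0, 0, 0 };
    const out = expand8xu8To16xu4AsByteVector(in);
    // 0xAB becomes the adjacent pair (low nibble, high nibble).
    try std.testing.expectEqual(@as(u8, 0xB), out[0]);
    try std.testing.expectEqual(@as(u8, 0xA), out[1]);
}
```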

Here is the Zen 3 assembly:

```asm
.LCPI0_0:
        .zero   16,15
expand8xu8To16xu4AsByteVector:
        vpsrlw  xmm1, xmm0, 4
        vpunpcklbw      xmm0, xmm0, xmm1
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI0_0]
        ret
```

This implementation avoids an extraneous `vpand`, which LLVM unfortunately inserts in an implementation like:

```zig
export fn expand8xu8To16xu4AsByteVectorBad(vec: @Vector(8, u8)) @Vector(16, u8) {
    return std.simd.interlace(.{ vec & @as(@Vector(8, u8), @splat(0xF)), vec >> @splat(4) });
}
```

```asm
.LCPI0_0:
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .byte   15
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
        .zero   1
.LCPI0_1:
        .zero   16,15
expand8xu8To16xu4AsByteVectorBad:
        vpand   xmm1, xmm0, xmmword ptr [rip + .LCPI0_0]
        vpsrlw  xmm0, xmm0, 4
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI0_1]
        vpunpcklbw      xmm0, xmm1, xmm0
        ret
```
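
Both versions compute the same 16 nibbles; only the instruction selection differs. A quick sketch checking that (input bytes arbitrary):

```zig
test "both versions compute identical results" {
    const in: @Vector(8, u8) = .{ 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0 };
    const a: [16]u8 = expand8xu8To16xu4AsByteVector(in);
    const b: [16]u8 = expand8xu8To16xu4AsByteVectorBad(in);
    try std.testing.expectEqualSlices(u8, &a, &b);
}
```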

The optimization in my first version is helpful because x86 has no `vpsrlb` instruction, only `vpsrlw`, so the lower byte of each 2-byte pair has 4 bits shifted in from the upper byte, and we need a `vpand` to zero out the bits we don't want. But since we already need a `vpand` to isolate the lowest 4 bits of the other vector we are interleaving, we may as well interleave first and then isolate the lowest 4 bits of all the bytes simultaneously.
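
To make the `vpsrlw` hazard concrete, here is a small scalar sketch (the byte pair is arbitrary) of how a 16-bit shift leaks bits into the low byte:

```zig
test "a 16-bit shift leaks high-byte bits into the low byte" {
    // Two adjacent bytes 0xAB (low) and 0xCD (high) form the
    // little-endian word 0xCDAB; vpsrlw shifts the whole word.
    const word: u16 = 0xCDAB;
    const low_byte: u8 = @truncate(word >> 4); // 0xDA, not 0x0A
    // The top 4 bits of the low byte came from the neighboring byte,
    // so a mask is required to recover the desired nibble.
    try std.testing.expectEqual(@as(u8, 0xDA), low_byte);
    try std.testing.expectEqual(@as(u8, 0x0A), low_byte & 0xF);
}
```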

Next, I define this function:

```zig
fn foo(vec: @Vector(16, u8)) [2]@Vector(16, u8) {
    const vec2 = vec + vec;
    const vec3 = vec2 | @as(@Vector(16, u8), @splat(1));
    return @bitCast(std.simd.interlace(.{ vec2, vec3 }));
}
```

Zen 3 emit:

```asm
.LCPI1_0:
        .zero   16,1
foo:
        vpaddb  xmm0, xmm0, xmm0        ; equivalent to multiply by 2 or shift left by 1
        vpor    xmm1, xmm0, xmmword ptr [rip + .LCPI1_0]
        vpunpckhbw      xmm2, xmm0, xmm1
        vpunpcklbw      xmm0, xmm0, xmm1
```
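
For reference, `foo` maps each input element `v` to the adjacent output pair `(2*v, 2*v | 1)`. A minimal sketch with an arbitrary splat input:

```zig
test "foo pairs each doubled element with its odd successor" {
    const out = foo(@splat(3));
    // Every element v = 3 becomes the pair (2*v, 2*v | 1) = (6, 7).
    try std.testing.expectEqual(@as(u8, 6), out[0][0]);
    try std.testing.expectEqual(@as(u8, 7), out[0][1]);
}
```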

The problem comes when I compose the two aforementioned functions:

```zig
export fn baz(x: u64) [2]@Vector(16, u8) {
    return foo(expand8xu8To16xu4AsByteVector(@bitCast(x)));
}
```

```asm
.LCPI2_0:
        .zero   16,15
.LCPI2_1:
        .zero   16,30
.LCPI2_2:
        .zero   16,1
baz:
        vmovq   xmm0, rdi
        vpsrlw  xmm1, xmm0, 4
        vpand   xmm1, xmm1, xmmword ptr [rip + .LCPI2_0] ; Unnecessary! We wanted to avoid this instruction!
        vpunpcklbw      xmm0, xmm0, xmm1
        vpaddb  xmm0, xmm0, xmm0
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI2_1] ; LLVM tries to be cool here and use `0xF << 1` after `xmm0 + xmm0`
        vpor    xmm1, xmm0, xmmword ptr [rip + .LCPI2_2]
        vpunpckhbw      xmm2, xmm0, xmm1
        vpunpcklbw      xmm0, xmm0, xmm1
```

Expected emit:

```asm
.LCPI2_1:
        .zero   16,30
.LCPI2_2:
        .zero   16,1
baz:
        vmovq   xmm0, rdi
        vpsrlw  xmm1, xmm0, 4
        vpunpcklbw      xmm0, xmm0, xmm1
        vpaddb  xmm0, xmm0, xmm0
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI2_1]
        vpor    xmm1, xmm0, xmmword ptr [rip + .LCPI2_2]
        vpunpckhbw      xmm2, xmm0, xmm1
        vpunpcklbw      xmm0, xmm0, xmm1
```

Alternatively, a less cool but straightforward concatenation of the implementations of `expand8xu8To16xu4AsByteVector` and `foo` would also be acceptable, where we `vpand` with `0xF` before the `vpaddb` rather than `vpand` with `0xF << 1` after the `vpaddb`:

```asm
.LCPI2_1:
        .zero   16,15
.LCPI2_2:
        .zero   16,1
baz:
        vmovq   xmm0, rdi
        vpsrlw  xmm1, xmm0, 4
        vpunpcklbw      xmm0, xmm0, xmm1
        vpand   xmm0, xmm0, xmmword ptr [rip + .LCPI2_1]
        vpaddb  xmm0, xmm0, xmm0
        vpor    xmm1, xmm0, xmmword ptr [rip + .LCPI2_2]
        vpunpckhbw      xmm2, xmm0, xmm1
        vpunpcklbw      xmm0, xmm0, xmm1
```
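
Both orderings are correct because masking with `0xF` before the doubling keeps the same four bits as masking with `0x1E` after it. A quick exhaustive sketch over all bytes:

```zig
test "mask with 0xF before doubling == mask with 0x1E after" {
    for (0..256) |i| {
        const x: u8 = @intCast(i);
        // (x & 0xF) << 1 never overflows (max 30), and (x +% x) is the
        // wrapping double; both keep exactly bits 1..4 of 2*x.
        try std.testing.expectEqual((x & 0xF) << 1, (x +% x) & 0x1E);
    }
}
```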