| Issue |
182034
|
| Summary |
[x86] Missed optimizations for truncation+masked store sequences lowering to AVX-512F/VL instructions
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
okaneco
|
Signed-saturating, unsigned-saturating, and regular truncation followed by a masked store sometimes does not optimize to the expected `vpmov/vpmovs/vpmovus`.
The IR comes from trying to rewrite some Rust core library intrinsics with portable code. Of the 15 truncating patterns, `_mm512_mask_cvtepi32_storeu_epi16` and `_mm512_mask_cvtepi32_storeu_epi8` similarly failed to optimize, while the rest produced the expected results.
https://llvm.godbolt.org/z/81aY6x4hq
```llvm
define void @_mm512_mask_cvtepi32_storeu_epi16_intrinsic(ptr noundef %mem_addr, i16 noundef %k, ptr dead_on_return noalias noundef readonly align 64 captures(none) dereferenceable(64) %a) unnamed_addr #0 {
start:
%0 = load <16 x i32>, ptr %a, align 64
tail call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %mem_addr, <16 x i32> %0, i16 %k)
ret void
}
define void @_mm512_mask_cvtepi32_storeu_epi16_portable(ptr noundef writeonly captures(none) %mem_addr, i16 noundef %k, ptr dead_on_return noalias noundef readonly align 64 captures(none) dereferenceable(64) %a) unnamed_addr #0 {
start:
%0 = bitcast i16 %k to <16 x i1>
%1 = load <16 x i32>, ptr %a, align 64
%2 = trunc <16 x i32> %1 to <16 x i16>
tail call void @llvm.masked.store.v16i16.p0(<16 x i16> %2, ptr align 1 %mem_addr, <16 x i1> %0)
ret void
}
declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr, <16 x i32>, i16) unnamed_addr #1
declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr captures(none), <16 x i1>) #2
attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(argmem: readwrite) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512f,+f16c,+fma,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3,+avx,+avx2,+avx512f,+avx512vl,+f16c,+fma,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3,+avx,+avx2,+avx512f,+f16c,+fma,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }
```
<details><summary>Assembly</summary>
```asm
_mm512_mask_cvtepi32_storeu_epi16_intrinsic: # @_mm512_mask_cvtepi32_storeu_epi16_intrinsic
kmovw k1, esi
vmovdqa64 zmm0, zmmword ptr [rdx]
vpmovdw ymmword ptr [rdi] {k1}, zmm0
vzeroupper
ret
_mm512_mask_cvtepi32_storeu_epi16_portable: # @_mm512_mask_cvtepi32_storeu_epi16_portable
vmovdqa64 zmm0, zmmword ptr [rdx]
vpmovdw ymm0, zmm0
test sil, 1
jne .LBB1_1
test sil, 2
jne .LBB1_3
.LBB1_4: # %else2
test sil, 4
jne .LBB1_5
.LBB1_6: # %else4
test sil, 8
jne .LBB1_7
.LBB1_8: # %else6
test sil, 16
jne .LBB1_9
.LBB1_10: # %else8
test sil, 32
jne .LBB1_11
.LBB1_12: # %else10
test sil, 64
jne .LBB1_13
.LBB1_14: # %else12
test sil, sil
jns .LBB1_16
.LBB1_15: # %cond.store13
vpextrw word ptr [rdi + 14], xmm0, 7
.LBB1_16: # %else14
vextracti128 xmm0, ymm0, 1
test esi, 256
jne .LBB1_17
test esi, 512
jne .LBB1_19
.LBB1_20: # %else18
test esi, 1024
jne .LBB1_21
.LBB1_22: # %else20
test esi, 2048
jne .LBB1_23
.LBB1_24: # %else22
test esi, 4096
jne .LBB1_25
.LBB1_26: # %else24
test esi, 8192
jne .LBB1_27
.LBB1_28: # %else26
test esi, 16384
jne .LBB1_29
.LBB1_30: # %else28
test esi, 32768
jne .LBB1_31
.LBB1_32: # %else30
vzeroupper
ret
.LBB1_1: # %cond.store
vpextrw word ptr [rdi], xmm0, 0
test sil, 2
je .LBB1_4
.LBB1_3: # %cond.store1
vpextrw word ptr [rdi + 2], xmm0, 1
test sil, 4
je .LBB1_6
.LBB1_5: # %cond.store3
vpextrw word ptr [rdi + 4], xmm0, 2
test sil, 8
je .LBB1_8
.LBB1_7: # %cond.store5
vpextrw word ptr [rdi + 6], xmm0, 3
test sil, 16
je .LBB1_10
.LBB1_9: # %cond.store7
vpextrw word ptr [rdi + 8], xmm0, 4
test sil, 32
je .LBB1_12
.LBB1_11: # %cond.store9
vpextrw word ptr [rdi + 10], xmm0, 5
test sil, 64
je .LBB1_14
.LBB1_13: # %cond.store11
vpextrw word ptr [rdi + 12], xmm0, 6
test sil, sil
js .LBB1_15
jmp .LBB1_16
.LBB1_17: # %cond.store15
vpextrw word ptr [rdi + 16], xmm0, 0
test esi, 512
je .LBB1_20
.LBB1_19: # %cond.store17
vpextrw word ptr [rdi + 18], xmm0, 1
test esi, 1024
je .LBB1_22
.LBB1_21: # %cond.store19
vpextrw word ptr [rdi + 20], xmm0, 2
test esi, 2048
je .LBB1_24
.LBB1_23: # %cond.store21
vpextrw word ptr [rdi + 22], xmm0, 3
test esi, 4096
je .LBB1_26
.LBB1_25: # %cond.store23
vpextrw word ptr [rdi + 24], xmm0, 4
test esi, 8192
je .LBB1_28
.LBB1_27: # %cond.store25
vpextrw word ptr [rdi + 26], xmm0, 5
test esi, 16384
je .LBB1_30
.LBB1_29: # %cond.store27
vpextrw word ptr [rdi + 28], xmm0, 6
test esi, 32768
je .LBB1_32
.LBB1_31: # %cond.store29
vpextrw word ptr [rdi + 30], xmm0, 7
vzeroupper
ret
```
</details>
---
I've only investigated signed and unsigned saturation patterns for __m128i. Of the 10 patterns, 4 failed.
- `_mm_mask_cvtsepi32_storeu_epi8` (*vpmovsdb*) - vpackssdw+vpacksswb+branches https://llvm.godbolt.org/z/c5WzTKPhc
- `_mm_mask_cvtsepi32_storeu_epi16` (*vpmovsdw*) - similar to above, vpackssdw+branches
- `_mm_mask_cvtsepi64_storeu_epi32` (*vpmovsqd*) - vpmaxsq+vpminsq+vpmovqd https://llvm.godbolt.org/z/GvTfGh4PK
- `_mm_mask_cvtusepi64_storeu_epi32` (*vpmovusqd*) - vpminuq+vpmovqd https://llvm.godbolt.org/z/nhETvrv19
---
The intrinsic patterns we're looking to match. Each has an `_mm`, `_mm256`, and `_mm512` version.
Unsigned
- `_mask_cvtusepi64_storeu_epi8`
- `_mask_cvtusepi64_storeu_epi32`
- `_mask_cvtusepi64_storeu_epi16`
- `_mask_cvtusepi32_storeu_epi8`
- `_mask_cvtusepi32_storeu_epi16`
Signed
- `_mask_cvtsepi64_storeu_epi8`
- `_mask_cvtsepi64_storeu_epi32`
- `_mask_cvtsepi64_storeu_epi16`
- `_mask_cvtsepi32_storeu_epi8`
- `_mask_cvtsepi32_storeu_epi16`
Truncating
- `_mask_cvtepi64_storeu_epi8`
- `_mask_cvtepi64_storeu_epi32`
- `_mask_cvtepi64_storeu_epi16`
- `_mask_cvtepi32_storeu_epi8`
- `_mask_cvtepi32_storeu_epi16`
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs