| Issue |
182034
|
| Summary |
[x86] Missed optimizations for truncation+masked store sequences lowering to AVX-512F/VL instructions
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
okaneco
|
Signed-saturating, unsigned-saturating, and regular truncation followed by a masked store sometimes does not optimize to the expected `vpmov/vpmovs/vpmovus`.
The IR comes from trying to rewrite some Rust core library intrinsics with portable code. Of the 15 truncating patterns, `_mm512_mask_cvtepi32_storeu_epi16` and `_mm512_mask_cvtepi32_storeu_epi8` similarly failed to optimize, while the rest produced the expected results.
https://llvm.godbolt.org/z/81aY6x4hq
```llvm
define void @_mm512_mask_cvtepi32_storeu_epi16_intrinsic(ptr noundef %mem_addr, i16 noundef %k, ptr dead_on_return noalias noundef readonly align 64 captures(none) dereferenceable(64) %a) unnamed_addr #0 {
start:
%0 = load <16 x i32>, ptr %a, align 64
tail call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %mem_addr, <16 x i32> %0, i16 %k)
ret void
}
define void @_mm512_mask_cvtepi32_storeu_epi16_portable(ptr noundef writeonly captures(none) %mem_addr, i16 noundef %k, ptr dead_on_return noalias noundef readonly align 64 captures(none) dereferenceable(64) %a) unnamed_addr #0 {
start:
%0 = bitcast i16 %k to <16 x i1>
%1 = load <16 x i32>, ptr %a, align 64
%2 = trunc <16 x i32> %1 to <16 x i16>
tail call void @llvm.masked.store.v16i16.p0(<16 x i16> %2, ptr align 1 %mem_addr, <16 x i1> %0)
ret void
}
declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr, <16 x i32>, i16) unnamed_addr #1
declare void @llvm.masked.store.v16i16.p0(<16 x i16>, ptr captures(none), <16 x i1>) #2
attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(argmem: readwrite) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+avx512f,+f16c,+fma,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3,+avx,+avx2,+avx512f,+avx512vl,+f16c,+fma,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3,+avx,+avx2,+avx512f,+f16c,+fma,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+crc32,+ssse3" }
attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: write) }
```
<details><summary>Assembly</summary>
```asm
_mm512_mask_cvtepi32_storeu_epi16_intrinsic: # @_mm512_mask_cvtepi32_storeu_epi16_intrinsic
kmovw k1, esi
vmovdqa64 zmm0, zmmword ptr [rdx]
vpmovdw ymmword ptr [rdi] {k1}, zmm0
vzeroupper
ret
_mm512_mask_cvtepi32_storeu_epi16_portable: # @_mm512_mask_cvtepi32_storeu_epi16_portable
vmovdqa64 zmm0, zmmword ptr [rdx]
vpmovdw ymm0, zmm0
test sil, 1
jne .LBB1_1
test sil, 2
jne .LBB1_3
.LBB1_4: # %else2
test sil, 4
jne .LBB1_5
.LBB1_6: # %else4
test sil, 8
jne .LBB1_7
.LBB1_8: # %else6
test sil, 16
jne .LBB1_9
.LBB1_10: # %else8
test sil, 32
jne .LBB1_11
.LBB1_12: # %else10
test sil, 64
jne .LBB1_13
.LBB1_14: # %else12
test sil, sil
jns .LBB1_16
.LBB1_15: # %cond.store13
vpextrw word ptr [rdi + 14], xmm0, 7
.LBB1_16: # %else14
vextracti128 xmm0, ymm0, 1
test esi, 256
jne .LBB1_17
test esi, 512
jne .LBB1_19
.LBB1_20: # %else18
test esi, 1024
jne .LBB1_21
.LBB1_22: # %else20
test esi, 2048
jne .LBB1_23
.LBB1_24: # %else22
test esi, 4096
jne .LBB1_25
.LBB1_26: # %else24
test esi, 8192
jne .LBB1_27
.LBB1_28: # %else26
test esi, 16384
jne .LBB1_29
.LBB1_30: # %else28
test esi, 32768
jne .LBB1_31
.LBB1_32: # %else30
vzeroupper
ret
.LBB1_1: # %cond.store
vpextrw word ptr [rdi], xmm0, 0
test sil, 2
je .LBB1_4
.LBB1_3: # %cond.store1
vpextrw word ptr [rdi + 2], xmm0, 1
test sil, 4
je .LBB1_6
.LBB1_5: # %cond.store3
vpextrw word ptr [rdi + 4], xmm0, 2
test sil, 8
je .LBB1_8
.LBB1_7: # %cond.store5
vpextrw word ptr [rdi + 6], xmm0, 3
test sil, 16
je .LBB1_10
.LBB1_9: # %cond.store7
vpextrw word ptr [rdi + 8], xmm0, 4
test sil, 32
je .LBB1_12
.LBB1_11: # %cond.store9
vpextrw word ptr [rdi + 10], xmm0, 5
test sil, 64
je .LBB1_14
.LBB1_13: # %cond.store11
vpextrw word ptr [rdi + 12], xmm0, 6
test sil, sil
js .LBB1_15
jmp .LBB1_16
.LBB1_17: # %cond.store15
vpextrw word ptr [rdi + 16], xmm0, 0
test esi, 512
je .LBB1_20
.LBB1_19: # %cond.store17
vpextrw word ptr [rdi + 18], xmm0, 1
test esi, 1024
je .LBB1_22
.LBB1_21: # %cond.store19
vpextrw word ptr [rdi + 20], xmm0, 2
test esi, 2048
je .LBB1_24
.LBB1_23: # %cond.store21
vpextrw word ptr [rdi + 22], xmm0, 3
test esi, 4096
je .LBB1_26
.LBB1_25: # %cond.store23
vpextrw word ptr [rdi + 24], xmm0, 4
test esi, 8192
je .LBB1_28
.LBB1_27: # %cond.store25
vpextrw word ptr [rdi + 26], xmm0, 5
test esi, 16384
je .LBB1_30
.LBB1_29: # %cond.store27
vpextrw word ptr [rdi + 28], xmm0, 6
test esi, 32768
je .LBB1_32
.LBB1_31: # %cond.store29
vpextrw word ptr [rdi + 30], xmm0, 7
vzeroupper
ret
```
</details>
---
I've only investigated signed and unsigned saturation patterns for __m128i. Of the 10 patterns, 4 failed.
- `_mm_mask_cvtsepi32_storeu_epi8` (*vpmovsdb*) - vpackssdw+vpacksswb+branches https://llvm.godbolt.org/z/c5WzTKPhc
- `_mm_mask_cvtsepi32_storeu_epi16` (*vpmovsdw*) - similar to above, vpackssdw+branches
- `_mm_mask_cvtsepi64_storeu_epi32` (*vpmovsqd*) - vpmaxsq+vpminsq+vpmovqd https://llvm.godbolt.org/z/GvTfGh4PK
- `_mm_mask_cvtusepi64_storeu_epi32` (*vpmovusqd*) - vpminuq+vpmovqd https://llvm.godbolt.org/z/nhETvrv19
---
The intrinsic patterns we're looking to match. Each has an `_mm`, `_mm256`, and `_mm512` version.
Unsigned
- `_mask_cvtusepi64_storeu_epi8`
- `_mask_cvtusepi64_storeu_epi32`
- `_mask_cvtusepi64_storeu_epi16`
- `_mask_cvtusepi32_storeu_epi8`
- `_mask_cvtusepi32_storeu_epi16`
Signed
- `_mask_cvtsepi64_storeu_epi8`
- `_mask_cvtsepi64_storeu_epi32`
- `_mask_cvtsepi64_storeu_epi16`
- `_mask_cvtsepi32_storeu_epi8`
- `_mask_cvtsepi32_storeu_epi16`
Truncating
- `_mask_cvtepi64_storeu_epi8`
- `_mask_cvtepi64_storeu_epi32`
- `_mask_cvtepi64_storeu_epi16`
- `_mask_cvtepi32_storeu_epi8`
- `_mask_cvtepi32_storeu_epi16`
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs