Issue 53124
Summary Suboptimal codegen for x86 vector rotate
Labels new issue
Assignees
Reporter philipc
    https://godbolt.org/z/Y3PqrWexa

(Note: as seen in the godbolt link, I originally hit this issue in Rust, but it seems to apply to C++ as well.)

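This code rotates the elements within each 128-bit lane and then fixes up the lane boundaries, so that the whole vector is shifted up by one element and a new first element is inserted.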
```C++
#include <x86intrin.h>

// Shifts the eight floats of `x` up by one position and places the float
// read from `shift_in` in element 0:
//   result = { *shift_in, x0, x1, x2, x3, x4, x5, x6 }   (x7 is dropped)
__m256 element_shift(__m256 x, const float *shift_in) {
    // Rotate within each 128-bit lane: y = { x3, x0, x1, x2 | x7, x4, x5, x6 }.
    __m256 y = _mm256_permute_ps(x, 0x93);
    // low = { *shift_in, 0, 0, 0 | x3, x0, x1, x2 }: the scalar load fills the
    // low half, and the low half of `y` is copied into the high half.
    // Only elements 0 and 4 of `low` are used by the blend below.
    __m256 low = _mm256_insertf128_ps(
        _mm256_castps128_ps256(_mm_load_ss(shift_in)),
        _mm256_extractf128_ps(y, 0),
        1
    );
    // Mask 0x11 takes elements 0 and 4 from `low` and the rest from `y`,
    // replacing the two elements that wrapped around inside their lane.
    return _mm256_blend_ps(y, low, 0x11);
}
```

Clang (suboptimal):
```asm
        # x = input vector in ymm0, s = float loaded from [rdi].
        # The two 128-bit halves of the result are built separately, then joined.
        # Low half: x0..x2 moved up one slot (element 0 is overwritten by the final blend).
        vpermilps       xmm1, xmm0, 144         # xmm1 = xmm0[0,0,1,2]
        # High half: take x4..x7, blend in x3, then rotate to get {x3, x4, x5, x6}.
        vextractf128    xmm2, ymm0, 1
        vblendps        xmm0, xmm2, xmm0, 8             # xmm0 = xmm2[0,1,2],xmm0[3]
        vpermilps       xmm0, xmm0, 147         # xmm0 = xmm0[3,0,1,2]
        # Join the halves: ymm0 = {x0, x0, x1, x2, x3, x4, x5, x6}.
        vinsertf128     ymm0, ymm1, xmm0, 1
        # Put s into element 0: ymm0 = {s, x0, x1, x2, x3, x4, x5, x6}.
        vmovss  xmm1, dword ptr [rdi]           # xmm1 = mem[0],zero,zero,zero
        vblendps        ymm0, ymm0, ymm1, 1             # ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]

```

GCC/ICC (expected result):
```asm
        # x = input vector in ymm0, s = float loaded from [rdi].
        # Rotate within each 128-bit lane: ymm0 = {x3, x0, x1, x2, x7, x4, x5, x6}.
        vpermilps       ymm0, ymm0, 147
        # xmm1 = {s, 0, 0, 0}
        vmovss  xmm1, DWORD PTR [rdi]
        # Copy the rotated low lane into the high half: ymm1 = {s, 0, 0, 0, x3, x0, x1, x2}.
        vinsertf128     ymm1, ymm1, xmm0, 0x1
        # Mask 17 (0x11) takes elements 0 and 4 from ymm1:
        # ymm0 = {s, x0, x1, x2, x3, x4, x5, x6}.
        vblendps        ymm0, ymm0, ymm1, 17
```

This IR gives the expected result when passed to `llc`:
```
; ModuleID = 'shift32x8.4f1ccf74-cgu.0'
source_filename = "shift32x8.4f1ccf74-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; Returns <*shift_in, x[0], x[1], x[2], x[3], x[4], x[5], x[6]>.
; Each group of instructions below is annotated with the x86 instruction it
; is expected to lower to.
; Function Attrs: mustprogress nofree nosync nounwind nonlazybind uwtable willreturn
define <8 x float> @element_shift(<8 x float> %x, float* %shift_in) unnamed_addr #0 {
start:
  ; vpermilps	ymm0, ymm0, 147         # ymm0 = ymm0[3,0,1,2,7,4,5,6]
  ; %0 = %x rotated by one element within each 128-bit half.
  %0 = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>

  ; vmovss	xmm1, dword ptr [rdi]           # xmm1 = mem[0],zero,zero,zero
  ; %2 holds the loaded scalar in element 0; elements 1-3 are poison in the IR.
  %1 = load float, float* %shift_in, align 4
  %2 = insertelement <4 x float> poison, float %1, i32 0

  ; vinsertf128	ymm1, ymm1, xmm0, 1
  ; %3 widens %2 to 8 elements (upper half undef).
  ; %4 extracts the low half of %0 and %5 moves it into the upper-half position.
  ; %6 takes elements 0-3 from %3 and elements 4-7 from %5.
  %3 = shufflevector <4 x float> %2, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = shufflevector <8 x float> %0, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %5 = shufflevector <4 x float> %4, <4 x float> poison, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
  %6 = shufflevector <8 x float> %3, <8 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>

  ; vblendps	ymm0, ymm0, ymm1, 17            # ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
  ; Elements 0 and 4 come from %6; mask indices 9-11 and 13-15 select elements 1-3 and 5-7 of %0.
  %7 = shufflevector <8 x float> %6, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 15>

  ret <8 x float> %7
}

attributes #0 = { mustprogress nofree nosync nounwind nonlazybind uwtable willreturn "target-cpu"="x86-64" "target-features"="+avx" }
```

However, running the IR through `opt -O1` before passing it to `llc` produces a different, but still suboptimal, instruction sequence:
```asm
	# x = input vector in ymm0, s = float loaded from [rdi].
	vpermilps	xmm1, xmm0, 255         # xmm1 = xmm0[3,3,3,3]
	# ymm1 = {x0, x1, x2, x3, x3, x3, x3, x3}
	vinsertf128	ymm1, ymm0, xmm1, 1
	vmovss	xmm2, dword ptr [rdi]           # xmm2 = mem[0],zero,zero,zero
	# ymm1 = {s, x1, x2, x3, x3, x3, x3, x3}
	vblendps	ymm1, ymm1, ymm2, 1             # ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7]
	# The indices in the next comment are 64-bit elements; as floats:
	# ymm1 = {s, x1, x0, x1, x3, x3, x4, x5}
	vunpcklpd	ymm1, ymm1, ymm0        # ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
	# Final result: ymm0 = {s, x0, x1, x2, x3, x4, x5, x6}
	vshufps	ymm0, ymm1, ymm0, 152           # ymm0 = ymm1[0,2],ymm0[1,2],ymm1[4,6],ymm0[5,6]
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to