| Issue |
53124
|
| Summary |
Suboptimal codegen for x86 vector rotate
|
| Labels |
new issue
|
| Assignees |
|
| Reporter |
philipc
|
https://godbolt.org/z/Y3PqrWexa
(Note: as the godbolt link shows, I originally hit this issue in Rust, but it appears to apply to C++ as well.)
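This code rotates the elements within each 128-bit lane by one position, then inserts a new first element and carries the last element of the low lane into the first slot of the high lane. The net effect is that the whole vector is shifted up by one element.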
```C++
#include <x86intrin.h>
// Shifts the eight floats of x up by one element and places *shift_in in
// element 0: result = { *shift_in, x[0], x[1], x[2], x[3], x[4], x[5], x[6] }.
// x[7] is dropped.
__m256 element_shift(__m256 x, const float *shift_in) {
// Rotate within each 128-bit lane: y = x[3,0,1,2,7,4,5,6].
__m256 y = _mm256_permute_ps(x, 0x93);
// low = { *shift_in, 0, 0, 0, y[0], y[1], y[2], y[3] }: the loaded scalar in
// the low lane and the rotated low lane of y moved into the high lane, so
// that low[4] == x[3].
__m256 low = _mm256_insertf128_ps(
_mm256_castps128_ps256(_mm_load_ss(shift_in)),
_mm256_extractf128_ps(y, 0),
1
);
// Mask 0x11 takes elements 0 and 4 from low and every other element from y.
return _mm256_blend_ps(y, low, 0x11);
}
```
Clang (suboptimal):
```asm
vpermilps xmm1, xmm0, 144 # xmm1 = xmm0[0,0,1,2]
vextractf128 xmm2, ymm0, 1 # xmm2 = ymm0[4,5,6,7]
vblendps xmm0, xmm2, xmm0, 8 # xmm0 = xmm2[0,1,2],xmm0[3]
vpermilps xmm0, xmm0, 147 # xmm0 = xmm0[3,0,1,2]
vinsertf128 ymm0, ymm1, xmm0, 1 # ymm0 = ymm1[0,1,2,3],xmm0[0,1,2,3]
vmovss xmm1, dword ptr [rdi] # xmm1 = mem[0],zero,zero,zero
vblendps ymm0, ymm0, ymm1, 1 # ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
```
GCC/ICC (expected result):
```asm
vpermilps ymm0, ymm0, 147 # ymm0 = ymm0[3,0,1,2,7,4,5,6]
vmovss xmm1, DWORD PTR [rdi] # xmm1 = mem[0],zero,zero,zero
vinsertf128 ymm1, ymm1, xmm0, 0x1 # ymm1 = ymm1[0,1,2,3],xmm0[0,1,2,3]
vblendps ymm0, ymm0, ymm1, 17 # ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
```
This IR gives the expected result when passed to `llc`:
```
; ModuleID = 'shift32x8.4f1ccf74-cgu.0'
source_filename = "shift32x8.4f1ccf74-cgu.0"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: mustprogress nofree nosync nounwind nonlazybind uwtable willreturn
; Returns <*shift_in, x[0], x[1], x[2], x[3], x[4], x[5], x[6]>.
; The "; v..." comments below give the x86 instruction each group of IR
; operations is expected to become.
define <8 x float> @element_shift(<8 x float> %x, float* %shift_in) unnamed_addr #0 {
start:
; vpermilps ymm0, ymm0, 147 # ymm0 = ymm0[3,0,1,2,7,4,5,6]
%0 = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 3, i32 0, i32 1, i32 2, i32 7, i32 4, i32 5, i32 6>
; vmovss xmm1, dword ptr [rdi] # xmm1 = mem[0],zero,zero,zero
%1 = load float, float* %shift_in, align 4
%2 = insertelement <4 x float> poison, float %1, i32 0
; vinsertf128 ymm1, ymm1, xmm0, 1
; %3: widen the 4-element vector holding the loaded scalar to 8 elements (upper half undef).
%3 = shufflevector <4 x float> %2, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
; %4: extract the low four elements of the rotated vector %0.
%4 = shufflevector <8 x float> %0, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; %5: widen %4 to 8 elements, placing it in the upper half (lower half undef).
%5 = shufflevector <4 x float> %4, <4 x float> poison, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 2, i32 3>
; %6: lower half taken from %3, upper half taken from %5.
%6 = shufflevector <8 x float> %3, <8 x float> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
; vblendps ymm0, ymm0, ymm1, 17 # ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7]
%7 = shufflevector <8 x float> %6, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 15>
ret <8 x float> %7
}
attributes #0 = { mustprogress nofree nosync nounwind nonlazybind uwtable willreturn "target-cpu"="x86-64" "target-features"="+avx" }
```
However, optimizing the IR with `opt -O1` first results in a different, but still suboptimal, instruction sequence:
```asm
vpermilps xmm1, xmm0, 255 # xmm1 = xmm0[3,3,3,3]
vinsertf128 ymm1, ymm0, xmm1, 1 # ymm1 = ymm0[0,1,2,3],xmm1[0,1,2,3]
vmovss xmm2, dword ptr [rdi] # xmm2 = mem[0],zero,zero,zero
vblendps ymm1, ymm1, ymm2, 1 # ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7]
vunpcklpd ymm1, ymm1, ymm0 # ymm1 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
vshufps ymm0, ymm1, ymm0, 152 # ymm0 = ymm1[0,2],ymm0[1,2],ymm1[4,6],ymm0[5,6]
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs