| Issue |
170867
|
| Summary |
[VectorCombine] foldShuffleOfIntrinsics - failure to recognise that repeated operands reduce costs
|
| Labels |
missed-optimization,
llvm::vectorcombine
|
| Assignees |
|
| Reporter |
RKSimon
|
```ll
; Reproducer: two independent 4-wide fma chains whose results are concatenated
; into one 8-wide vector. Every fma passes the same value as its first two
; operands (the repeated operand this report is about).
define <8 x float> @src(<4 x float> %x0, <4 x float> %x1, <4 x float> %y0, <4 x float> %y1) {
; Low half: %l1 = fma(%x1, %x1, fma(%x0, %x0, zeroinitializer)).
%l0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x0, <4 x float> %x0, <4 x float> zeroinitializer)
%l1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %x1, <4 x float> %x1, <4 x float> %l0)
; High half: the same chain applied to %y0 and %y1.
%h0 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y0, <4 x float> %y0, <4 x float> zeroinitializer)
%h1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %y1, <4 x float> %y1, <4 x float> %h0)
; Mask <0..7> takes all four lanes of %l1 followed by all four lanes of %h1.
%res = shufflevector <4 x float> %l1, <4 x float> %h1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %res
}
```
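We fail to fold `@src` to `@tgt` because the cost analysis assumes the shuffles of each fma operand are independent; when an operand is repeated (e.g. `%x0` is used twice by the same fma), only one shuffle is actually needed, so the new cost is overestimated: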
```ll
; Expected result of the fold: the inputs are concatenated first and the fma
; chain is then performed once at 8 lanes wide.
define <8 x float> @tgt(<4 x float> %x0, <4 x float> %x1, <4 x float> %y0, <4 x float> %y1) {
; Only two shuffles are needed: each concatenated vector is built once and
; then used as both the first and second operand of its fma.
%xy0 = shufflevector <4 x float> %x0, <4 x float> %y0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%xy1 = shufflevector <4 x float> %x1, <4 x float> %y1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; %r0 = fma(%xy0, %xy0, zeroinitializer); %r1 = fma(%xy1, %xy1, %r0).
%r0 = call <8 x float> @llvm.fma.v8f32(<8 x float> %xy0, <8 x float> %xy0, <8 x float> zeroinitializer)
%r1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %xy1, <8 x float> %xy1, <8 x float> %r0)
ret <8 x float> %r1
}
```
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs