| Issue |
56684
|
| Summary |
[X86] Failure to reassociate PMULUDQ mul-by-constant pairs of nodes
|
| Labels |
backend:X86,
missed-optimization
|
| Assignees |
|
| Reporter |
RKSimon
|
Noticed while working on https://reviews.llvm.org/D129765
```
define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) nounwind {
%urem = urem <4 x i32> %X, <i32 14, i32 4294967295, i32 16, i32 1>
%cmp = icmp eq <4 x i32> %urem, <i32 0, i32 0, i32 0, i32 0>
%ret = zext <4 x i1> %cmp to <4 x i32>
ret <4 x i32> %ret
}
```
Output of `llc -mtriple=x86_64--`:
```
.LCPI0_0:
.long 4294967295 # 0xffffffff
.long 0 # 0x0
.long 0 # 0x0
.long 0 # 0x0
.LCPI0_1:
.long 1 # 0x1
.long 1 # 0x1
.long 1 # 0x1
.long 1 # 0x1
.LCPI0_2:
.long 3067833783 # 0xb6db6db7
.long 4294967295 # 0xffffffff
.long 1 # 0x1
.long 0 # 0x0
.LCPI0_3:
.long 2147483648 # 0x80000000
.zero 4
.long 268435456 # 0x10000000
.zero 4
.LCPI0_4:
.long 2147483648 # 0x80000000
.long 2147483648 # 0x80000000
.long 2147483648 # 0x80000000
.long 2147483648 # 0x80000000
.LCPI0_5:
.long 2454267026 # 0x92492492
.long 2147483649 # 0x80000001
.long 2415919103 # 0x8fffffff
.long 2147483647 # 0x7fffffff
test_urem_even_allones_and_poweroftwo_and_one: # @test_urem_even_allones_and_poweroftwo_and_one
pshufd $245, %xmm0, %xmm1 # xmm1 = xmm0[1,1,3,3]
pmuludq .LCPI0_0(%rip), %xmm1
movdqa .LCPI0_1(%rip), %xmm2 # xmm2 = [1,1,1,1]
pmuludq %xmm2, %xmm1
pshufd $237, %xmm1, %xmm3 # xmm3 = xmm1[1,3,2,3]
pmuludq .LCPI0_2(%rip), %xmm0
pmuludq .LCPI0_3(%rip), %xmm0
pshufd $237, %xmm0, %xmm4 # xmm4 = xmm0[1,3,2,3]
punpckldq %xmm3, %xmm4 # xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
pshufd $232, %xmm1, %xmm1 # xmm1 = xmm1[0,2,2,3]
pshufd $232, %xmm0, %xmm0 # xmm0 = xmm0[0,2,2,3]
punpckldq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
por %xmm4, %xmm0
pxor .LCPI0_4(%rip), %xmm0
pcmpgtd .LCPI0_5(%rip), %xmm0
pandn %xmm2, %xmm0
retq
```
It should be possible to merge the pmuludq mul-by-constant pairs. Naturally we have to be careful due to the implicit zero-extension of the instruction, but in many of these cases at least one of the pairs of elements is a multiply-by-one. PMULDQ possibly has similar cases, but I haven't found any (the PMULUDQ cases appear due to an expansion of a mixture of v4i32 multiplies and rotates).
https://gcc.godbolt.org/z/nfq8K7sTT
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs