| Issue |
165752
|
| Summary |
[X86][AVX512] Failure to rematerialize smaller predicate masks
|
| Labels |
good first issue,
backend:X86,
missed-optimization
|
| Assignees |
|
| Reporter |
RKSimon
|
```ll
define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
%broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float>undef)
ret <16 x float> %res
}
define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
%broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer
%sext_ind = sext <16 x i32> %ind to <16 x i64>
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float>undef)
ret <16 x float> %res
}
```
llc -mcpu=x86-64-v4
```asm
gather_all: # @gather_all
kxnorw %k0, %k0, %k1
vxorps %xmm1, %xmm1, %xmm1
vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
vmovaps %zmm1, %zmm0
retq
gather_lower: # @gather_lower
vxorps %xmm1, %xmm1, %xmm1
movw $255, %ax
kmovd %eax, %k1
vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
vmovaps %zmm1, %zmm0
retq
```
The gather_lower code needs to initialize the lower 8 bits of a <16 x i1> k-reg mask (with all upper bits zero) - we should be able to use `kxnorb %k0, %k0, %k1` on AVX512DQ targets to handle this instead of the MOV+KMOVD.
There will be similar cases where we want just the lower 8/16/32 bits set of a <32 x i1> or <64 x i1> mask on AVX512DQ/BW targets that can be handled with kxnorb/kxnorw/kxnord.
Hopefully this can be handled as tablegen patterns extending the existing 'KSET1' patterns.
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs