john-brawn-arm wrote:
This change has caused problems for Arm MVE code generation, specifically the
arm-mve-gather-scatter-lowering pass. Simple example:
```
void test(int n, signed int *p, signed short *q) {
for (int i=0; i<n; i++)
{
*p += q[i*n];
}
}
```
Compiled with ``clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard
-O3``, the loop was previously compiled as:
```
.LBB0_2: @ =>This Inner Loop Header: Depth=1
vldrh.s32 q2, [r2, q0, uxtw #1]
vaddva.u32 r12, q2
vadd.i32 q0, q0, q1
letp lr, .LBB0_2
```
Now we have:
```
.LBB0_2: @ =>This Inner Loop Header: Depth=1
vctp.32 r0
vmrs r4, p0
and r3, r4, #1
subs r0, #4
ubfx r6, r4, #4, #1
rsbs r5, r3, #0
movs r3, #0
bfi r3, r5, #0, #1
rsbs r5, r6, #0
ubfx r6, r4, #8, #1
ubfx r4, r4, #12, #1
bfi r3, r5, #1, #1
rsbs r5, r6, #0
bfi r3, r5, #2, #1
rsbs r4, r4, #0
vshl.i32 q2, q0, #1
bfi r3, r4, #3, #1
vadd.i32 q3, q2, r2
@ implicit-def: $q2
lsls r4, r3, #31
ittt ne
vmovne r4, s12
ldrhne r4, [r4]
vmovne.32 q2[0], r4
lsls r4, r3, #30
ittt mi
vmovmi r4, s13
ldrhmi r4, [r4]
vmovmi.32 q2[1], r4
lsls r4, r3, #29
ittt mi
vmovmi r4, s14
ldrhmi r4, [r4]
vmovmi.32 q2[2], r4
lsls r3, r3, #28
ittt mi
vmovmi r3, s15
ldrhmi r3, [r3]
vmovmi.32 q2[3], r3
vmovlb.s16 q2, q2
vpst
vaddvat.u32 r12, q2
vadd.i32 q0, q0, q1
le lr, .LBB0_2
```
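IR example showing what's going on (the two functions differ only in the GEP
source element type: `i16` in `test_ok` versus `[2 x i8]` in `test_bad`):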
```
target triple = "thumbv8.1m.main-unknown-none-eabi"
define <4 x i16> @test_ok(ptr %src, <4 x i32> %offs, i32 %x, i32 %y) #0 {
entry:
%mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %x, i32 %y)
%gep = getelementptr inbounds nuw i16, ptr %src, <4 x i32> %offs
%val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2
%gep, <4 x i1> %mask, <4 x i16> poison)
ret <4 x i16> %val
}
define <4 x i16> @test_bad(ptr %src, <4 x i32> %offs, i32 %x, i32 %y) #0 {
entry:
%mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %x, i32 %y)
%gep = getelementptr inbounds nuw [2 x i8], ptr %src, <4 x i32> %offs
%val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2
%gep, <4 x i1> %mask, <4 x i16> poison)
ret <4 x i16> %val
}
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, <4 x i1>, <4 x i16>)
attributes #0 = { "target-cpu"="cortex-m55" }
```
The ``-debug`` output of the arm-mve-gather-scatter-lowering pass for the first
function (`test_ok`) is:
```
masked gathers/scatters: trying to optimize: <4 x i32> %offs
masked gathers: checking transform preconditions
%val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2
%gep, <4 x i1> %mask, <4 x i16> poison)
masked gathers: Small input type, truncing to: <4 x i32>
masked gathers/scatters: getelementpointer found. Looking at intrinsic for base
+ vector of offsets
masked gathers/scatters: found correct offsets
masked gathers: successfully built masked gather
```
For the second function (`test_bad`) it is:
```
masked gathers/scatters: trying to optimize: <4 x i32> %offs
masked gathers: checking transform preconditions
%val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2
%gep, <4 x i1> %mask, <4 x i16> poison)
masked gathers: Small input type, truncing to: <4 x i32>
masked gathers/scatters: getelementpointer found. Looking at intrinsic for base
+ vector of offsets
masked gathers/scatters: found correct offsets
masked gathers/scatters: incorrect scale. Can't create intrinsic
masked gathers: loading from vector of pointers
```
https://github.com/llvm/llvm-project/pull/180745
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits