john-brawn-arm wrote:

This change has caused problems for Arm MVE code generation, specifically in the
arm-mve-gather-scatter-lowering pass. A simple example:
```
void test(int n, signed int *p, signed short *q) {
  for (int i=0; i<n; i++)
  {
    *p += q[i*n];
  }
}
```
When compiled with ``clang --target=arm-none-eabi -mcpu=cortex-m55 -mfloat-abi=hard -O3``,
the loop was previously compiled as:
```
.LBB0_2:                                @ =>This Inner Loop Header: Depth=1
        vldrh.s32       q2, [r2, q0, uxtw #1]
        vaddva.u32      r12, q2
        vadd.i32        q0, q0, q1
        letp    lr, .LBB0_2
```
Now we have:
```
.LBB0_2:                                @ =>This Inner Loop Header: Depth=1
        vctp.32 r0
        vmrs    r4, p0
        and     r3, r4, #1
        subs    r0, #4
        ubfx    r6, r4, #4, #1
        rsbs    r5, r3, #0
        movs    r3, #0
        bfi     r3, r5, #0, #1
        rsbs    r5, r6, #0
        ubfx    r6, r4, #8, #1
        ubfx    r4, r4, #12, #1
        bfi     r3, r5, #1, #1
        rsbs    r5, r6, #0
        bfi     r3, r5, #2, #1
        rsbs    r4, r4, #0
        vshl.i32        q2, q0, #1
        bfi     r3, r4, #3, #1
        vadd.i32        q3, q2, r2
                                        @ implicit-def: $q2
        lsls    r4, r3, #31
        ittt    ne
        vmovne  r4, s12
        ldrhne  r4, [r4]
        vmovne.32       q2[0], r4
        lsls    r4, r3, #30
        ittt    mi
        vmovmi  r4, s13
        ldrhmi  r4, [r4]
        vmovmi.32       q2[1], r4
        lsls    r4, r3, #29
        ittt    mi
        vmovmi  r4, s14
        ldrhmi  r4, [r4]
        vmovmi.32       q2[2], r4
        lsls    r3, r3, #28
        ittt    mi
        vmovmi  r3, s15
        ldrhmi  r3, [r3]
        vmovmi.32       q2[3], r3
        vmovlb.s16      q2, q2
        vpst
        vaddvat.u32     r12, q2
        vadd.i32        q0, q0, q1
        le      lr, .LBB0_2
```

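IR example showing what's going on. The two functions differ only in the source element type of the GEP (`i16` in `test_ok`, `[2 x i8]` in `test_bad`):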
```
target triple = "thumbv8.1m.main-unknown-none-eabi"

define <4 x i16> @test_ok(ptr %src, <4 x i32> %offs, i32 %x, i32 %y) #0 {
entry:
  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %x, i32 %y)
  %gep = getelementptr inbounds nuw i16, ptr %src, <4 x i32> %offs
  %val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 
%gep, <4 x i1> %mask, <4 x i16> poison)
  ret <4 x i16> %val
}

define <4 x i16> @test_bad(ptr %src, <4 x i32> %offs, i32 %x, i32 %y) #0 {
entry:
  %mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %x, i32 %y)
  %gep = getelementptr inbounds nuw [2 x i8], ptr %src, <4 x i32> %offs
  %val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 
%gep, <4 x i1> %mask, <4 x i16> poison)
  ret <4 x i16> %val
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, <4 x i1>, <4 x i16>)

attributes #0 = { "target-cpu"="cortex-m55" }
```
The ``-debug`` output of the arm-mve-gather-scatter-lowering pass for the first function (`test_ok`) is:
```
masked gathers/scatters: trying to optimize: <4 x i32> %offs
masked gathers: checking transform preconditions
  %val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 
%gep, <4 x i1> %mask, <4 x i16> poison)
masked gathers: Small input type, truncing to: <4 x i32>
masked gathers/scatters: getelementpointer found. Looking at intrinsic for base 
+ vector of offsets
masked gathers/scatters: found correct offsets
masked gathers: successfully built masked gather
```
For the second function (`test_bad`) it is:
```
masked gathers/scatters: trying to optimize: <4 x i32> %offs
masked gathers: checking transform preconditions
  %val = tail call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> align 2 
%gep, <4 x i1> %mask, <4 x i16> poison)
masked gathers: Small input type, truncing to: <4 x i32>
masked gathers/scatters: getelementpointer found. Looking at intrinsic for base 
+ vector of offsets
masked gathers/scatters: found correct offsets
masked gathers/scatters: incorrect scale. Can't create intrinsic
masked gathers: loading from vector of pointers
```


https://github.com/llvm/llvm-project/pull/180745
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to