Issue 173887
Summary [SystemZ] v3i64 argument passing
Labels backend:SystemZ
Assignees
Reporter JonPsson1
    I found that this function:

```
typedef long vec_long_24 __attribute__((__vector_size__(24)));
vec_long_24 global_long_24;

extern void passCallee_long_24(vec_long_24 x);

void loadAndPass_long_24(void) {
 passCallee_long_24(global_long_24);
}
```
gets emitted by clang with:

```
define dso_local void @loadAndPass_long_24() #0 {
entry:
 %byval-temp = alloca <3 x i64>, align 8
  %loadVecN = load <4 x i64>, ptr @global_long_24, align 8
  %extractVec = shufflevector <4 x i64> %loadVecN, <4 x i64> poison, <3 x i32> <i32 0, i32 1, i32 2>
  call void @llvm.lifetime.start.p0(ptr %byval-temp) #3
  %extractVec1 = shufflevector <3 x i64> %extractVec, <3 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x i64> %extractVec1, ptr %byval-temp, align 8, !tbaa !8
 call void @passCallee_long_24(ptr dead_on_return noundef %byval-temp)
  call void @llvm.lifetime.end.p0(ptr %byval-temp) #3
  ret void
}
```
The alloca used for the outgoing argument is 24 bytes, like the @global_long_24 variable. However, the value is loaded from the global and stored into the alloca as 32 bytes (<4 x i64>), which doesn't seem right.

If I change the function to use a vector size of 32, the codegen is more or less identical:

```
loadAndPass_long_24:                    #   | loadAndPass_long_32:                    # 
# %bb.0: #       # %bb.0:                                # 
        stmg %r11, %r15, 88(%r15)                    stmg    %r11, %r15, 88(%r15)
 aghi    %r15, -160                              aghi    %r15, -160
 lgr     %r11, %r15                              lgr     %r11, %r15
 aghik   %r1, %r15, -56                          aghik   %r1, %r15, -56
 la      %r2, 184(%r1)                           la      %r2, 184(%r1)
 nill    %r2, 65504                              nill    %r2, 65504
 lgr     %r15, %r1                               lgr     %r15, %r1
 larl    %r1, global_long_24         |           larl    %r1, global_long_32
 vl      %v0, 0(%r1), 4                          vl      %v0, 0(%r1), 4
        vl      %v1, 16(%r1), 4                         vl      %v1, 16(%r1), 4
        vst     %v1, 16(%r2), 4                         vst %v1, 16(%r2), 4
        vst     %v0, 0(%r2), 4                          vst %v0, 0(%r2), 4
        brasl   %r14, passCallee_long_24@P  | brasl   %r14, passCallee_long_32@P
        lmg     %r11, %r15, 248(%r11) lmg     %r11, %r15, 248(%r11)
        br      %r14 br      %r14
.Lfunc_end0: .Lfunc_end0:
        .size   loadAndPass_long_24, .Lfun  | .size   loadAndPass_long_32, .Lfun
                                        # # 
        .type global_long_24,@object      |           .type   global_long_32,@object    
 .section        .bss,"aw",@nobits               .section .bss,"aw",@nobits
        .globl  global_long_24              | .globl  global_long_32
        .p2align        5, 0x0 .p2align        5, 0x0
global_long_24:                             | global_long_32:
        .space  32 .space  32
        .size   global_long_24, 32          |           .size global_long_32, 32
```

It seems that there is somehow an assumption that v3i64 vectors should be implicitly extended to v4i64. The same seems to be true of v3f64. The alloca keeps its original size all the way to ISel, and the variable is declared in the module as a <3 x i64>. It doesn't seem to work exactly the same way with float and the same vector sizes: in that case the IR stores and loads are actually still just 24 bytes, but the global has a 32-byte size.
Is this all in order, or is there an issue here? Maybe the alloca should be sized up as well, to match the backend behavior? I guess the initial IR looks broken: it stores 32 bytes to a 24-byte alloca.

@uweigand 
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to