Issue 167412
Summary [clang] Different code generated for function vs inlined version of same function, including `memcpy` into inlined parameter
Labels clang
Assignees
Reporter davidstone
    Given the following code:

```c++
constexpr auto size = 45;

struct array {
	int m[size];
};

auto equal_impl(
	array const lhs,
	array const rhs
) -> bool {
	for (int n = 0; n != size; ++n) {
		if (lhs.m[n] != rhs.m[n]) {
			return false;
		}
	}
	return true;
}

auto equal(
	array const lhs,
	array const rhs
) -> bool {
	return equal_impl(lhs, rhs);
}
```

then everything seems fine when compiled with `-O3`:

```asm
equal_impl(array, array):
        push    rax
 lea     rdi, [rsp + 16]
        lea     rsi, [rsp + 200]
        mov edx, 180
        call    memcmp@PLT
        test    eax, eax
        sete al
        pop     rcx
        ret

equal(array, array):
        jmp equal_impl(array, array)
```

But when 5 <= size <= 44, things are not fine. For instance, size == 5 generates:

```asm
equal_impl(array, array):
        movdqu  xmm0, xmmword ptr [rsp + 32]
        movd    xmm1, dword ptr [rsp + 24]
        movd    xmm2, dword ptr [rsp + 48]
 pcmpeqb xmm2, xmm1
        pcmpeqb xmm0, xmmword ptr [rsp + 8]
        pand xmm0, xmm2
        pmovmskb        eax, xmm0
        cmp     eax, 65535
        sete    al
        ret

equal(array, array):
 movdqu  xmm0, xmmword ptr [rsp + 32]
        pcmpeqd xmm0, xmmword ptr [rsp + 8]
        movmskps        eax, xmm0
        xor     eax, 15
 sete    cl
        mov     eax, dword ptr [rsp + 48]
        cmp     dword ptr [rsp + 24], eax
        sete    al
        and     al, cl
 ret
```

It inlines `equal_impl` into `equal`, but then generates different code for the inlined copy than for the standalone function, which is unexpected.

size == 8:

```asm
equal_impl(array, array):
        movdqa xmm0, xmmword ptr [rsp + 24]
        pcmpeqb xmm0, xmmword ptr [rsp + 56]
 movdqa  xmm1, xmmword ptr [rsp + 8]
        pcmpeqb xmm1, xmmword ptr [rsp + 40]
        pand    xmm1, xmm0
        pmovmskb        eax, xmm1
 cmp     eax, 65535
        sete    al
        ret

equal(array, array):
        movdqa  xmm0, xmmword ptr [rsp + 40]
        pcmpeqd xmm0, xmmword ptr [rsp + 8]
        movmskps        ecx, xmm0
        xor eax, eax
        xor     ecx, 15
        jne     .LBB1_5
        movq xmm0, qword ptr [rsp + 24]
        movq    xmm1, qword ptr [rsp + 56]
 pcmpeqd xmm1, xmm0
        pshufd  xmm0, xmm1, 80
        movmskpd ecx, xmm0
        test    cl, 1
        je      .LBB1_5
        shr cl
        je      .LBB1_5
        mov     ecx, dword ptr [rsp + 64]
 cmp     dword ptr [rsp + 32], ecx
        jne     .LBB1_5
        lea rax, [rsp + 40]
        lea     rcx, [rsp + 8]
        mov     ecx, dword ptr [rcx + 28]
        cmp     ecx, dword ptr [rax + 28]
        sete al
.LBB1_5:
        ret
```

size == 44:

```asm
equal_impl(array, array):
        push    rax
        lea     rdi, [rsp + 16]
        lea rsi, [rsp + 192]
        mov     edx, 176
        call    memcmp@PLT
 test    eax, eax
        sete    al
        pop     rcx
 ret

equal(array, array):
        movdqa  xmm2, xmmword ptr [rsp + 296]
 movdqa  xmm3, xmmword ptr [rsp + 280]
        movdqa  xmm4, xmmword ptr [rsp + 264]
        movdqa  xmm0, xmmword ptr [rsp + 248]
 movdqa  xmm1, xmmword ptr [rsp + 184]
        movdqa  xmm5, xmmword ptr [rsp + 200]
        movdqa  xmm6, xmmword ptr [rsp + 216]
        movdqa  xmm7, xmmword ptr [rsp + 232]
        pcmpeqd xmm2, xmmword ptr [rsp + 120]
 pcmpeqd xmm3, xmmword ptr [rsp + 104]
        pcmpeqd xmm4, xmmword ptr [rsp + 88]
        packssdw        xmm3, xmm2
        pcmpeqd xmm0, xmmword ptr [rsp + 72]
        packssdw        xmm0, xmm4
        pcmpeqd xmm7, xmmword ptr [rsp + 56]
        pcmpeqd xmm6, xmmword ptr [rsp + 40]
 packsswb        xmm0, xmm3
        packssdw        xmm6, xmm7
 pcmpeqd xmm5, xmmword ptr [rsp + 24]
        packsswb        xmm2, xmm6
 pcmpeqd xmm1, xmmword ptr [rsp + 8]
        packssdw        xmm1, xmm5
 movdqa  xmm3, xmmword ptr [rsp + 328]
        movdqa  xmm4, xmmword ptr [rsp + 312]
        pcmpeqd xmm3, xmmword ptr [rsp + 152]
 pcmpeqd xmm4, xmmword ptr [rsp + 136]
        packssdw        xmm4, xmm3
 pand    xmm4, xmm1
        psrlw   xmm4, 8
        packuswb xmm1, xmm4
        punpckhqdq      xmm1, xmm2
        psllw   xmm1, 7
 pand    xmm1, xmm0
        pmovmskb        ecx, xmm1
        xor eax, eax
        cmp     ecx, 65535
        jne     .LBB1_5
        movq xmm0, qword ptr [rsp + 168]
        movq    xmm1, qword ptr [rsp + 344]
 pcmpeqd xmm1, xmm0
        pshufd  xmm0, xmm1, 80
        movmskpd ecx, xmm0
        test    cl, 1
        je      .LBB1_5
        shr cl
        je      .LBB1_5
        mov     ecx, dword ptr [rsp + 352]
 cmp     dword ptr [rsp + 176], ecx
        jne     .LBB1_5
        lea rax, [rsp + 184]
        lea     rcx, [rsp + 8]
        mov     ecx, dword ptr [rcx + 172]
        cmp     ecx, dword ptr [rax + 172]
 sete    al
.LBB1_5:
        ret
```

If I declare `equal_impl` as `static`, then we stop emitting code for `equal_impl` (as expected), and the generated code for `equal` does not change until we get to a size >= 45 (the cut-off point where, without `static`, it would generate a `jmp`). From that point on, `equal` continues its pattern of just adding more and more instructions up to size == 59 (related to https://github.com/llvm/llvm-project/issues/167389 for this exact code pattern in the implementation). However, at size == 60 both versions start generating the same code again (and it's really bad: it has calls to `memcpy`):

```asm
equal_impl(array, array):
        lea rax, [rsp + 248]
        lea     rcx, [rsp + 8]
        xor     edx, edx
.LBB0_1:
        mov     esi, dword ptr [rcx + 4*rdx]
        mov edi, dword ptr [rax + 4*rdx]
        cmp     esi, edi
        jne .LBB0_3
        cmp     rdx, 59
        lea     rdx, [rdx + 1]
        jne .LBB0_1
.LBB0_3:
        cmp     esi, edi
        sete    al
 ret

equal(array, array):
        sub     rsp, 488
        lea     rdi, [rsp + 248]
        lea     rsi, [rsp + 496]
        mov     edx, 240
 call    memcpy@PLT
        lea     rdi, [rsp + 8]
        lea     rsi, [rsp + 736]
        mov     edx, 240
        call    memcpy@PLT
 xor     eax, eax
.LBB1_1:
        mov     ecx, dword ptr [rsp + 4*rax + 248]
        mov     edx, dword ptr [rsp + 4*rax + 8]
        cmp     ecx, edx
        jne     .LBB1_3
        cmp     rax, 59
        lea     rax, [rax + 1]
        jne     .LBB1_1
.LBB1_3:
        cmp     ecx, edx
 sete    al
        add     rsp, 488
        ret
```

See it live: https://godbolt.org/z/nPndKvdPn

Note that the examples used here are similar to those in my other recent bug reports around `memcmp`, but other code patterns in the impl function cause the same behavior, at varying levels of code complexity and data size. The underlying issue here is that inlined code is not optimized properly.
_______________________________________________
llvm-bugs mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs

Reply via email to