Re: [__mulvti3] register allocator plays shell game

Stefan Kanthak Mon, 26 Oct 2020 16:01:45 -0700

Richard Biener <[email protected]> wrote:

> On Sun, Oct 25, 2020 at 8:37 PM Stefan Kanthak <[email protected]> 
> wrote:
>>
>> Hi,
>>
>> for the AMD64 alias x86_64 platform and the __int128_t [DW]type,
>> the first few lines of the __mulvDI3() function from libgcc2.c
>>
>> | DWtype
>> | __mulvDI3 (DWtype u, DWtype v)
>> | {
>> |   /* The unchecked multiplication needs 3 Wtype x Wtype multiplications,
>> |      but the checked multiplication needs only two.  */
>> |   const DWunion uu = {.ll = u};
>> |   const DWunion vv = {.ll = v};
>> |
>> |   if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1))
>> |     {
>> |       /* u fits in a single Wtype.  */
>> |       if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
>> |  {
>> |    /* v fits in a single Wtype as well.  */
>> |    /* A single multiplication.  No overflow risk.  */
>> |    return (DWtype) uu.s.low * (DWtype) vv.s.low;
>> |  }
>>
>> are compiled to this braindead code (obtained from libgcc.a of
>> GCC 10.2.0 installed on Debian):
>>
>> 0000000000000000 <__mulvti3>:
>>    0: 41 55                 push   %r13
>>    2: 49 89 cb              mov    %rcx,%r11
>>    5: 48 89 d0              mov    %rdx,%rax
>>    8: 49 89 d2              mov    %rdx,%r10
>>    b: 41 54                 push   %r12
>>    d: 49 89 fc              mov    %rdi,%r12
>>   10: 48 89 d1              mov    %rdx,%rcx
>>   13: 49 89 f0              mov    %rsi,%r8
>>   16: 4c 89 e2              mov    %r12,%rdx
>>   19: 49 89 f5              mov    %rsi,%r13
>>   1c: 53                    push   %rbx
>>   1d: 48 89 fe              mov    %rdi,%rsi
>>   20: 48 c1 fa 3f           sar    $0x3f,%rdx
>>   24: 48 c1 f8 3f           sar    $0x3f,%rax
>>   28: 4c 89 df              mov    %r11,%rdi
>>   2b: 4c 39 c2              cmp    %r8,%rdx
>>   2e: 75 18                 jne    48 <__mulvti3+0x48>
>>   30: 4c 39 d8              cmp    %r11,%rax
>>   33: 75 6b                 jne    a0 <__mulvti3+0xa0>
>>   35: 4c 89 e0              mov    %r12,%rax
>>   38: 49 f7 ea              imul   %r10
>>   3b: 5b                    pop    %rbx
>>   3c: 41 5c                 pop    %r12
>>   3e: 41 5d                 pop    %r13
>>   40: c3                    retq
>> ...
>>
>> There are EIGHT superfluous MOV instructions here, clobbering the
>> non-volatile registers RBX, R12 and R13, plus THREE superfluous
>> PUSH/POP pairs.
>>
>> What stops GCC from generating the following straightforward code
>> (11 instructions in 31 bytes instead of 25 instructions in 65 bytes)?
>>
>> .intel_syntax noprefix
>> __mulvti3:
>>     mov   r8, rdi
>>     mov   r9, rdx
>>     sra   r8, 63
>>     sra   r9, 63
>>     cmp   r8, rsi
>>     jne   __mulvti3+0x48+65-31
>>     cmp   r9, rcx
>>     jne   __mulvti3+0xa0+65-31
>>     mov   rax, rdi
>>     imul  rdx
>>     ret
>> ...
>>
>>
>> not amused
> 
> can you open a bugreport please?



I'd like to discuss alternative implementations of __mulvti3() first:
the very first lines of libgcc2.c reads

| /* More subroutines needed by GCC output code on some machines.  */
| /* Compile this one with gcc.  */


Why don't you take advantage of that and implement __mulvDI3() as
follows?

DWtype
__mulvDI3 (DWtype multiplicand, DWtype multiplier)
{
    DWtype product;

    if (__builtin_mul_overflow(multiplicand, multiplier, &product))
        abort();

    return product;
}


For the AMD64 platform, instead of the 131 instructions in 439 bytes
(partially cited above) GCC now generates 107 instructions in 324 bytes
 ... (un)fortunately showing this bug again, now clobbering RBP and RBX,
pushing/popping RBP, RBX, R12, R14 and R15, and still moving them around
without necessity:


__mulvti3:
        movq    %rdi, %r8
        movq    %rdx, %rdi
        pushq   %r15
        movq    %rcx, %r9
        movq    %r8, %rdx
        movq    %rdi, %rax
        pushq   %r14
        sarq    $63, %rdx
        pushq   %r12
        sarq    $63, %rax
        pushq   %rbp
        xorl    %ebp, %ebp
        pushq   %rbx
        cmpq    %rsi, %rdx
        jne     .L4
        cmpq    %rcx, %rax
        jne     .L5
        movq    %r8, %rax
        imulq   %rdi
        movq    %rax, %rbx
.L2:
        testq   %rbp, %rbp
        jne     __mulvti3.cold
        movq    %rbx, %rax
        popq    %rbx
        popq    %rbp
        popq    %r12
        popq    %r14
        popq    %r15
        ret
...
.L4:
        cmpq    %rcx, %rax
        jne     .L7
...
        jns     .L8
        xorl    %r10d, %r10d
        subq    %r10, %rax
        sbbq    %r12, %rdx
.L8:
        testq    %r12, %r12
        jns     .L9
...
        jne     .L10
        movq    %rcx, %rbx
        movq    %r10, %rdx
        jmp     .L2
...
        jmp     .L6
.L7:
...
        ja      .L3
...
        ja      .L3
...
        jne     .L11
...
        jl      .L2
.L3:
        movl        $1, %ebp
        jmp     .L2
.L11:
        testq    %r10, %r10
        js      .L2
        jmp     .L3
.L10:
...
        jmp     .L3
__mulvti3.cold:
.L13:
        call    abort

Besides the poor register allocation, this code shows 12 conditional
plus 5 unconditional branches.


Let's try a second alternative implementation:

DWtype
__mulvDI3 (DWtype multiplicand, DWtype multiplier)
{
    UDWtype product;
    UDWtype sign = multiplicand >> (DW_TYPE_SIZE - 1);
    UDWtype tmp = multiplier >> (DW_TYPE_SIZE - 1);

    if (__builtin_mul_overflow((multiplicand ^ sign) - sign,
                               (multiplier ^ tmp) - tmp,
                               &product))
        abort();

    sign ^= tmp;
    product = (product ^ sign) - sign;

    if ((__int128_t) (product ^ sign) < 0)
        abort();

    return product;
}


This lets GCC generate 98 instructions in 293 bytes, with 6 conditional
plus 3 unconditional branches:

__mulvti3:
        movq    %rsi, %rax
        pushq   %r15
        sarq    $63, %rax
        pushq   %r14
        movq    %rdx, %r14
        movq    %rax, %r10
        pushq   %r13
        movq    %rax, %r11
        movq    %rcx, %rax
        pushq   %r12
        sarq    $63, %rax
        pushq   %rbp
        movq    %rdi, %rbp
        movq    %rsi, %rdi
        movq    %rax, %r8
        xorq    %r10, %rbp
        xorq    %r10, %rdi
        movq    %rax, %r9
        pushq   %rbx
        movq    %rbp, %rax
        movq    %rdi, %rdx
        subq    %r10, %rax
        sbbq    %r10, %rdx
        xorq    %r8, %r14
        xorq    %r8, %rcx
        movq    %rax, %rsi
        movq    %r14, %r12
        movq    %rcx, %r13
        movq    %rdx, %rdi
        subq    %r8, %r12
        sbbq    %r8, %r13
        xorl    %ebp, %ebp
        movq    %r13, %rcx
        testq   %rdx, %rdx
        jne     .L4
        testq   %r13, %r13
        jne     .L5
        mulq    %r12
        movq    %rax, %r14
        movq    %rdx, %rcx
.L2:
        testq   %rbp, %rbp
        jne     .L10
        movq    %r10, %rax
        xorq    %r8, %rax
        movq    %rax, %r12
        movq    %r11, %rax
        xorq    %r9, %rax
        xorq    %r12, %r14
        movq    %rax, %r13
        movq    %r14, %rax
        xorq    %r13, %rcx
        subq    %r12, %rax
        movq    %r13, %rbx
        movq    %rcx, %rdx
        sbbq    %r13, %rdx
        xorq    %rdx, %rbx
        js      .L10
        popq    %rbx
        popq    %rbp
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %r15
        ret
        .p2align 4,,10
        .p2align 3
.L4:
        testq   %r13, %r13
        jne     .L8
        movq    %rdx, %rcx
        movq    %r12, %rbx
.L6:
        movq    %rsi, %rax
        mulq    %r12
        movq    %rax, %r14
        movq    %rbx, %rax
        movq    %rdx, %r15
        xorl    %ebx, %ebx
        mulq    %rcx
        addq    %r15, %rax
        adcq    %rbx, %rdx
        testq   %rdx, %rdx
        jne     .L8
        movq    %rax, %rcx
        jmp     .L2
        .p2align 4,,10
        .p2align 3
.L5:
        movq    %rax, %rbx
        jmp     .L6
        .p2align 4,,10
        .p2align 3
.L8:
        movq    %rdi, %rcx
        movq    %r13, %rax
        movl    $1, %ebp
        imulq   %rsi, %rax
        imulq   %r12, %rcx
        addq    %rax, %rcx
        movq    %rsi, %rax
        mulq    %r12
        addq    %rdx, %rcx
        movq    %rax, %r14
        jmp     .L2
__mulvti3.cold:
.L10:
        call    abort


PROPER (assembly) code but uses only 44 instructions in just 121 bytes,
with ONE conditional branch!

--- mulvti3.s ---
# Copyright (C) 2014-2020, Stefan Kanthak
        .arch   generic64
        .code64
        .intel_syntax noprefix
        .global abort
        .global __mulvti3
        .type   __mulvti3, @function
        .text
                            # rsi:rdi = multiplicand
                            # rcx:rdx = multiplier
__mulvti3:
        mov     r10, rdx
        mov     r11, rcx
        mov     rax, rcx
        cqo                 # rdx = tmp
        mov     r9, rdx
        xor     r10, rdx
        xor     r11, rdx
        sub     r10, rdx
        sbb     r11, rdx    # r11:r10 = |multiplier|
        mov     rax, rsi
        cqo                 # rdx = sign
        xor     r9, rdx     # r9 = sign ^ tmp
        xor     rdi, rdx
        xor     rsi, rdx
        sub     rdi, rdx
        sbb     rsi, rdx    # rsi:rdi = |multiplicand|
        xor     ecx, ecx
        cmp     rcx, rsi
        sbb     edx, edx    # edx = (|multiplicand| < 2**64) ? 0 : -1
        cmp     rcx, r11
        sbb     ecx, ecx    # ecx = (|multiplier| < 2**64) ? 0 : -1
        and     ecx, edx    # ecx = (|product| < 2**128) ? 0 : -1
        mov     rax, rsi
        mul     r10
        adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
        mov     rsi, rax
        mov     rax, rdi
        mul     r11
        adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
        add     rsi, rax
#       adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
        mov     rax, rdi
        mul     r10
        add     rdx, rsi
        adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
        add     rax, r9
        adc     rdx, r9     # rdx:rax = (product < 0) ? ~product % 2**128 : 
product % 2**128
        mov     rsi, rdx    # rsi = (product % 2**128 < -2**127)
                            #     | (product % 2**128 >= 2**127) ? negative : 
positive
        xor     rax, r9
        xor     rdx, r9     # rdx:rax = product % 2**128
        add     rsi, rsi
        adc     ecx, ecx    # ecx = (product >= -2**127)
                            #     & (product <   2**127) ? 0 : *
        jnz     .overflow
        ret
.overflow:
        call    abort
        .end
--- EOF ---

Re: [__mulvti3] register allocator plays shell game

Reply via email to