Re: [__mulvti3] register allocator plays shell game

Richard Biener via Gcc Tue, 27 Oct 2020 01:34:03 -0700

On Tue, Oct 27, 2020 at 12:01 AM Stefan Kanthak <stefan.kant...@nexgo.de> wrote:
>
> Richard Biener <richard.guent...@gmail.com> wrote:
>
> > On Sun, Oct 25, 2020 at 8:37 PM Stefan Kanthak <stefan.kant...@nexgo.de> 
> > wrote:
> >>
> >> Hi,
> >>
> >> for the AMD64 alias x86_64 platform and the __int128_t [DW]type,
> >> the first few lines of the __mulvDI3() function from libgcc2.c
> >>
> >> | DWtype
> >> | __mulvDI3 (DWtype u, DWtype v)
> >> | {
> >> |   /* The unchecked multiplication needs 3 Wtype x Wtype multiplications,
> >> |      but the checked multiplication needs only two.  */
> >> |   const DWunion uu = {.ll = u};
> >> |   const DWunion vv = {.ll = v};
> >> |
> >> |   if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1))
> >> |     {
> >> |       /* u fits in a single Wtype.  */
> >> |       if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 
> >> 1))
> >> |  {
> >> |    /* v fits in a single Wtype as well.  */
> >> |    /* A single multiplication.  No overflow risk.  */
> >> |    return (DWtype) uu.s.low * (DWtype) vv.s.low;
> >> |  }
> >>
> >> are compiled to this braindead code (obtained from libgcc.a of
> >> GCC 10.2.0 installed on Debian):
> >>
> >> 0000000000000000 <__mulvti3>:
> >>    0: 41 55                 push   %r13
> >>    2: 49 89 cb              mov    %rcx,%r11
> >>    5: 48 89 d0              mov    %rdx,%rax
> >>    8: 49 89 d2              mov    %rdx,%r10
> >>    b: 41 54                 push   %r12
> >>    d: 49 89 fc              mov    %rdi,%r12
> >>   10: 48 89 d1              mov    %rdx,%rcx
> >>   13: 49 89 f0              mov    %rsi,%r8
> >>   16: 4c 89 e2              mov    %r12,%rdx
> >>   19: 49 89 f5              mov    %rsi,%r13
> >>   1c: 53                    push   %rbx
> >>   1d: 48 89 fe              mov    %rdi,%rsi
> >>   20: 48 c1 fa 3f           sar    $0x3f,%rdx
> >>   24: 48 c1 f8 3f           sar    $0x3f,%rax
> >>   28: 4c 89 df              mov    %r11,%rdi
> >>   2b: 4c 39 c2              cmp    %r8,%rdx
> >>   2e: 75 18                 jne    48 <__mulvti3+0x48>
> >>   30: 4c 39 d8              cmp    %r11,%rax
> >>   33: 75 6b                 jne    a0 <__mulvti3+0xa0>
> >>   35: 4c 89 e0              mov    %r12,%rax
> >>   38: 49 f7 ea              imul   %r10
> >>   3b: 5b                    pop    %rbx
> >>   3c: 41 5c                 pop    %r12
> >>   3e: 41 5d                 pop    %r13
> >>   40: c3                    retq
> >> ...
> >>
> >> There are EIGHT superfluous MOV instructions here, clobbering the
> >> non-volatile registers RBX, R12 and R13, plus THREE superfluous
> >> PUSH/POP pairs.
> >>
> >> What stops GCC from generating the following straightforward code
> >> (11 instructions in 31 bytes instead of 25 instructions in 65 bytes)?
> >>
> >> .intel_syntax noprefix
> >> __mulvti3:
> >>     mov   r8, rdi
> >>     mov   r9, rdx
> >>     sra   r8, 63
> >>     sra   r9, 63
> >>     cmp   r8, rsi
> >>     jne   __mulvti3+0x48+65-31
> >>     cmp   r9, rcx
> >>     jne   __mulvti3+0xa0+65-31
> >>     mov   rax, rdi
> >>     imul  rdx
> >>     ret
> >> ...
> >>
> >>
> >> not amused
> >
> > can you open a bugreport please?
>
>
> I'd like to discuss alternative implementations of __mulvti3() first:
> the very first lines of libgcc2.c reads
>
> | /* More subroutines needed by GCC output code on some machines.  */
> | /* Compile this one with gcc.  */
>
>
> Why don't you take advantage of that and implement __mulvDI3() as
> follows?
>
> DWtype
> __mulvDI3 (DWtype multiplicand, DWtype multiplier)
> {
>     DWtype product;
>
>     if (__builtin_mul_overflow(multiplicand, multiplier, &product))
>         abort();
>
>     return product;
> }


Sure, that's possible - but the main concern is that such fancy builtins
can end up calling libgcc.  Which isn't really relevant for the
trap-on-overflow variants though.

I guess such change is desirable (also for the other arithmetic ops)
and code generation improvements should be done for the
__builtin_mul_overflow expansion, outside of libgcc.

Note __OPvMODE are considered legacy and -ftrapv dysfunctional
in general ...

>
> For the AMD64 platform, instead of the 131 instructions in 439 bytes
> (partially cited above) GCC now generates 107 instructions in 324 bytes
>  ... (un)fortunately showing this bug again, now clobbering RBP and RBX,
> pushing/popping RBP, RBX, R12, R14 and R15, and still moving them around
> without necessity:
>
>
> __mulvti3:
>         movq    %rdi, %r8
>         movq    %rdx, %rdi
>         pushq   %r15
>         movq    %rcx, %r9
>         movq    %r8, %rdx
>         movq    %rdi, %rax
>         pushq   %r14
>         sarq    $63, %rdx
>         pushq   %r12
>         sarq    $63, %rax
>         pushq   %rbp
>         xorl    %ebp, %ebp
>         pushq   %rbx
>         cmpq    %rsi, %rdx
>         jne     .L4
>         cmpq    %rcx, %rax
>         jne     .L5
>         movq    %r8, %rax
>         imulq   %rdi
>         movq    %rax, %rbx
> .L2:
>         testq   %rbp, %rbp
>         jne     __mulvti3.cold
>         movq    %rbx, %rax
>         popq    %rbx
>         popq    %rbp
>         popq    %r12
>         popq    %r14
>         popq    %r15
>         ret
> ...
> .L4:
>         cmpq    %rcx, %rax
>         jne     .L7
> ...
>         jns     .L8
>         xorl    %r10d, %r10d
>         subq    %r10, %rax
>         sbbq    %r12, %rdx
> .L8:
>         testq    %r12, %r12
>         jns     .L9
> ...
>         jne     .L10
>         movq    %rcx, %rbx
>         movq    %r10, %rdx
>         jmp     .L2
> ...
>         jmp     .L6
> .L7:
> ...
>         ja      .L3
> ...
>         ja      .L3
> ...
>         jne     .L11
> ...
>         jl      .L2
> .L3:
>         movl        $1, %ebp
>         jmp     .L2
> .L11:
>         testq    %r10, %r10
>         js      .L2
>         jmp     .L3
> .L10:
> ...
>         jmp     .L3
> __mulvti3.cold:
> .L13:
>         call    abort
>
> Besides the poor register allocation, this code shows 12 conditional
> plus 5 unconditional branches.
>
>
> Let's try a second alternative implementation:
>
> DWtype
> __mulvDI3 (DWtype multiplicand, DWtype multiplier)
> {
>     UDWtype product;
>     UDWtype sign = multiplicand >> (DW_TYPE_SIZE - 1);
>     UDWtype tmp = multiplier >> (DW_TYPE_SIZE - 1);
>
>     if (__builtin_mul_overflow((multiplicand ^ sign) - sign,
>                                (multiplier ^ tmp) - tmp,
>                                &product))
>         abort();
>
>     sign ^= tmp;
>     product = (product ^ sign) - sign;
>
>     if ((__int128_t) (product ^ sign) < 0)
>         abort();
>
>     return product;
> }
>
>
> This lets GCC generate 98 instructions in 293 bytes, with 6 conditional
> plus 3 unconditional branches:
>
> __mulvti3:
>         movq    %rsi, %rax
>         pushq   %r15
>         sarq    $63, %rax
>         pushq   %r14
>         movq    %rdx, %r14
>         movq    %rax, %r10
>         pushq   %r13
>         movq    %rax, %r11
>         movq    %rcx, %rax
>         pushq   %r12
>         sarq    $63, %rax
>         pushq   %rbp
>         movq    %rdi, %rbp
>         movq    %rsi, %rdi
>         movq    %rax, %r8
>         xorq    %r10, %rbp
>         xorq    %r10, %rdi
>         movq    %rax, %r9
>         pushq   %rbx
>         movq    %rbp, %rax
>         movq    %rdi, %rdx
>         subq    %r10, %rax
>         sbbq    %r10, %rdx
>         xorq    %r8, %r14
>         xorq    %r8, %rcx
>         movq    %rax, %rsi
>         movq    %r14, %r12
>         movq    %rcx, %r13
>         movq    %rdx, %rdi
>         subq    %r8, %r12
>         sbbq    %r8, %r13
>         xorl    %ebp, %ebp
>         movq    %r13, %rcx
>         testq   %rdx, %rdx
>         jne     .L4
>         testq   %r13, %r13
>         jne     .L5
>         mulq    %r12
>         movq    %rax, %r14
>         movq    %rdx, %rcx
> .L2:
>         testq   %rbp, %rbp
>         jne     .L10
>         movq    %r10, %rax
>         xorq    %r8, %rax
>         movq    %rax, %r12
>         movq    %r11, %rax
>         xorq    %r9, %rax
>         xorq    %r12, %r14
>         movq    %rax, %r13
>         movq    %r14, %rax
>         xorq    %r13, %rcx
>         subq    %r12, %rax
>         movq    %r13, %rbx
>         movq    %rcx, %rdx
>         sbbq    %r13, %rdx
>         xorq    %rdx, %rbx
>         js      .L10
>         popq    %rbx
>         popq    %rbp
>         popq    %r12
>         popq    %r13
>         popq    %r14
>         popq    %r15
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L4:
>         testq   %r13, %r13
>         jne     .L8
>         movq    %rdx, %rcx
>         movq    %r12, %rbx
> .L6:
>         movq    %rsi, %rax
>         mulq    %r12
>         movq    %rax, %r14
>         movq    %rbx, %rax
>         movq    %rdx, %r15
>         xorl    %ebx, %ebx
>         mulq    %rcx
>         addq    %r15, %rax
>         adcq    %rbx, %rdx
>         testq   %rdx, %rdx
>         jne     .L8
>         movq    %rax, %rcx
>         jmp     .L2
>         .p2align 4,,10
>         .p2align 3
> .L5:
>         movq    %rax, %rbx
>         jmp     .L6
>         .p2align 4,,10
>         .p2align 3
> .L8:
>         movq    %rdi, %rcx
>         movq    %r13, %rax
>         movl    $1, %ebp
>         imulq   %rsi, %rax
>         imulq   %r12, %rcx
>         addq    %rax, %rcx
>         movq    %rsi, %rax
>         mulq    %r12
>         addq    %rdx, %rcx
>         movq    %rax, %r14
>         jmp     .L2
> __mulvti3.cold:
> .L10:
>         call    abort
>
>
> PROPER (assembly) code but uses only 44 instructions in just 121 bytes,
> with ONE conditional branch!
>
> --- mulvti3.s ---
> # Copyright (C) 2014-2020, Stefan Kanthak
>         .arch   generic64
>         .code64
>         .intel_syntax noprefix
>         .global abort
>         .global __mulvti3
>         .type   __mulvti3, @function
>         .text
>                             # rsi:rdi = multiplicand
>                             # rcx:rdx = multiplier
> __mulvti3:
>         mov     r10, rdx
>         mov     r11, rcx
>         mov     rax, rcx
>         cqo                 # rdx = tmp
>         mov     r9, rdx
>         xor     r10, rdx
>         xor     r11, rdx
>         sub     r10, rdx
>         sbb     r11, rdx    # r11:r10 = |multiplier|
>         mov     rax, rsi
>         cqo                 # rdx = sign
>         xor     r9, rdx     # r9 = sign ^ tmp
>         xor     rdi, rdx
>         xor     rsi, rdx
>         sub     rdi, rdx
>         sbb     rsi, rdx    # rsi:rdi = |multiplicand|
>         xor     ecx, ecx
>         cmp     rcx, rsi
>         sbb     edx, edx    # edx = (|multiplicand| < 2**64) ? 0 : -1
>         cmp     rcx, r11
>         sbb     ecx, ecx    # ecx = (|multiplier| < 2**64) ? 0 : -1
>         and     ecx, edx    # ecx = (|product| < 2**128) ? 0 : -1
>         mov     rax, rsi
>         mul     r10
>         adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
>         mov     rsi, rax
>         mov     rax, rdi
>         mul     r11
>         adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
>         add     rsi, rax
> #       adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
>         mov     rax, rdi
>         mul     r10
>         add     rdx, rsi
>         adc     ecx, ecx    # ecx = (|product| < 2**128) ? 0 : *
>         add     rax, r9
>         adc     rdx, r9     # rdx:rax = (product < 0) ? ~product % 2**128 : 
> product % 2**128
>         mov     rsi, rdx    # rsi = (product % 2**128 < -2**127)
>                             #     | (product % 2**128 >= 2**127) ? negative : 
> positive
>         xor     rax, r9
>         xor     rdx, r9     # rdx:rax = product % 2**128
>         add     rsi, rsi
>         adc     ecx, ecx    # ecx = (product >= -2**127)
>                             #     & (product <   2**127) ? 0 : *
>         jnz     .overflow
>         ret
> .overflow:
>         call    abort
>         .end
> --- EOF ---

Re: [__mulvti3] register allocator plays shell game

Reply via email to