Richard Biener <[email protected]> wrote:
> On Sun, Oct 25, 2020 at 8:37 PM Stefan Kanthak <[email protected]>
> wrote:
>>
>> Hi,
>>
>> for the AMD64 alias x86_64 platform and the __int128_t [DW]type,
>> the first few lines of the __mulvDI3() function from libgcc2.c
>>
>> | DWtype
>> | __mulvDI3 (DWtype u, DWtype v)
>> | {
>> | /* The unchecked multiplication needs 3 Wtype x Wtype multiplications,
>> | but the checked multiplication needs only two. */
>> | const DWunion uu = {.ll = u};
>> | const DWunion vv = {.ll = v};
>> |
>> | if (__builtin_expect (uu.s.high == uu.s.low >> (W_TYPE_SIZE - 1), 1))
>> | {
>> | /* u fits in a single Wtype. */
>> | if (__builtin_expect (vv.s.high == vv.s.low >> (W_TYPE_SIZE - 1), 1))
>> | {
>> | /* v fits in a single Wtype as well. */
>> | /* A single multiplication. No overflow risk. */
>> | return (DWtype) uu.s.low * (DWtype) vv.s.low;
>> | }
>>
>> are compiled to this braindead code (obtained from libgcc.a of
>> GCC 10.2.0 installed on Debian):
>>
>> 0000000000000000 <__mulvti3>:
>> 0: 41 55 push %r13
>> 2: 49 89 cb mov %rcx,%r11
>> 5: 48 89 d0 mov %rdx,%rax
>> 8: 49 89 d2 mov %rdx,%r10
>> b: 41 54 push %r12
>> d: 49 89 fc mov %rdi,%r12
>> 10: 48 89 d1 mov %rdx,%rcx
>> 13: 49 89 f0 mov %rsi,%r8
>> 16: 4c 89 e2 mov %r12,%rdx
>> 19: 49 89 f5 mov %rsi,%r13
>> 1c: 53 push %rbx
>> 1d: 48 89 fe mov %rdi,%rsi
>> 20: 48 c1 fa 3f sar $0x3f,%rdx
>> 24: 48 c1 f8 3f sar $0x3f,%rax
>> 28: 4c 89 df mov %r11,%rdi
>> 2b: 4c 39 c2 cmp %r8,%rdx
>> 2e: 75 18 jne 48 <__mulvti3+0x48>
>> 30: 4c 39 d8 cmp %r11,%rax
>> 33: 75 6b jne a0 <__mulvti3+0xa0>
>> 35: 4c 89 e0 mov %r12,%rax
>> 38: 49 f7 ea imul %r10
>> 3b: 5b pop %rbx
>> 3c: 41 5c pop %r12
>> 3e: 41 5d pop %r13
>> 40: c3 retq
>> ...
>>
>> There are EIGHT superfluous MOV instructions here, clobbering the
>> non-volatile registers RBX, R12 and R13, plus THREE superfluous
>> PUSH/POP pairs.
>>
>> What stops GCC from generating the following straightforward code
>> (11 instructions in 31 bytes instead of 25 instructions in 65 bytes)?
>>
>> .intel_syntax noprefix
>> __mulvti3:
>> mov r8, rdi
>> mov r9, rdx
>> sra r8, 63
>> sra r9, 63
>> cmp r8, rsi
>> jne __mulvti3+0x48+65-31
>> cmp r9, rcx
>> jne __mulvti3+0xa0+65-31
>> mov rax, rdi
>> imul rdx
>> ret
>> ...
>>
>>
>> not amused
>
> can you open a bugreport please?
I'd like to discuss alternative implementations of __mulvti3() first:
the very first lines of libgcc2.c reads
| /* More subroutines needed by GCC output code on some machines. */
| /* Compile this one with gcc. */
Why don't you take advantage of that and implement __mulvDI3() as
follows?
DWtype
__mulvDI3 (DWtype multiplicand, DWtype multiplier)
{
DWtype product;
if (__builtin_mul_overflow(multiplicand, multiplier, &product))
abort();
return product;
}
For the AMD64 platform, instead of the 131 instructions in 439 bytes
(partially cited above) GCC now generates 107 instructions in 324 bytes
... (un)fortunately showing this bug again, now clobbering RBP and RBX,
pushing/popping RBP, RBX, R12, R14 and R15, and still moving them around
without necessity:
__mulvti3:
movq %rdi, %r8
movq %rdx, %rdi
pushq %r15
movq %rcx, %r9
movq %r8, %rdx
movq %rdi, %rax
pushq %r14
sarq $63, %rdx
pushq %r12
sarq $63, %rax
pushq %rbp
xorl %ebp, %ebp
pushq %rbx
cmpq %rsi, %rdx
jne .L4
cmpq %rcx, %rax
jne .L5
movq %r8, %rax
imulq %rdi
movq %rax, %rbx
.L2:
testq %rbp, %rbp
jne __mulvti3.cold
movq %rbx, %rax
popq %rbx
popq %rbp
popq %r12
popq %r14
popq %r15
ret
...
.L4:
cmpq %rcx, %rax
jne .L7
...
jns .L8
xorl %r10d, %r10d
subq %r10, %rax
sbbq %r12, %rdx
.L8:
testq %r12, %r12
jns .L9
...
jne .L10
movq %rcx, %rbx
movq %r10, %rdx
jmp .L2
...
jmp .L6
.L7:
...
ja .L3
...
ja .L3
...
jne .L11
...
jl .L2
.L3:
movl $1, %ebp
jmp .L2
.L11:
testq %r10, %r10
js .L2
jmp .L3
.L10:
...
jmp .L3
__mulvti3.cold:
.L13:
call abort
Besides the poor register allocation, this code shows 12 conditional
plus 5 unconditional branches.
Let's try a second alternative implementation:
DWtype
__mulvDI3 (DWtype multiplicand, DWtype multiplier)
{
UDWtype product;
UDWtype sign = multiplicand >> (DW_TYPE_SIZE - 1);
UDWtype tmp = multiplier >> (DW_TYPE_SIZE - 1);
if (__builtin_mul_overflow((multiplicand ^ sign) - sign,
(multiplier ^ tmp) - tmp,
&product))
abort();
sign ^= tmp;
product = (product ^ sign) - sign;
if ((__int128_t) (product ^ sign) < 0)
abort();
return product;
}
This lets GCC generate 98 instructions in 293 bytes, with 6 conditional
plus 3 unconditional branches:
__mulvti3:
movq %rsi, %rax
pushq %r15
sarq $63, %rax
pushq %r14
movq %rdx, %r14
movq %rax, %r10
pushq %r13
movq %rax, %r11
movq %rcx, %rax
pushq %r12
sarq $63, %rax
pushq %rbp
movq %rdi, %rbp
movq %rsi, %rdi
movq %rax, %r8
xorq %r10, %rbp
xorq %r10, %rdi
movq %rax, %r9
pushq %rbx
movq %rbp, %rax
movq %rdi, %rdx
subq %r10, %rax
sbbq %r10, %rdx
xorq %r8, %r14
xorq %r8, %rcx
movq %rax, %rsi
movq %r14, %r12
movq %rcx, %r13
movq %rdx, %rdi
subq %r8, %r12
sbbq %r8, %r13
xorl %ebp, %ebp
movq %r13, %rcx
testq %rdx, %rdx
jne .L4
testq %r13, %r13
jne .L5
mulq %r12
movq %rax, %r14
movq %rdx, %rcx
.L2:
testq %rbp, %rbp
jne .L10
movq %r10, %rax
xorq %r8, %rax
movq %rax, %r12
movq %r11, %rax
xorq %r9, %rax
xorq %r12, %r14
movq %rax, %r13
movq %r14, %rax
xorq %r13, %rcx
subq %r12, %rax
movq %r13, %rbx
movq %rcx, %rdx
sbbq %r13, %rdx
xorq %rdx, %rbx
js .L10
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
.p2align 4,,10
.p2align 3
.L4:
testq %r13, %r13
jne .L8
movq %rdx, %rcx
movq %r12, %rbx
.L6:
movq %rsi, %rax
mulq %r12
movq %rax, %r14
movq %rbx, %rax
movq %rdx, %r15
xorl %ebx, %ebx
mulq %rcx
addq %r15, %rax
adcq %rbx, %rdx
testq %rdx, %rdx
jne .L8
movq %rax, %rcx
jmp .L2
.p2align 4,,10
.p2align 3
.L5:
movq %rax, %rbx
jmp .L6
.p2align 4,,10
.p2align 3
.L8:
movq %rdi, %rcx
movq %r13, %rax
movl $1, %ebp
imulq %rsi, %rax
imulq %r12, %rcx
addq %rax, %rcx
movq %rsi, %rax
mulq %r12
addq %rdx, %rcx
movq %rax, %r14
jmp .L2
__mulvti3.cold:
.L10:
call abort
PROPER (assembly) code but uses only 44 instructions in just 121 bytes,
with ONE conditional branch!
--- mulvti3.s ---
# Copyright (C) 2014-2020, Stefan Kanthak
.arch generic64
.code64
.intel_syntax noprefix
.global abort
.global __mulvti3
.type __mulvti3, @function
.text
# rsi:rdi = multiplicand
# rcx:rdx = multiplier
__mulvti3:
mov r10, rdx
mov r11, rcx
mov rax, rcx
cqo # rdx = tmp
mov r9, rdx
xor r10, rdx
xor r11, rdx
sub r10, rdx
sbb r11, rdx # r11:r10 = |multiplier|
mov rax, rsi
cqo # rdx = sign
xor r9, rdx # r9 = sign ^ tmp
xor rdi, rdx
xor rsi, rdx
sub rdi, rdx
sbb rsi, rdx # rsi:rdi = |multiplicand|
xor ecx, ecx
cmp rcx, rsi
sbb edx, edx # edx = (|multiplicand| < 2**64) ? 0 : -1
cmp rcx, r11
sbb ecx, ecx # ecx = (|multiplier| < 2**64) ? 0 : -1
and ecx, edx # ecx = (|product| < 2**128) ? 0 : -1
mov rax, rsi
mul r10
adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
mov rsi, rax
mov rax, rdi
mul r11
adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
add rsi, rax
# adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
mov rax, rdi
mul r10
add rdx, rsi
adc ecx, ecx # ecx = (|product| < 2**128) ? 0 : *
add rax, r9
adc rdx, r9 # rdx:rax = (product < 0) ? ~product % 2**128 :
product % 2**128
mov rsi, rdx # rsi = (product % 2**128 < -2**127)
# | (product % 2**128 >= 2**127) ? negative :
positive
xor rax, r9
xor rdx, r9 # rdx:rax = product % 2**128
add rsi, rsi
adc ecx, ecx # ecx = (product >= -2**127)
# & (product < 2**127) ? 0 : *
jnz .overflow
ret
.overflow:
call abort
.end
--- EOF ---