https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84719

--- Comment #11 from gpnuma at centaurean dot com ---
Yes, it's not the init loop that's the problem. Just to make sure, I ran the
following code:

#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

int main(int argc, char *argv[]) {
    const uint64_t size = 1000000000;
    const size_t alloc_mem = size * sizeof(uint8_t);
    uint8_t *mem = malloc(alloc_mem);
//    init loop intentionally left disabled, to rule it out:
//    for (uint_fast64_t i = 0; i < size; i++)
//        mem[i] = (uint8_t) (i >> 7);

    uint_fast64_t counter = 0;
    uint64_t total = 0x123456789abcdefllu;
    uint64_t receiver = 0;

    printf("%u ...\n", 3);
    counter = 0;
    while (counter < size - 8) {
        __builtin_memcpy(&receiver, &mem[counter], 3);  // only the low 3 bytes are written; the upper 5 stay 0
//        receiver &= (0xffffffffffffffffllu >> (64 - ((3) << 3)));
        total += ((receiver * 0x321654987cbafedllu) >> 48);
        counter += 3;
    }

    printf("=> %llu\n", total);
    return EXIT_SUCCESS;
}
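
(The compiler flags aren't shown in this comment; presumably both compilers
were invoked with optimizations on, along the lines of the following, where
test.c is a placeholder file name:)

gcc -O3 test.c && time ./a.out
clang -O3 test.c && time ./a.out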

The result is as follows (the computed sum is meaningless since the memory is
never initialized):
gcc:
3 ...
=> 81985529216486895

real    0m3.180s
user    0m2.822s
sys     0m0.328s

clang:
3 ...
=> 81985529216486895

real    0m0.972s
user    0m0.621s
sys     0m0.338s

clang is still roughly 3-4x faster (about 3.3x wall-clock, 4.5x user time).
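
For what it's worth, since receiver starts at 0 and only its low 3 bytes are
ever overwritten, the memcpy can be replaced by hand-assembled shifts and ors,
which is roughly what clang's register-only codegen does. A minimal
little-endian sketch (load3 is an illustrative name, not from the reproducer):

static inline uint64_t load3(const uint8_t *p) {
    // Assemble 3 little-endian bytes using only register operations,
    // mirroring clang's movzwl/movzbl/shlq/orq sequence (sketch only).
    return (uint64_t) p[0]
         | ((uint64_t) p[1] << 8)
         | ((uint64_t) p[2] << 16);
}

The loop body then becomes receiver = load3(&mem[counter]); the upper 40 bits
of receiver are zero either way, so the computed total is unchanged.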

(In reply to Richard Biener from comment #9)
> So with 2 bytes we get
> 
> .L3:
>         movzwl  (%rax), %edx
>         addq    $3, %rax
>         movw    %dx, 8(%rsp)
>         movq    8(%rsp), %rdx
>         imulq   %rcx, %rdx
>         shrq    $48, %rdx
>         addq    %rdx, %rsi
>         cmpq    %rdi, %rax
>         jne     .L3
> 
> while with 3 bytes we see
> 
> .L3:
>         movzwl  (%rax), %edx
>         addq    $3, %rax
>         movw    %dx, 8(%rsp)
>         movzbl  -1(%rax), %edx
>         movb    %dl, 10(%rsp)
>         movq    8(%rsp), %rdx
>         imulq   %rcx, %rdx
>         shrq    $48, %rdx
>         addq    %rdx, %rsi
>         cmpq    %rdi, %rax
>         jne     .L3
> 
> while clang outputs
> 
> .LBB0_3:                                # =>This Inner Loop Header: Depth=1
>         movzwl  (%r14,%rcx), %edx
>         movzbl  2(%r14,%rcx), %edi
>         shlq    $16, %rdi
>         orq     %rdx, %rdi
>         andq    $-16777216, %rbx        # imm = 0xFFFFFFFFFF000000
>         orq     %rdi, %rbx
>         movq    %rbx, %rdx
>         imulq   %rax, %rdx
>         shrq    $48, %rdx
>         addq    %rdx, %rsi
>         addq    $3, %rcx
>         cmpq    $999999992, %rcx        # imm = 0x3B9AC9F8
>         jb      .LBB0_3
> 
> that _looks_ slower.  Are you sure performance isn't dominated by the
> first init loop (both GCC and clang vectorize it)?  I notice we spill
> in the above loop for the bitfield insert where clang uses register
> operations.  We refuse to inline the memcpy at the GIMPLE level
> and further refuse to optimize it to a BIT_INSERT_EXPR, which would
> be a possibility.
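
At the source level, the BIT_INSERT_EXPR mentioned above corresponds to the
masked insert visible in clang's asm (the andq $-16777216 / orq pair): keep
the upper 40 bits of the destination, replace the low 24. A hedged sketch of
the equivalent C (little-endian; the lo24 name is illustrative):

uint64_t lo24 = (uint64_t) mem[counter]
              | ((uint64_t) mem[counter + 1] << 8)
              | ((uint64_t) mem[counter + 2] << 16);
receiver = (receiver & ~(uint64_t) 0xffffff) | lo24;  /* == andq/orq pair */

GCC instead goes through a stack slot (the movw/movb stores followed by the
movq reload), which is the spill referred to above.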
