https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84719

--- Comment #6 from gpnuma at centaurean dot com ---
If you compile the following code (with -O3 as the only flag):

#include <sys/stat.h>
#include <sys/types.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>

/*
 * Micro-benchmark: fill a large byte buffer with a deterministic pattern,
 * then walk it in 3-byte steps, loading 3 bytes at a time into a 64-bit
 * accumulator input and folding a hash-like value into `total`.
 * The final `total` is printed so the loop cannot be discarded as dead code.
 *
 * NOTE(review): argc and argv are unused.
 */
int main(int argc, char *argv[]) {
    /* Number of bytes in the test buffer (10^9). */
    const uint64_t size = 1000000000;
    const size_t alloc_mem = size * sizeof(uint8_t);
    /* NOTE(review): the malloc result is not checked before it is written
     * to below, and the buffer is never freed before returning. */
    uint8_t *mem = malloc(alloc_mem);
    /* Fill pattern: each byte holds the low 8 bits of (index >> 7), so the
     * stored value changes once every 128 bytes. */
    for (uint_fast64_t i = 0; i < size; i++)
        mem[i] = (uint8_t) (i >> 7);

    uint_fast64_t counter = 0;
    /* Arbitrary non-zero starting value for the running sum. */
    uint64_t total = 0x123456789abcdefllu;
    /* Destination for the 3-byte copies; only 3 of its 8 bytes are
     * overwritten on each iteration. */
    uint64_t receiver = 0;

    /* Prints the chunk width (3 bytes) being exercised.
     * NOTE(review): %u is given the int constant 3. */
    printf("%u ...\n", 3);
    /* Redundant: counter is already 0 from its initializer above. */
    counter = 0;
    /* Stops 8 bytes short of the end of the buffer; presumably a margin that
     * would also keep full 8-byte loads in bounds — TODO confirm intent. */
    while (counter < size - 8) {
        /* Copy 3 bytes from the buffer into receiver's object storage. */
        __builtin_memcpy(&receiver, &mem[counter], 3);
        /* 64 - (3 << 3) == 40, so the mask is 0xffffff: keep the low 24 bits. */
        receiver &= (0xffffffffffffffffllu >> (64 - ((3) << 3)));
        /* Multiply by a fixed 64-bit constant (wrapping, unsigned) and add the
         * top 16 bits of the product to the running total. */
        total += ((receiver * 0x321654987cbafedllu) >> 48);
        counter += 3;
    }

    /* NOTE(review): %llu is given a uint64_t; this only matches where
     * uint64_t is unsigned long long (PRIu64 would be the portable form). */
    printf("=> %llu\n", total);
    return EXIT_SUCCESS;
}

Here are the results:
gcc
time ./a.out
3 ...
=> 81996806116422545

real    0m4.145s
user    0m3.691s
sys     0m0.396s

clang
time ./a.out
3 ...
=> 81996806116422545

real    0m1.246s
user    0m0.855s
sys     0m0.374s

The clang build is roughly 4x faster in user time (0.855s vs. 3.691s).

Reply via email to