https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84719
--- Comment #11 from gpnuma at centaurean dot com --- Yes, it's not the init loop that's the problem. Just to make sure, with the following code: #include <sys/stat.h> #include <sys/types.h> #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <stdbool.h> #include <string.h> int main(int argc, char *argv[]) { const uint64_t size = 1000000000; const size_t alloc_mem = size * sizeof(uint8_t); uint8_t *mem = malloc(alloc_mem); // for (uint_fast64_t i = 0; i < size; i++) // mem[i] = (uint8_t) (i >> 7); uint_fast64_t counter = 0; uint64_t total = 0x123456789abcdefllu; uint64_t receiver = 0; printf("%u ...\n", 3); counter = 0; while (counter < size - 8) { __builtin_memcpy(&receiver, &mem[counter], 3); // receiver &= (0xffffffffffffffffllu >> (64 - ((3) << 3))); total += ((receiver * 0x321654987cbafedllu) >> 48); counter += 3; } printf("=> %llu\n", total); return EXIT_SUCCESS; } The result is (the calculated sum is unreliable since we do not init memory): gcc 3 ... => 81985529216486895 real 0m3.180s user 0m2.822s sys 0m0.328s clang time ./a.out 3 ... 
=> 81985529216486895 real 0m0.972s user 0m0.621s sys 0m0.338s Still 4x faster (In reply to Richard Biener from comment #9) > So with 2 bytes we get > > .L3: > movzwl (%rax), %edx > addq $3, %rax > movw %dx, 8(%rsp) > movq 8(%rsp), %rdx > imulq %rcx, %rdx > shrq $48, %rdx > addq %rdx, %rsi > cmpq %rdi, %rax > jne .L3 > > while with 3 bytes we see > > .L3: > movzwl (%rax), %edx > addq $3, %rax > movw %dx, 8(%rsp) > movzbl -1(%rax), %edx > movb %dl, 10(%rsp) > movq 8(%rsp), %rdx > imulq %rcx, %rdx > shrq $48, %rdx > addq %rdx, %rsi > cmpq %rdi, %rax > jne .L3 > > while clang outputs > > .LBB0_3: # =>This Inner Loop Header: Depth=1 > movzwl (%r14,%rcx), %edx > movzbl 2(%r14,%rcx), %edi > shlq $16, %rdi > orq %rdx, %rdi > andq $-16777216, %rbx # imm = 0xFFFFFFFFFF000000 > orq %rdi, %rbx > movq %rbx, %rdx > imulq %rax, %rdx > shrq $48, %rdx > addq %rdx, %rsi > addq $3, %rcx > cmpq $999999992, %rcx # imm = 0x3B9AC9F8 > jb .LBB0_3 > > that _looks_ slower. Are you sure performance isn't dominated by the > first init loop (both GCC and clang vectorize it). I notice we spill > in the above loop for the bitfield insert where clang uses register > operations. We refuse to inline the memcpy at the GIMPLE level > and further refuse to optimize it to a BIT_INSERT_EXPR which would > be a possibility.