I looked at the disassembly of memcpy(), and the NetBSD version looks far more complicated. I don't know anything about x86 assembly, but maybe the clue is somewhere in here:
NetBSD (gcc 5.5.0):

Dump of assembler code for function memcpy:
=> 0x00007f7e5940b980 <+0>: mov %rdx,%rcx
   0x00007f7e5940b983 <+3>: mov %rdi,%rax
   0x00007f7e5940b986 <+6>: mov %rdi,%r11
   0x00007f7e5940b989 <+9>: shr $0x3,%rcx
   0x00007f7e5940b98d <+13>: je 0x7f7e5940b9cc <memcpy+76>
   0x00007f7e5940b98f <+15>: lea -0x8(%rdi,%rdx,1),%r9
   0x00007f7e5940b994 <+20>: mov -0x8(%rsi,%rdx,1),%r10
   0x00007f7e5940b999 <+25>: and $0x7,%r11
   0x00007f7e5940b99d <+29>: jne 0x7f7e5940b9a6 <memcpy+38>
   0x00007f7e5940b99f <+31>: rep movsq %ds:(%rsi),%es:(%rdi)
   0x00007f7e5940b9a2 <+34>: mov %r10,(%r9)
   0x00007f7e5940b9a5 <+37>: retq
   0x00007f7e5940b9a6 <+38>: lea -0x9(%r11,%rdx,1),%rcx
   0x00007f7e5940b9ab <+43>: neg %r11
   0x00007f7e5940b9ae <+46>: mov (%rsi),%rdx
   0x00007f7e5940b9b1 <+49>: mov %rdi,%r8
   0x00007f7e5940b9b4 <+52>: lea 0x8(%rsi,%r11,1),%rsi
   0x00007f7e5940b9b9 <+57>: lea 0x8(%rdi,%r11,1),%rdi
   0x00007f7e5940b9be <+62>: shr $0x3,%rcx
   0x00007f7e5940b9c2 <+66>: rep movsq %ds:(%rsi),%es:(%rdi)
   0x00007f7e5940b9c5 <+69>: mov %rdx,(%r8)
   0x00007f7e5940b9c8 <+72>: mov %r10,(%r9)
   0x00007f7e5940b9cb <+75>: retq
   0x00007f7e5940b9cc <+76>: mov %rdx,%rcx
   0x00007f7e5940b9cf <+79>: rep movsb %ds:(%rsi),%es:(%rdi)
   0x00007f7e5940b9d1 <+81>: retq
End of assembler dump.

Linux (gcc 6.3.0):

Dump of assembler code for function memcpy:
=> 0x00007ffff78a0e90 <+0>: mov %rdi,%rax
   0x00007ffff78a0e93 <+3>: cmp $0x10,%rdx
   0x00007ffff78a0e97 <+7>: jb 0x7ffff78a0f77
   0x00007ffff78a0e9d <+13>: cmp $0x20,%rdx
   0x00007ffff78a0ea1 <+17>: ja 0x7ffff78a0fc6
   0x00007ffff78a0ea7 <+23>: movups (%rsi),%xmm0
   0x00007ffff78a0eaa <+26>: movups -0x10(%rsi,%rdx,1),%xmm1
   0x00007ffff78a0eaf <+31>: movups %xmm0,(%rdi)
   0x00007ffff78a0eb2 <+34>: movups %xmm1,-0x10(%rdi,%rdx,1)
   0x00007ffff78a0eb7 <+39>: retq
End of assembler dump.