On Monday, 11 June 2018 at 01:03:16 UTC, Mike Franklin wrote:
I've modified the test based on the feedback so far, so here's
what it looks like now:
import std.datetime.stopwatch;
import std.stdio;
import core.stdc.string;
import std.random;
import std.algorithm;
// Size in bytes of each test buffer; also the byte count used by the
// copy routines that take a fixed count instead of the slice length.
enum length = 4096 * 2;
/// Resizes `a` to `length` elements and overwrites every element with a
/// freshly drawn random byte.
void init(ref ubyte[] a)
{
    a.length = length;
    // Walk the elements by reference so each slot is assigned in order.
    foreach (ref element; a)
    {
        element = uniform!ubyte;
    }
}
/// Asserts that `a` and `b` contain the same bytes.
///
/// Params:
///     a = first buffer to compare
///     b = second buffer to compare
///
/// Fails the assertion when the buffers differ in length or in content.
void verifyResults(ubyte[] a, ubyte[] b)
{
    // Slice equality compares both the lengths and the contents, so it never
    // reads past the end of either array and it catches a length mismatch
    // that a fixed-size memcmp would miss.
    assert(a == b, "copy verification failed: buffers differ");
}
/// Copies `src` into `dst` with D's built-in array copy.
/// Both slices are copied in full, so their lengths must match.
void memcpyD(ubyte[] dst, ubyte[] src)
{
    dst[0 .. dst.length] = src[0 .. src.length];
}
/// Copies `src` into `dst` using `copy` from `std.algorithm`.
void memcpyDstdAlg(ubyte[] dst, ubyte[] src)
{
    // The remaining-target range returned by copy is not needed here.
    src.copy(dst);
}
/// Copies `length` bytes from `src` to `dst` through the C runtime's memcpy.
/// The byte count is the module-level `length`, not the slices' own lengths.
void memcpyC(ubyte[] dst, ubyte[] src)
{
    void* target = dst.ptr;
    const(void)* source = src.ptr;
    memcpy(target, source, length);
}
/// Copies `length` bytes from `src` to `dst` one element at a time.
/// Serves as the simple byte-by-byte baseline among the copy routines.
void memcpyNaive(ubyte[] dst, ubyte[] src)
{
    foreach (idx; 0 .. length)
    {
        dst[idx] = src[idx];
    }
}
// Copies `length` bytes from `src` to `dst` with the x86-64 `rep movsb`
// string instruction. The byte count comes from the module-level `length`,
// not from the lengths of the slices passed in.
void memcpyASM(ubyte[] dst, ubyte[] src)
{
// Pull the raw pointers and the count into locals so the asm block can
// refer to them by name.
auto s = src.ptr;
auto d = dst.ptr;
size_t len = length;
asm pure nothrow @nogc
{
// RSI = source address, RDI = destination address for movsb.
mov RSI, s;
mov RDI, d;
// Clear the direction flag so the copy walks upward through memory.
cld;
// RCX = number of bytes to move.
mov RCX, len;
// The rep prefix is written as its own statement; it repeats the
// following movsb until RCX reaches zero.
rep;
movsb;
}
}
/// Times `n` back-to-back calls of `f(dst, src)`.
///
/// Params:
///     f   = copy routine under test
///     dst = destination buffer handed to every call
///     src = source buffer handed to every call
///     n   = number of calls to make
///
/// Returns: the stopwatch reading taken after the last call.
Duration benchmark(alias f)(ubyte[] dst, ubyte[] src, uint n)
{
    auto timer = StopWatch(AutoStart.yes);
    timer.reset();
    for (uint pass = 0; pass < n; ++pass)
    {
        f(dst, src);
    }
    return timer.peek();
}
/// Checks each copy routine for correctness on fresh random buffers, then
/// prints how long 1000 calls of each routine take.
void main()
{
    ubyte[] src;
    ubyte[] dst;

    // verify the integrity of the algorithm
    alias Copier = void function(ubyte[], ubyte[]);
    Copier[5] copiers = [
        &memcpyD,
        &memcpyDstdAlg,
        &memcpyC,
        &memcpyNaive,
        &memcpyASM,
    ];
    foreach (copier; copiers)
    {
        // Refill both buffers so a pass cannot succeed on leftovers
        // from the previous routine.
        init(src);
        init(dst);
        copier(dst, src);
        verifyResults(dst, src);
    }

    // test the performance of the algorithm
    enum iterations = 1000;
    writeln("memcpyD: ", benchmark!memcpyD(dst, src, iterations));
    writeln("memcpyDstdAlg: ", benchmark!memcpyDstdAlg(dst, src, iterations));
    writeln("memcpyC: ", benchmark!memcpyC(dst, src, iterations));
    writeln("memcpyNaive: ", benchmark!memcpyNaive(dst, src, iterations));
    writeln("memcpyASM: ", benchmark!memcpyASM(dst, src, iterations));
}
The results on my Windows 10 machine (Intel Core i7-6700,
3.4GHz):
memcpyD: 127 ╬╝s and 3 hnsecs
memcpyDstdAlg: 195 ╬╝s and 9 hnsecs
memcpyC: 126 ╬╝s and 7 hnsecs
memcpyNaive: 17 ms, 974 ╬╝s, and 9 hnsecs
memcpyASM: 122 ╬╝s and 8 hnsecs
(Gotta love how windows displays μ)
The results running on Arch Linux 64-bit in a VirtualBox on the
same Windows 10 machine:
memcpyD: 409 μs
memcpyDstdAlg: 400 μs
memcpyC: 404 μs and 4 hnsecs
memcpyNaive: 17 ms, 251 μs, and 6 hnsecs
memcpyASM: 162 μs and 8 hnsecs
The results appear more sane now, but it seems the behavior is
highly platform-dependent. Still, the ASM version is doing well on my
hardware. If I run the test multiple times, I do see a lot of
noise in the results, but each test seems to be affected
proportionally, so I'm gaining a little more confidence in the
benchmark.
I still need to analyze the assembly of C's memcpy (anyone know
where I can find the source code?),
- default win32 OMF:
https://github.com/DigitalMars/dmc/blob/master/src/core/MEMCCPY.C
- default linux:
https://github.com/gcc-mirror/gcc/blob/master/libgcc/memcpy.c
- not used but interesting:
https://github.com/esmil/musl/blob/master/src/string/memcpy.c