I've modified the test based on the feedback so far, so here's what it looks like now:

import std.datetime.stopwatch;
import std.stdio;
import core.stdc.string;
import std.random;
import std.algorithm;

// Buffer size in bytes used throughout this test (4096 * 2 = 8192).
enum length = 4096 * 2;

/// Resizes `a` to `length` elements and overwrites every element with a
/// freshly drawn random `ubyte`.
void init(ref ubyte[] a)
{
    a.length = length;

    // Elements are visited front to back, one random draw per element.
    foreach (ref element; a)
    {
        element = uniform!ubyte;
    }
}

/**
 * Checks that two buffers have the same length and identical contents.
 *
 * Uses `enforce` rather than `assert` so the check is still performed when
 * the program is compiled with `-release` (where `assert` is removed).
 *
 * Params:
 *     a = first buffer
 *     b = second buffer
 *
 * Throws: `Exception` if the lengths or the contents differ.
 */
void verifyResults(ubyte[] a, ubyte[] b)
{
    import std.exception : enforce;

    enforce(a.length == b.length, "verifyResults: buffer lengths differ");

    // Skip memcmp for empty buffers so it is never handed a null pointer.
    enforce(a.length == 0 || memcmp(a.ptr, b.ptr, a.length) == 0,
            "verifyResults: buffer contents differ");
}

/// Copies `src` into `dst` with D's built-in slice assignment.
void memcpyD(ubyte[] dst, ubyte[] src)
{
    dst[0 .. $] = src[0 .. $];
}

/// Copies `src` into `dst` using `copy` from `std.algorithm`.
void memcpyDstdAlg(ubyte[] dst, ubyte[] src)
{
    src.copy(dst);
}

/**
 * Copies `src` into the front of `dst` using C's `memcpy`.
 *
 * The number of bytes copied is `src.length`, so the routine works for
 * buffers of any size instead of only the module-wide test size.
 *
 * Params:
 *     dst = destination buffer; must be at least as long as `src`
 *     src = source buffer
 */
void memcpyC(ubyte[] dst, ubyte[] src)
{
    assert(dst.length >= src.length, "memcpyC: destination is too small");
    memcpy(dst.ptr, src.ptr, src.length);
}

/**
 * Copies `src` into the front of `dst` one byte at a time.
 *
 * The loop runs over `src.length` elements, so the routine works for buffers
 * of any size instead of only the module-wide test size.
 *
 * Params:
 *     dst = destination buffer; must be at least as long as `src`
 *     src = source buffer
 */
void memcpyNaive(ubyte[] dst, ubyte[] src)
{
    assert(dst.length >= src.length, "memcpyNaive: destination is too small");

    // The index is a size_t, so it cannot overflow for large buffers.
    foreach (i; 0 .. src.length)
    {
        dst[i] = src[i];
    }
}

// Copies `length` bytes (the module-level constant, not the slice lengths)
// from `src` to `dst` using the x86-64 `rep movsb` string instruction.
void memcpyASM(ubyte[] dst, ubyte[] src)
{
    // Pull the raw pointers and the byte count into locals so the asm block
    // can refer to them by name.
    auto s = src.ptr;
    auto d = dst.ptr;
    size_t len = length;
    asm pure nothrow @nogc
    {
        mov RSI, s;     // RSI = source address
        mov RDI, d;     // RDI = destination address
        cld;            // clear the direction flag: addresses advance upward
        mov RCX, len;   // RCX = number of bytes to copy
        rep;            // repeat the next instruction RCX times
        movsb;          // copy one byte from [RSI] to [RDI]
    }
}

/**
 * Measures how long `n` consecutive calls of `f(dst, src)` take.
 *
 * Params:
 *     f   = the copy routine under test
 *     dst = destination buffer passed to every call
 *     src = source buffer passed to every call
 *     n   = number of calls to make
 *
 * Returns: the elapsed time reported by the stopwatch after the last call.
 */
Duration benchmark(alias f)(ubyte[] dst, ubyte[] src, uint n)
{
    auto timer = StopWatch(AutoStart.yes);
    timer.reset();

    for (uint run = 0; run < n; ++run)
    {
        f(dst, src);
    }

    return timer.peek();
}

/**
 * Entry point: first checks that every copy routine produces a correct copy,
 * then times each routine and prints one result line per routine.
 */
void main()
{
    version (Windows)
    {
        // Durations are printed with a 'μ' character. Switch the console to
        // UTF-8 so it is displayed correctly, and put the previous code page
        // back (after flushing pending output) when main returns.
        import core.sys.windows.wincon : GetConsoleOutputCP, SetConsoleOutputCP;

        immutable previousCodePage = GetConsoleOutputCP();
        SetConsoleOutputCP(65001); // 65001 is the UTF-8 code page
        scope (exit)
        {
            stdout.flush();
            SetConsoleOutputCP(previousCodePage);
        }
    }

    ubyte[] src;
    ubyte[] dst;

    // Re-randomizes both buffers, runs one routine once, and checks that the
    // destination now matches the source.
    void verify(alias impl)()
    {
        init(src);
        init(dst);
        impl(dst, src);
        verifyResults(dst, src);
    }

    // verify the integrity of the algorithm
    verify!memcpyD();
    verify!memcpyDstdAlg();
    verify!memcpyC();
    verify!memcpyNaive();
    verify!memcpyASM();

    // test the performance of the algorithm
    enum iterations = 1000;

    // Times one routine and prints "<name>: <duration>".
    void report(alias impl)(string name)
    {
        writeln(name, ": ", benchmark!impl(dst, src, iterations));
    }

    report!memcpyD("memcpyD");
    report!memcpyDstdAlg("memcpyDstdAlg");
    report!memcpyC("memcpyC");
    report!memcpyNaive("memcpyNaive");
    report!memcpyASM("memcpyASM");
}

The results on my Windows 10 machine (Intel Core i7-6700, 3.4GHz):
memcpyD: 127 ╬╝s and 3 hnsecs
memcpyDstdAlg: 195 ╬╝s and 9 hnsecs
memcpyC: 126 ╬╝s and 7 hnsecs
memcpyNaive: 17 ms, 974 ╬╝s, and 9 hnsecs
memcpyASM: 122 ╬╝s and 8 hnsecs
(Gotta love how Windows displays μ)

The results running on Arch Linux 64-bit in a VirtualBox on the same Windows 10 machine:
memcpyD: 409 μs
memcpyDstdAlg: 400 μs
memcpyC: 404 μs and 4 hnsecs
memcpyNaive: 17 ms, 251 μs, and 6 hnsecs
memcpyASM: 162 μs and 8 hnsecs

The results appear more sane now, but it seems the behavior is highly platform-dependent. Still, the ASM version is doing well on my hardware. If I run the test multiple times, I do see a lot of noise in the results, but each test seems to be affected proportionally, so I'm gaining a little more confidence in the benchmark.

I still need to analyze the assembly of C's memcpy (anyone know where I can find the source code?), test on more platforms, and test varying sizes, but I'm just collecting some initial data right now, to learn how to proceed.

I'd be interested in hearing results from those on other platforms and hardware, and of course in suggestions for how to meet or beat C's memcpy with a pure D implementation.

Thanks for all the feedback so far.

Mike

Reply via email to