On Monday, 11 June 2018 at 01:03:16 UTC, Mike Franklin wrote:
I've modified the test based on the feedback so far, so here's what it looks like now:

import std.datetime.stopwatch;
import std.stdio;
import core.stdc.string;
import std.random;
import std.algorithm;

enum length = 4096 * 2;

/// Resizes `a` to the module-level `length` and overwrites every element
/// with a freshly drawn random byte.
void init(ref ubyte[] a)
{
    a.length = length;

    // Visit each slot by reference so the random value is stored in place.
    foreach (ref element; a)
    {
        element = uniform!ubyte;
    }
}

/// Checks that `a` and `b` hold exactly the same bytes.
///
/// The comparison uses the slices' own lengths, so differently sized buffers
/// are reported as a mismatch instead of being read out of bounds.
///
/// Throws: Exception if the two slices differ in length or content.
void verifyResults(ubyte[] a, ubyte[] b)
{
    // Deliberately not a plain `assert`: asserts are compiled out with
    // -release, which is how benchmarks are normally built, and the
    // verification would then silently do nothing.
    if (a != b)
    {
        throw new Exception("verifyResults: destination does not match source");
    }
}

/// Copies the contents of `src` into `dst` using D's built-in slice
/// assignment. The language requires both slices to have the same length.
void memcpyD(ubyte[] dst, ubyte[] src)
{
    dst[0 .. $] = src[0 .. $];
}

/// Copies the contents of `src` into `dst` by way of `std.algorithm`'s `copy`.
void memcpyDstdAlg(ubyte[] dst, ubyte[] src)
{
    // UFCS form of copy(src, dst): the source range comes first, the target second.
    src.copy(dst);
}

/// Copies the contents of `src` into the start of `dst` with the C runtime's
/// `memcpy`.
///
/// The byte count is taken from `src` itself rather than from a module-level
/// constant, so the function is correct for buffers of any size.
///
/// Params:
///     dst = destination buffer; must be at least as long as `src`
///     src = bytes to copy
void memcpyC(ubyte[] dst, ubyte[] src)
{
    assert(dst.length >= src.length, "memcpyC: dst is too small for src");

    // Nothing to copy; also avoids handing null pointers to memcpy.
    if (src.length == 0)
    {
        return;
    }

    memcpy(dst.ptr, src.ptr, src.length);
}

/// Copies `src` into the start of `dst` one byte at a time with an indexed
/// loop, as the deliberately unoptimised baseline of the benchmark.
///
/// The loop bound comes from `src` itself rather than from a module-level
/// constant, so the function works for buffers of any size.
///
/// Params:
///     dst = destination buffer; must be at least as long as `src`
///     src = bytes to copy
void memcpyNaive(ubyte[] dst, ubyte[] src)
{
    foreach (i; 0 .. src.length)
    {
        dst[i] = src[i];
    }
}

/// Copies `src` into the start of `dst` with a single x86-64 `rep movsb`.
///
/// The byte count is taken from `src` itself rather than from a module-level
/// constant, so the function is correct for buffers of any size. The
/// implementation uses 64-bit registers and therefore only builds where
/// DMD-style x86-64 inline assembly is available.
///
/// Params:
///     dst = destination buffer; must be at least as long as `src`
///     src = bytes to copy
void memcpyASM(ubyte[] dst, ubyte[] src)
{
    version (D_InlineAsm_X86_64)
    {
        assert(dst.length >= src.length, "memcpyASM: dst is too small for src");

        auto s = src.ptr;
        auto d = dst.ptr;
        size_t len = src.length;
        asm pure nothrow @nogc
        {
            mov RSI, s;     // source pointer
            mov RDI, d;     // destination pointer
            cld;            // clear the direction flag so the copy moves forward
            mov RCX, len;   // number of bytes to copy
            rep;            // repeat the next instruction RCX times
            movsb;          // copy one byte from [RSI] to [RDI]
        }
    }
    else
    {
        static assert(false, "memcpyASM requires x86-64 DMD-style inline assembly");
    }
}

/// Calls `f(dst, src)` exactly `n` times and returns the total elapsed time.
///
/// Params:
///     f   = the copy routine to time
///     dst = destination buffer passed to every call
///     src = source buffer passed to every call
///     n   = number of calls to make
/// Returns: the time the stopwatch reports once the last call has finished.
Duration benchmark(alias f)(ubyte[] dst, ubyte[] src, uint n)
{
    auto timer = StopWatch(AutoStart.yes);

    // Zero the elapsed time right before the loop; the stopwatch keeps running.
    timer.reset();
    for (uint round = 0; round < n; ++round)
    {
        f(dst, src);
    }

    return timer.peek();
}

/// Entry point: first checks that every copy routine reproduces its source
/// buffer, then prints how long each routine takes for `iterations` copies.
void main()
{
    ubyte[] src;
    ubyte[] dst;

    // Refills both buffers with random bytes, runs one copy with `copyFn`,
    // and confirms that the destination now matches the source.
    void check(alias copyFn)()
    {
        init(src);
        init(dst);
        copyFn(dst, src);
        verifyResults(dst, src);
    }

    // verify the integrity of the algorithm
    check!memcpyD();
    check!memcpyDstdAlg();
    check!memcpyC();
    check!memcpyNaive();
    check!memcpyASM();

    // test the performance of the algorithm
    enum iterations = 1000;

    // Times `copyFn` over `iterations` copies and prints the result after `label`.
    void report(alias copyFn)(string label)
    {
        writeln(label, benchmark!copyFn(dst, src, iterations));
    }

    report!memcpyD("memcpyD: ");
    report!memcpyDstdAlg("memcpyDstdAlg: ");
    report!memcpyC("memcpyC: ");
    report!memcpyNaive("memcpyNaive: ");
    report!memcpyASM("memcpyASM: ");
}

The results on my Windows 10 machine (Intel Core i7-6700, 3.4GHz):
memcpyD: 127 ╬╝s and 3 hnsecs
memcpyDstdAlg: 195 ╬╝s and 9 hnsecs
memcpyC: 126 ╬╝s and 7 hnsecs
memcpyNaive: 17 ms, 974 ╬╝s, and 9 hnsecs
memcpyASM: 122 ╬╝s and 8 hnsecs
(Gotta love how the Windows console displays μ: the "╬╝s" above is "μs".)

The results running on Arch Linux 64-bit in a VirtualBox on the same Windows 10 machine:
memcpyD: 409 μs
memcpyDstdAlg: 400 μs
memcpyC: 404 μs and 4 hnsecs
memcpyNaive: 17 ms, 251 μs, and 6 hnsecs
memcpyASM: 162 μs and 8 hnsecs

The results appear more sane now, but it seems the behavior is highly platform-dependent. Still, the ASM version is doing well on my hardware. If I run the test multiple times, I do see a lot of noise in the results, but each test seems to be affected proportionally, so I'm gaining a little more confidence in the benchmark.

I still need to analyze the assembly of C's memcpy (anyone know where I can find the source code?),

- default win32 OMF: https://github.com/DigitalMars/dmc/blob/master/src/core/MEMCCPY.C
- default linux: https://github.com/gcc-mirror/gcc/blob/master/libgcc/memcpy.c
- not used but interesting: https://github.com/esmil/musl/blob/master/src/string/memcpy.c

Reply via email to