I was looking at http://d.puremagic.com/issues/show_bug.cgi?id=4725 and thought I'd experiment a bit with a generic unrolling routine (code at the end of this). So I wrote a general parameterized unroll function that accepts a function, a number of times to unroll, and a random-access range. The idea was to experiment with unroll and see how it works for sums. I also wrote two baselines - a brute force one and one that unrolls but is specialized for addition.

The results are interesting. First, partial unrolling does help a lot - the brute force baseline takes 1.2s on my machine and unrollSum takes only 0.2s.

Second, there is an issue with inlining: unroll takes 0.37s, almost twice as much as the hand-unrolled version.

Third, it looks like larger unrolling limits is better - I only got a plateau at 128!


Andrei

#!/usr/bin/env rdmd
import std.algorithm, std.conv, std.date, std.functional, std.range, std.stdio, std.string, std.traits;

ElementType!R unroll(alias fun, size_t times, R)(R r) if (isRandomAccessRange!R && hasLength!R)
{
    static assert(times > 1 && !(times & (times - 1)));

    static string repeat(string s, string indexPlaceholder, size_t n)
    {
        string r = "";
        foreach (i; 0 .. n)
        {
            r ~= replace(s, indexPlaceholder, .to!string(i));
        }
        return r;
    }

    alias binaryFun!fun f;
    typeof(return) result = 0;
    immutable adjusted = r.length & ~cast(size_t) (times - 1);
    size_t i = 0;
    for (; i < adjusted; i += times)
    {
        ElementType!R[times - 1] temp = void;
mixin(repeat("temp[X] = f(r[i + (X + X)], r[i + (X + X + 1)]);", "X", times / 2)); mixin(repeat("temp[X + times / 2] = f(temp[X + X], temp[X + X + 1]);", "X", times / 2 - 1));
        result = f(result, temp[$ - 1]);
    }
    for (; i != r.length; ++i)
    {
        result += r[i];
    }
    return result;
}

ElementType!R unrollSum(size_t times, R)(R r) if (isRandomAccessRange!R && hasLength!R)
{
    static assert(times > 1 && !(times & (times - 1)));

    static string repeat(string s, string indexPlaceholder, size_t n)
    {
        string r = "";
        foreach (i; 0 .. n)
        {
            r ~= replace(s, indexPlaceholder, .to!string(i));
        }
        return r;
    }

    typeof(return) result = 0;
    immutable adjusted = r.length & ~cast(size_t) (times - 1);
    size_t i = 0;
    for (; i < adjusted; i += times)
    {
        ElementType!R[times - 1] temp = void;
mixin(repeat("temp[X] = r[i + (X + X)] + r[i + (X + X + 1)];", "X", times / 2)); mixin(repeat("temp[X + times / 2] = temp[X + X] + temp[X + X + 1];", "X", times / 2 - 1));
        result += temp[$ - 1];
    }
    for (; i != r.length; ++i)
    {
        result += r[i];
    }
    return result;
}

ElementType!R sum0(R)(R r) if (isInputRange!R)
{
    typeof(return) result = 0;
    for (; !r.empty; r.popFront()) {
        result += r.front;
    }
    return result;
}

void main(string[] args)
{
    enum times = 8;
    auto a = array(iota(100_000_001));
    double t = getUTCtime();
    if (args.length == 1) {
        writeln("baseline");
        writeln(sum0(a));
    } else if (args[1] == "unroll") {
        writeln(args[1]);
        writeln(unroll!("a + b", times)(a));
    } else if (args[1] == "unrollSum") {
        writeln(args[1]);
        writeln(unrollSum!(times)(a));
    } else {
        assert(0);
    }
    writeln((getUTCtime() - t) / ticksPerSecond);
}

Reply via email to