On Thursday, 27 February 2020 at 04:44:56 UTC, Basile B. wrote:
On Thursday, 27 February 2020 at 03:58:15 UTC, Bruce Carneal wrote:

Maybe you talked about another implementation of decimalLength9 ?

Yes.  It's one I wrote after I saw your post. Psuedo-code here:

auto d9_branchless(uint v) { return 1 + (v >= 10) + (v >= 100) ... }

Using ldc to target an x86 with the above yields a series of cmpl, seta instruction pairs in the function body followed by a summing and a return. No branching.

Let me know if the above is unclear or insufficient.

No thanks, it's crystal clear now.

although I don't see the performance gain. Now for me an hybrid version based on a LUT and on the branchless one is the fatest (decimalLength9_5), although still slowest then the original.

updated program, incl your branchless version (decimalLength9_4):

---
#!ldmd -boundscheck=off -release -inline -O -mcpu=native -mattr=+sse2,+sse3,+sse4.1,+sse4.2,+fast-lzcnt,+avx,+avx2,+cmov,+bmi,+bmi2
import core.memory;
import core.bitop;
import std.stdio;
import std.range;
import std.algorithm;
import std.getopt;
import std.random;

ubyte decimalLength9_0(const uint v)
{
    if (v >= 100000000) { return 9; }
    if (v >= 10000000) { return 8; }
    if (v >= 1000000) { return 7; }
    if (v >= 100000) { return 6; }
    if (v >= 10000) { return 5; }
    if (v >= 1000) { return 4; }
    if (v >= 100) { return 3; }
    if (v >= 10) { return 2; }
    return 1;
}

ubyte decimalLength9_1(const uint v) pure nothrow
{
    if (v == 0) // BSR and LZCNT UB when input is 0
        return 1;
    const ubyte lzc = cast(ubyte) bsr(v);
    ubyte result;
    switch (lzc)
    {
        case 0 :
        case 1 :
        case 2 : result =  1; break;
        case 3 : result =  v >= 10 ? 2 : 1; break;
        case 4 :
        case 5 : result =  2; break;
        case 6 : result =  v >= 100 ? 3 : 2; break;
        case 7 :
        case 8 : result =  3; break;
        case 9 : result =  v >= 1000 ? 4 : 3; break;
        case 10:
        case 11:
        case 12: result =  4; break;
        case 13: result =  v >= 10000 ? 5 : 4; break;
        case 14:
        case 15: result =  5; break;
        case 16: result =  v >= 100000 ? 6 : 5; break;
        case 17:
        case 18: result =  6; break;
        case 19: result =  v >= 1000000 ? 7 : 6; break;
        case 20:
        case 21:
        case 22: result =  7; break;
        case 23: result =  v >= 10000000 ? 8 : 7; break;
        case 24:
        case 25: result =  8; break;
        case 26: result =  v >= 100000000 ? 9 : 8; break;
        case 27:
        case 28:
        case 29:
        case 30:
        case 31: result =  9; break;
        default: assert(false);
    }
    return result;
}

private ubyte decimalLength9_2(const uint v) pure nothrow
{
    if (v == 0) // BSR and LZCNT UB when input is 0
        return 1;
    const ubyte lzc = cast(ubyte) bsr(v);
    static immutable pure nothrow ubyte function(uint)[32] tbl =
    [
    0 : (uint a) => ubyte(1),
    1 : (uint a) => ubyte(1),
    2 : (uint a) => ubyte(1),
    3 : (uint a) => a >= 10  ? ubyte(2) : ubyte(1),
    4 : (uint a) => ubyte(2),
    5 : (uint a) => ubyte(2),
    6 : (uint a) => a >= 100  ? ubyte(3) : ubyte(2),
    7 : (uint a) => ubyte(3),
    8 : (uint a) => ubyte(3),
    9 : (uint a) => a >= 1000  ? ubyte(4) : ubyte(3),
    10: (uint a) => ubyte(4),
    11: (uint a) => ubyte(4),
    12: (uint a) => ubyte(4),
    13: (uint a) => a >= 10000  ? ubyte(5) : ubyte(4),
    14: (uint a) => ubyte(5),
    15: (uint a) => ubyte(5),
    16: (uint a) => a >= 100000  ? ubyte(6) : ubyte(5),
    17: (uint a) => ubyte(6),
    18: (uint a) => ubyte(6),
    19: (uint a) => a >= 1000000  ? ubyte(7) : ubyte(6),
    20: (uint a) => ubyte(7),
    21: (uint a) => ubyte(7),
    22: (uint a) => ubyte(7),
    23: (uint a) => a >= 10000000  ? ubyte(8) : ubyte(7),
    24: (uint a) => ubyte(8),
    25: (uint a) => ubyte(8),
    26: (uint a) => a >= 100000000  ? ubyte(9) : ubyte(8),
    27: (uint a) => ubyte(9),
    28: (uint a) => ubyte(9),
    29: (uint a) => ubyte(9),
    30: (uint a) => ubyte(9),
    31: (uint a) => ubyte(9),
    ];
    return tbl[lzc](v);
}

ubyte decimalLength9_3(const uint v) pure nothrow
{
    if (v == 0) // BSR and LZCNT UB when input is 0
        return 1;
    ubyte result;
    enum ubyte doSwitch = ubyte(0);
    const ubyte lzc = cast(ubyte) bsr(v);
    const ubyte[32] decimalLength9LUT =
    [
    0 : ubyte(1),
    1 : ubyte(1),
    2 : ubyte(1),
    3 : doSwitch,
    4 : ubyte(2),
    5 : ubyte(2),
    6 : doSwitch,
    7 : ubyte(3),
    8 : ubyte(3),
    9 : doSwitch,
    10: ubyte(4),
    11: ubyte(4),
    12: ubyte(4),
    13: doSwitch,
    14: ubyte(5),
    15: ubyte(5),
    16: doSwitch,
    17: ubyte(6),
    18: ubyte(6),
    19: doSwitch,
    20: ubyte(7),
    21: ubyte(7),
    22: ubyte(7),
    23: doSwitch,
    24: ubyte(8),
    25: ubyte(8),
    26: doSwitch,
    27: ubyte(9),
    28: ubyte(9),
    29: ubyte(9),
    30: ubyte(9),
    31: ubyte(9),
    ];
    ubyte resultOrSelector = decimalLength9LUT[lzc];
    if (resultOrSelector != doSwitch)
        result = resultOrSelector;
    else switch (lzc)
    {
        case 3 : result =  v >= 10 ? 2 : 1; break;
        case 6 : result =  v >= 100 ? 3 : 2; break;
        case 9 : result =  v >= 1000 ? 4 : 3; break;
        case 13: result =  v >= 10000 ? 5 : 4; break;
        case 16: result =  v >= 100000 ? 6 : 5; break;
        case 19: result =  v >= 1000000 ? 7 : 6; break;
        case 23: result =  v >= 10000000 ? 8 : 7; break;
        case 26: result =  v >= 100000000 ? 9 : 8; break;
        default: assert(false);
    }
    return result;
}

ubyte decimalLength9_4(const uint v) pure nothrow
{
    return 1 +  (v >= 10) +
                (v >= 100) +
                (v >= 1000) +
                (v >= 10000) +
                (v >= 100000) +
                (v >= 1000000) +
                (v >= 10000000) +
                (v >= 100000000) ;
}

ubyte decimalLength9_5(const uint v) pure nothrow
{
    if (v == 0) // BSR and LZCNT UB when input is 0
        return 1;
    ubyte result;
    enum ubyte doBranchlessWay = ubyte(0);
    const ubyte lzc = cast(ubyte) bsr(v);
    const ubyte[32] decimalLength9LUT =
    [
    0 : ubyte(1),
    1 : ubyte(1),
    2 : ubyte(1),
    3 : doBranchlessWay,
    4 : ubyte(2),
    5 : ubyte(2),
    6 : doBranchlessWay,
    7 : ubyte(3),
    8 : ubyte(3),
    9 : doBranchlessWay,
    10: ubyte(4),
    11: ubyte(4),
    12: ubyte(4),
    13: doBranchlessWay,
    14: ubyte(5),
    15: ubyte(5),
    16: doBranchlessWay,
    17: ubyte(6),
    18: ubyte(6),
    19: doBranchlessWay,
    20: ubyte(7),
    21: ubyte(7),
    22: ubyte(7),
    23: doBranchlessWay,
    24: ubyte(8),
    25: ubyte(8),
    26: doBranchlessWay,
    27: ubyte(9),
    28: ubyte(9),
    29: ubyte(9),
    30: ubyte(9),
    31: ubyte(9),
    ];
    ubyte resultOrSelector = decimalLength9LUT[lzc];
    if (resultOrSelector != doBranchlessWay)
        result = resultOrSelector;
    else
    {
    result = 1 + (v >= 10) +
                 (v >= 100) +
                 (v >= 1000) +
                 (v >= 10000) +
                 (v >= 100000) +
                 (v >= 1000000) +
                 (v >= 10000000) +
                 (v >= 100000000) ;
    }
    return result;
}

void main(string[] args)
{
    uint    sum;
    ulong   count;
    uint    seed;
    ubyte   func;

    GC.disable;
getopt(args, config.passThrough, "c|count", &count, "f|func", &func, "s|seed", &seed); static const funcs = [&decimalLength9_0, &decimalLength9_1, &decimalLength9_2, &decimalLength9_3, &decimalLength9_4, &decimalLength9_5];

    rndGen.seed(seed);
    foreach (const ulong i; 0 .. count)
    {
        sum += funcs[func](rndGen.front());
        rndGen.popFront();
    }
    writeln(sum);
}
---

Could you share your benchmarking method maybe ?

Reply via email to