On Friday, 30 October 2015 at 21:33:25 UTC, Andrei Alexandrescu wrote:
Could you please take a look at GCC's generated code and implementation of memchr? -- Andrei

So I did. I rewrote the code to do the main work in cacheLineSize chunks,
which is what the GLIBC version does.
The main loop now looks like this:

-----
    do
    {
        // ptr16 is aligned 64
        ubyte16 r1 = __builtin_ia32_pcmpeqb128(ptr16[0], niddles);
        ubyte16 r2 = __builtin_ia32_pcmpeqb128(ptr16[1], niddles);
        ubyte16 r3 = __builtin_ia32_pcmpeqb128(ptr16[2], niddles);
        ubyte16 r4 = __builtin_ia32_pcmpeqb128(ptr16[3], niddles);

        r3 = __builtin_ia32_pmaxub128(r1, r3);
        r4 = __builtin_ia32_pmaxub128(r2, r4);
        r4 = __builtin_ia32_pmaxub128(r3, r4);
        mask = __builtin_ia32_pmovmskb128(r4);

        if (mask != 0)
        {
            mask = __builtin_ia32_pmovmskb128(r1);
            mixin(CheckMask); // Check and return value

            ++ptr16; num -= 16;
            mask = __builtin_ia32_pmovmskb128(r2);
            mixin(CheckMask);

            ++ptr16; num -= 16;
            r3 = __builtin_ia32_pcmpeqb128(*ptr16, niddles);
            mask = __builtin_ia32_pmovmskb128(r3);
            mixin(CheckMask);

            ++ptr16; num -= 16;
            r4 = __builtin_ia32_pcmpeqb128(*ptr16, niddles);
            mask = __builtin_ia32_pmovmskb128(r4);
            mixin(CheckMask);
        }

        num -= 64;
        ptr16 += 4;
    }
    while (num > 0);
-----

And here is my best result (first column: time relative to the C version; second column: raw TickDuration):

-----
Naive:        21.46     TickDuration(132842482)
SIMD:         1.161     TickDuration(7188211)
(was)SIMD:     3.04     TickDuration(18920182)
C:                1     TickDuration(6189222)

Reply via email to