On Saturday, 15 October 2016 at 19:42:03 UTC, Uplink_Coder wrote:
On Saturday, 15 October 2016 at 19:07:50 UTC, Patrick Schluter wrote:
At least with the lookup table below, you can detect the invalid overlong lead bytes (192 and 193) and invalid codes (above 244).

// UTF-8 sequence width keyed by lead byte. The table has 64 entries, one per
// byte value 192..255, so it is meant to be indexed with (lead byte - 192).
// Entries of 2, 3 and 4 are the total number of bytes in the sequence.
// An entry of 1 flags a byte that cannot start a multi-byte sequence:
// 192 and 193, and every value above 244.
__gshared static immutable ubyte[] charWidthTab = [
            // 192..207: 192 and 193 are flagged, the rest start 2-byte sequences
            1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            // 208..223: 2-byte sequences
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            // 224..239: 3-byte sequences
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            // 240..244: 4-byte sequences; 245..255 are flagged
            4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
];

Lengths 5 and 6 need not be tested specifically for your goto.

If you use 0 instead of 1 the length check will suffice for throwing on invalid.

/// UTF-8 sequence width keyed by lead byte, indexed with `leadByte - 0xC0`.
///
/// Entries of 2, 3 and 4 are the total number of bytes in the sequence.
/// An entry of 0 marks a byte that can never start a valid sequence:
/// 0xC0 and 0xC1 (they could only begin overlong two-byte forms) and
/// 0xF5 .. 0xFF (they would encode values beyond U+10FFFF or are unused).
__gshared static immutable ubyte[] charWidthTab = [
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 .. 0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 .. 0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 .. 0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 0xF0 .. 0xFF
];

/**
 * Decodes the first UTF-8 code point of `str` without consuming it.
 *
 * Params:
 *     str = UTF-8 encoded text; must not be empty (checked with `assert`).
 *
 * Returns: the code point encoded by the leading 1 to 4 bytes of `str`.
 *
 * Throws: `Exception` when the first byte is a continuation byte
 *     (0x80 .. 0xBF), when it is a byte `charWidthTab` marks as invalid,
 *     or when `str` is shorter than the sequence the lead byte announces.
 *     Malformed continuation bytes are only rejected in `debug` builds,
 *     as in the original code.
 */
dchar myFront2(ref char[] str) pure
{
    assert(str.length > 0, "myFront2: empty input");

    immutable c1 = str.ptr[0];

    // 0xxxxxxx: plain ASCII, the byte is the code point.
    if (!(c1 & 0x80))
        return c1;

    // 10xxxxxx: a continuation byte cannot start a sequence.
    if (!(c1 & 0x40))
        throw new Exception("yadayada");

    // Widths below 2 are the table's "invalid lead byte" markers. Checking
    // for them here also keeps the loop below from running with a bogus count.
    immutable uint width = charWidthTab.ptr[c1 - 0xC0];
    if (width < 2 || str.length < width)
        throw new Exception("yadayada");

    // Keep only the payload bits of the lead byte:
    // 5 bits for 2-byte, 4 bits for 3-byte, 3 bits for 4-byte sequences.
    dchar c = cast(dchar)(c1 & (0x7F >> width));

    // Start at index 1: the lead byte has already been consumed above.
    foreach (i; 1 .. width)
    {
        immutable cc = str.ptr[i];
        // Every trailing byte must look like 10xxxxxx.
        debug if ((cc & 0xC0) != 0x80)
            throw new Exception("yadayada");
        c <<= 6;
        c |= cc & 0x3F; // strip the 10 marker, keep the 6 payload bits
    }
    return c;
}

This code proves to be the fastest so far.
On UTF and non-UTF text.
It's also fairly small.

Reply via email to