On Saturday, 15 October 2016 at 19:42:03 UTC, Uplink_Coder wrote:
On Saturday, 15 October 2016 at 19:07:50 UTC, Patrick Schluter wrote:
At least with the lookup table below, you can detect the invalid lead bytes 192 and 193 and the invalid codes above 244.
__gshared static immutable ubyte[] charWidthTab = [ 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ];
Lengths 5 and 6 need not be tested specifically for your goto.
If you use 0 instead of 1, the invalid codes can be rejected right next to the length check, provided a width of 0 is itself treated as an error (the comparison `str.length < 0` alone is never true).
/**
 * Encoded sequence length for a UTF-8 lead byte, indexed by `leadByte - 0xC0`.
 *
 * An entry of 0 marks a byte that can never start a well-formed sequence:
 * 0xC0 and 0xC1 could only begin overlong encodings, and 0xF5..0xFF would
 * encode values beyond U+10FFFF (or are the obsolete 5/6-byte lead bytes).
 */
__gshared static immutable ubyte[] charWidthTab = [
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 .. 0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 .. 0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 .. 0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 .. 0xFF
];

/**
 * Decodes the first code point of a UTF-8 string without consuming it.
 *
 * Params:
 *     str = UTF-8 encoded text; it is only read, never modified or advanced.
 *
 * Returns:
 *     The code point encoded at the start of `str`.
 *
 * Throws:
 *     Exception if `str` is empty, starts with a continuation byte or an
 *     invalid lead byte, is truncated in the middle of a sequence, contains a
 *     malformed continuation byte, or encodes an overlong form, a surrogate,
 *     or a value above U+10FFFF.
 */
dchar myFront2(ref char[] str) pure
{
    // Reading str.ptr[0] on an empty slice would be an out-of-bounds access.
    if (str.length == 0)
        throw new Exception("yadayada");

    auto c1 = str.ptr[0];

    // Fast path: plain ASCII needs no decoding.
    if (!(c1 & 0x80))
        return c1;

    // 0x80 .. 0xBF is a continuation byte and cannot start a sequence.
    if (!(c1 & 0x40))
        throw new Exception("yadayada");

    immutable uint len = charWidthTab.ptr[c1 - 0xC0];

    // A width of 0 must be rejected explicitly: `str.length < 0` is never
    // true, so the length comparison alone lets invalid lead bytes through.
    if (len == 0 || str.length < len)
        throw new Exception("yadayada");

    // Keep only the payload bits of the lead byte:
    // 5 bits for 2-byte, 4 bits for 3-byte, 3 bits for 4-byte sequences.
    uint acc = c1 & (0x7F >> len);

    foreach (i; 1 .. len)
    {
        immutable uint cc = str.ptr[i];

        // Every trailing byte must look like 10xxxxxx.
        if ((cc & 0xC0) != 0x80)
            throw new Exception("yadayada");

        acc = (acc << 6) | (cc & 0x3F);
    }

    // The lead-byte table already excludes overlong 2-byte forms; the longer
    // forms, surrogates and out-of-range values need a check on the result.
    immutable bool overlong = (len == 3 && acc < 0x800) || (len == 4 && acc < 0x1_0000);
    immutable bool surrogate = acc >= 0xD800 && acc <= 0xDFFF;
    if (overlong || surrogate || acc > 0x10_FFFF)
        throw new Exception("yadayada");

    return cast(dchar) acc;
}

// Author's note from the original post: "This code proves to be the fastest so
// far, on UTF and non-UTF text. It's also fairly small." (That measurement was
// taken on the version that did not mask or validate the decoded bytes.)