Robert Jacques: > I've spent some time having fun this afternoon optimizing array-equals > using vectorization techniques. I found that vectorizing using ulongs > worked best on my system except with the shortest strings, where a simple > Duff's device edged it out. If you'd like to try it out on your data set:
Thank you for your work :-) A version with your function, D version #8: // D version #8 import std.file: read; import std.c.stdio: printf; import std.exception: assumeUnique; bool arrayComp(bool useBitCompare=true, T) (const T[] a, const T[] b) pure nothrow { if (a.length != b.length) return false; static if (useBitCompare) { auto pab = cast(ubyte*)a.ptr; auto pbb = cast(ubyte*)b.ptr; if (pab is pbb) return true; auto byte_length = a.length * T.sizeof; auto pa_end = cast(ulong*)(pab + byte_length); final switch (byte_length % ulong.sizeof) { case 7: if (*pab++ != *pbb++) return false; case 6: if (*pab++ != *pbb++) return false; case 5: if (*pab++ != *pbb++) return false; case 4: if (*pab++ != *pbb++) return false; case 3: if (*pab++ != *pbb++) return false; case 2: if (*pab++ != *pbb++) return false; case 1: if (*pab++ != *pbb++) return false; case 0: } auto pa = cast(ulong*)pab; auto pb = cast(ulong*)pbb; while (pa < pa_end) { if (*pa++ != *pb++) return false; } } else { // default to a short duff's device auto pa = a.ptr; auto pb = b.ptr; if (pa == pb) return true; auto n = (a.length + 3) / 4; final switch (a.length % 4) { case 0: do { if (*pa++ != *pb++) return false; case 3: if (*pa++ != *pb++) return false; case 2: if (*pa++ != *pb++) return false; case 1: if (*pa++ != *pb++) return false; } while (--n > 0); } } return true; } int test(string data) { int count; foreach (i; 0 .. data.length - 3) { auto codon = data[i .. i + 3]; if (arrayComp(codon, "TAG") || arrayComp(codon, "TGA") || arrayComp(codon, "TAA")) count++; } return count; } void main() { char[] data0 = cast(char[])read("data.txt"); int n = 300; char[] data = new char[data0.length * n]; for (size_t pos; pos < data.length; pos += data0.length) data[pos .. pos+data0.length] = data0; string sdata = assumeUnique(data); printf("%d\n", test(sdata)); } Timings, dmd compiler, best of 4, seconds: D #1: 5.72 D #4: 1.84 D #5: 1.73 Psy: 1.59 D #8: 1.51 D #7: 0.56 (like #6 without length comparisons) D #2: 0.55 D #6: 0.47 D #3: 0.34 Your function can't be inlined because it's big, so this code isn't faster than inlined code like this generated by the compiler: (codon.length == 3 && codon[0] == 'T' && codon[1] == 'A' && codon[2] == 'G') Bye, bearophile