Hey all, I've been experimenting with fixing some low-hanging performance fruit in the ElasticSearch codebase, and came across the fact that the MurmurHash implementation that is used by ByteRef.hashCode() is reading 4 bytes per loop iteration (which is likely an artifact from 32-bit architectures, which are ever-less-important). I made a small fix to change the implementation to read 8 bytes per loop iteration; I expected a very small impact (2-3% CPU or so over an indexing run in ElasticSearch), but got a pretty nontrivial throughput improvement over a few indexing benchmarks.
I tried running Lucene-only benchmarks, and succeeded in running the example from https://github.com/mikemccand/luceneutil - but I couldn't figure out how to run indexing benchmarks and how to interpret the results. Could someone help me in running the benchmarks for the attached patch? Cheers, Thomas
diff --git a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java index 20a8cce420b..db3d8a1796b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/StringHelper.java +++ b/lucene/core/src/java/org/apache/lucene/util/StringHelper.java @@ -152,23 +152,50 @@ public abstract class StringHelper { /** * Returns the MurmurHash3_x86_32 hash. Original source/tests at * https://github.com/yonik/java_util/ + * + * The main loop was modified to load 8 bytes on each loop iteration to save on per-iteration + * overhead; the original version loaded 4 bytes per iteration which was not very sensible + * now that 64-bit architectures are the norm. */ @SuppressWarnings("fallthrough") public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) { - final int c1 = 0xcc9e2d51; final int c2 = 0x1b873593; int h1 = seed; - int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block + int roundedEnd8 = offset + (len & 0xfffffff8); // round down to 8 byte block + int roundedEnd4 = offset + (len & 0xfffffffc); // round down to 4 byte block - for (int i = offset; i < roundedEnd; i += 4) { + for (int i = offset; i < roundedEnd8; i += 8) { // little endian load order - int k1 = (int) BitUtil.VH_LE_INT.get(data, i); + long l1 = (long)BitUtil.VH_LE_LONG.get(data, i); + int k1 = (int)(l1 & 0xffffffff); + int k2 = (int)((l1 & 0xffffffff00000000L) >> 32); + k1 *= c1; k1 = Integer.rotateLeft(k1, 15); k1 *= c2; + k2 *= c1; + k2 = Integer.rotateLeft(k2, 15); + k2 *= c2; + + h1 ^= k1; + + h1 = Integer.rotateLeft(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + + h1 ^= k2; + + h1 = Integer.rotateLeft(h1, 13); + h1 = h1 * 5 + 0xe6546b64; + } + + for (int i = roundedEnd8; i < roundedEnd4; i +=4) { + int k1 = (int) BitUtil.VH_LE_INT.get(data, i); + k1 *= c1; + k1 = Integer.rotateLeft(k1, 15); + k1 *= c2; h1 ^= k1; h1 = Integer.rotateLeft(h1, 13); h1 = h1 * 5 + 0xe6546b64; @@ -179,13 +206,13 @@ public abstract class StringHelper { switch (len & 0x03) { case 3: - k1 = (data[roundedEnd + 2] & 0xff) << 16; + k1 = (data[roundedEnd4 + 2] & 0xff) << 16; // fallthrough case 2: - k1 |= (data[roundedEnd + 1] & 0xff) << 8; + k1 |= (data[roundedEnd4 + 1] & 0xff) << 8; // fallthrough case 1: - k1 |= (data[roundedEnd] & 0xff); + k1 |= (data[roundedEnd4] & 0xff); k1 *= c1; k1 = Integer.rotateLeft(k1, 15); k1 *= c2;
--------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@lucene.apache.org For additional commands, e-mail: dev-h...@lucene.apache.org