This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-codec.git
commit b5ce7fae5bb6794d41216756f080a087cb956e90 Author: aherbert <[email protected]> AuthorDate: Thu Nov 21 14:04:30 2019 +0000 Fully documented MurmurHash3. This attempts no code changes. The highly modified class passes the current tests. mvn clirr:clirr report shows no changes. Default mvn goal passes: mvn clean verify apache-rat:check clirr:check javadoc:javadoc - Documented all public constants. - Documented IncrementalHash32 class. - Added <p> tags to paragraphs. - Renamed input variables for consistency across all method. - Renamed internal variables in the hash32, 64, 128 methods so the similarities in the methods are obvious. - prefix joavadoc @params and @returns with 'The' consistently - Added code examples for the methods that hash primitives for the equivalent using a ByteBuffer to encode the bytes. - Fix links in the class header. - Documented sign extension bugs in hash32 and hash128. - Documented hash64 is not from the reference code and does not match part of hash128. - Documented hash64 as may be released in a future version. - Refactored combination of bytes to little endian int/long into methods. --- .../apache/commons/codec/digest/MurmurHash3.java | 807 +++++++++++++++------ 1 file changed, 568 insertions(+), 239 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java index 6a276f9..95ab8b1 100644 --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java @@ -20,28 +20,46 @@ package org.apache.commons.codec.digest; /** * MurmurHash3 yields a 32-bit or 128-bit value. * - * MurmurHash is a non-cryptographic hash function suitable for general + * <p>MurmurHash is a non-cryptographic hash function suitable for general * hash-based lookup. The name comes from two basic operations, multiply (MU) * and rotate (R), used in its inner loop. Unlike cryptographic hash functions, * it is not specifically designed to be difficult to reverse by an adversary, - * making it unsuitable for cryptographic purposes. + * making it unsuitable for cryptographic purposes.</p> * - * 32-bit Java port of - * https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 - * 128-bit Java port of - * https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 + * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} + * and the 128-bit hash function {@code MurmurHash3_x64_128} from Austin Applyby's + * original {@code c++} code in SMHasher.</p> * - * This is a public domain code with no copyrights. From homepage of MurmurHash - * (https://code.google.com/p/smhasher/), "All MurmurHash versions are public - * domain software, and the author disclaims all copyright to their code." + * <p>This is public domain code with no copyrights. From homepage of + * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p> * - * Copied from Apache Hive: - * https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java + * <blockquote> + * "All MurmurHash versions are public domain software, and the author + * disclaims all copyright to their code." + * </blockquote> + * + * <p>Original adaption from Apache Hive. That adaption contains a {@code hash64} method + * that is not part of the original MurmurHash3 code.<p> * * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> + * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> + * Original MurmurHash3 c++ code</a> + * @see <a href="https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java"> + * Apache Hive Murmer3</a> * @since 1.13 */ public final class MurmurHash3 { + /** + * A random number to use for a hash code. + */ + public static final long NULL_HASHCODE = 2862933555777941757L; + + /** + * A default seed to use for the murmur hash algorithm. + * Has the value {@code 104729}. + * + */ + public static final int DEFAULT_SEED = 104729; /** TODO Replace on Java 8 with Long.BYTES. */ static final int LONG_BYTES = Long.SIZE / Byte.SIZE; @@ -52,10 +70,7 @@ public final class MurmurHash3 { /** TODO Replace on Java 8 with Short.BYTES. */ static final int SHORT_BYTES = Short.SIZE / Byte.SIZE; - // from 64-bit linear congruential generator - public static final long NULL_HASHCODE = 2862933555777941757L; - - // Constants for 32 bit variant + // Constants for 32-bit variant private static final int C1_32 = 0xcc9e2d51; private static final int C2_32 = 0x1b873593; private static final int R1_32 = 15; @@ -63,7 +78,7 @@ public final class MurmurHash3 { private static final int M_32 = 5; private static final int N_32 = 0xe6546b64; - // Constants for 128 bit variant + // Constants for 128-bit variant private static final long C1 = 0x87c37b91114253d5L; private static final long C2 = 0x4cf5ad432745937fL; private static final int R1 = 31; @@ -73,123 +88,223 @@ public final class MurmurHash3 { private static final int N1 = 0x52dce729; private static final int N2 = 0x38495ab5; - public static final int DEFAULT_SEED = 104729; - - // all methods static; private constructor. + /** No instance methods. */ private MurmurHash3() { } /** - * Generates 32 bit hash from two longs with default seed value. + * Generates 32-bit hash from two longs with a default seed value. + * This is a helper method that will produce the same result as: * - * @param l0 long to hash - * @param l1 long to hash - * @return 32 bit hash - */ - public static int hash32(final long l0, final long l1) { - return hash32(l0, l1, DEFAULT_SEED); - } - - /** - * Generates 32 bit hash from a long with default seed value. + * <pre> + * int seed = 104729; + * int hash = hash32(ByteBuffer.allocate(16) + * .putLong(data1) + * .putLong(data2) + * .array(), 16, seed); + * </pre> * - * @param l0 long to hash - * @return 32 bit hash + * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect + * this result as there are no bytes left over from dividing the length by 4.<p> + * + * @param data1 The first long to hash + * @param data2 The second long to hash + * @return The 32-bit hash + * @see #hash32(byte[], int, int) */ - public static int hash32(final long l0) { - return hash32(l0, DEFAULT_SEED); + public static int hash32(final long data1, final long data2) { + return hash32(data1, data2, DEFAULT_SEED); } /** - * Generates 32 bit hash from a long with the given seed. + * Generates 32-bit hash from two longs with the given seed. + * This is a helper method that will produce the same result as: + * + * <pre> + * int hash = hash32(ByteBuffer.allocate(16) + * .putLong(data1) + * .putLong(data2) + * .array(), 16, seed); + * </pre> * - * @param l0 long to hash - * @param seed initial seed value - * @return 32 bit hash + * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect + * this result as there are no bytes left over from dividing the length by 4.<p> + * + * @param data1 The first long to hash + * @param data2 The second long to hash + * @param seed The initial seed value + * @return The 32-bit hash + * @see #hash32(byte[], int, int) */ - public static int hash32(final long l0, final int seed) { + public static int hash32(final long data1, final long data2, final int seed) { int hash = seed; - final long r0 = Long.reverseBytes(l0); + final long r0 = Long.reverseBytes(data1); + final long r1 = Long.reverseBytes(data2); hash = mix32((int) r0, hash); hash = mix32((int) (r0 >>> 32), hash); + hash = mix32((int) (r1), hash); + hash = mix32((int) (r1 >>> 32), hash); - return fmix32(LONG_BYTES, hash); + hash ^= LONG_BYTES * 2; + return fmix32(hash); } /** - * Generates 32 bit hash from two longs with the given seed. + * Generates 32-bit hash from a long with a default seed value. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 104729; + * int hash = hash32(ByteBuffer.allocate(8) + * .putLong(data) + * .array(), 8, seed); + * </pre> * - * @param l0 long to hash - * @param l1 long to hash - * @param seed initial seed value - * @return 32 bit hash + * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect + * this result as there are no bytes left over from dividing the length by 4.<p> + * + * @param data The long to hash + * @return The 32-bit hash + * @see #hash32(byte[], int, int) */ - public static int hash32(final long l0, final long l1, final int seed) { + public static int hash32(final long data) { + return hash32(data, DEFAULT_SEED); + } + + /** + * Generates 32-bit hash from a long with the given seed. + * This is a helper method that will produce the same result as: + * + * <pre> + * int hash = hash32(ByteBuffer.allocate(8) + * .putLong(data) + * .array(), 8, seed); + * </pre> + * + * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, int)} does not affect + * this result as there are no bytes left over from dividing the length by 4.<p> + * + * @param data The long to hash + * @param seed The initial seed value + * @return The 32-bit hash + * @see #hash32(byte[], int, int) + */ + public static int hash32(final long data, final int seed) { int hash = seed; - final long r0 = Long.reverseBytes(l0); - final long r1 = Long.reverseBytes(l1); + final long r0 = Long.reverseBytes(data); hash = mix32((int) r0, hash); hash = mix32((int) (r0 >>> 32), hash); - hash = mix32((int) (r1), hash); - hash = mix32((int) (r1 >>> 32), hash); - return fmix32(LONG_BYTES * 2, hash); + hash ^= LONG_BYTES; + return fmix32(hash); } /** - * Generates 32 bit hash from byte array with the default seed. + * Generates 32-bit hash from the byte array with a default seed. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 104729; + * int hash = hash32(data, 0, data.length, seed); + * </pre> * - * @param data - input byte array - * @return 32 bit hash + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @param data The input byte array + * @return The 32-bit hash + * @see #hash32(byte[], int, int, int) */ public static int hash32(final byte[] data) { return hash32(data, 0, data.length, DEFAULT_SEED); } /** - * Generates 32 bit hash from a string with the default seed. + * Generates 32-bit hash from a string with a default seed. + * The string is converted to bytes using the default encoding. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 104729; + * byte[] bytes = data.getBytes(); + * int hash = hash32(bytes, 0, bytes.length, seed); + * </pre> * - * @param data - input string - * @return 32 bit hash + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @param data The input string + * @return The 32-bit hash + * @see #hash32(byte[], int, int, int) */ public static int hash32(final String data) { - final byte[] origin = data.getBytes(); - return hash32(origin, 0, origin.length, DEFAULT_SEED); + final byte[] bytes = data.getBytes(); + return hash32(bytes, 0, bytes.length, DEFAULT_SEED); } /** - * Generates 32 bit hash from byte array with the default seed. + * Generates 32-bit hash from the byte array with the given length and a default seed. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 104729; + * int hash = hash32(data, 0, data.length, seed); + * </pre> * - * @param data - input byte array - * @param length - length of array - * @return 32 bit hash + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @param data The input byte array + * @param length The length of array + * @return The 32-bit hash + * @see #hash32(byte[], int, int, int) */ public static int hash32(final byte[] data, final int length) { return hash32(data, length, DEFAULT_SEED); } /** - * Generates 32 bit hash from byte array with the given length and seed. + * Generates 32-bit hash from the byte array with the given length and seed. This is a + * helper method that will produce the same result as: + * + * <pre> + * int hash = hash32(data, 0, data.length, seed); + * </pre> * - * @param data - input byte array - * @param length - length of array - * @param seed - seed. (default 0) - * @return 32 bit hash + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @param data The input byte array + * @param length The length of array + * @param seed The initial seed value + * @return The 32-bit hash + * @see #hash32(byte[], int, int, int) */ public static int hash32(final byte[] data, final int length, final int seed) { return hash32(data, 0, length, seed); } /** - * Generates 32 bit hash from byte array with the given length, offset and seed. + * Generates 32-bit hash from the byte array with the given offset, length and seed. + * + * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} + * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> * - * @param data - input byte array - * @param offset - offset of data - * @param length - length of array - * @param seed - seed. (default 0) - * @return 32 bit hash + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @param data The input byte array + * @param offset The offset of data + * @param length The length of array + * @param seed The initial seed value + * @return The 32-bit hash */ public static int hash32(final byte[] data, final int offset, final int length, final int seed) { int hash = seed; @@ -197,23 +312,24 @@ public final class MurmurHash3 { // body for (int i = 0; i < nblocks; i++) { - final int i_4 = i << 2; - final int k = (data[offset + i_4] & 0xff) | ((data[offset + i_4 + 1] & 0xff) << 8) - | ((data[offset + i_4 + 2] & 0xff) << 16) | ((data[offset + i_4 + 3] & 0xff) << 24); - + final int index = offset + (i << 2); + final int k = getLittleEndianInt(data, index); hash = mix32(k, hash); } // tail - final int idx = nblocks << 2; + // ************ + // Note: This fails to apply masking using 0xff to the 3 remaining bytes. + // ************ + final int index = offset + (nblocks << 2); int k1 = 0; - switch (length - idx) { + switch (offset + length - index) { case 3: - k1 ^= data[offset + idx + 2] << 16; + k1 ^= data[index + 2] << 16; case 2: - k1 ^= data[offset + idx + 1] << 8; + k1 ^= data[index + 1] << 8; case 1: - k1 ^= data[offset + idx]; + k1 ^= data[index]; // mix functions k1 *= C1_32; @@ -222,26 +338,29 @@ public final class MurmurHash3 { hash ^= k1; } - return fmix32(length, hash); + hash ^= length; + return fmix32(hash); } /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit - * variant. + * Generates 64-bit hash from a long with a default seed. * - * @param data - input byte array - * @return 64 bit hash - */ - public static long hash64(final byte[] data) { - return hash64(data, 0, data.length, DEFAULT_SEED); - } - - /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit - * variant. + * <p>This is a Murmur3-like 64-bit variant. + * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong> + * It may be removed in a future release.</p> + * + * <p>This is a helper method that will produce the same result as:</p> * - * @param data - input long - * @return 64 bit hash + * <pre> + * int seed = 104729; + * long hash = hash64(ByteBuffer.allocate(8) + * .putLong(data) + * .array(), 0, 8, seed); + * <pre> + * + * @param data The long to hash + * @return The 64-bit hash + * @see #hash64(byte[], int, int, int) */ public static long hash64(final long data) { long hash = DEFAULT_SEED; @@ -260,11 +379,24 @@ public final class MurmurHash3 { } /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit - * variant. + * Generates 64-bit hash from an int with a default seed. + * + * <p>This is a Murmur3-like 64-bit variant. + * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong> + * It may be removed in a future release.</p> + * + * <p>This is a helper method that will produce the same result as:</p> + * + * <pre> + * int seed = 104729; + * long hash = hash64(ByteBuffer.allocate(4) + * .putInt(data) + * .array(), 0, 4, seed); + * <pre> * - * @param data - input int - * @return 64 bit hash + * @param data The int to hash + * @return The 64-bit hash + * @see #hash64(byte[], int, int, int) */ public static long hash64(final int data) { long k1 = Integer.reverseBytes(data) & (-1L >>> 32); @@ -281,11 +413,24 @@ public final class MurmurHash3 { } /** - * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit - * variant. + * Generates 64-bit hash from a short with a default seed. + * + * <p>This is a Murmur3-like 64-bit variant. + * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong> + * It may be removed in a future release.</p> + * + * <p>This is a helper method that will produce the same result as:</p> + * + * <pre> + * int seed = 104729; + * long hash = hash64(ByteBuffer.allocate(2) + * .putShort(data) + * .array(), 0, 2, seed); + * <pre> * - * @param data - input short - * @return 64 bit hash + * @param data The short to hash + * @return The 64-bit hash + * @see #hash64(byte[], int, int, int) */ public static long hash64(final short data) { long hash = DEFAULT_SEED; @@ -304,38 +449,84 @@ public final class MurmurHash3 { } /** - * Generates 64 bit hash from byte array with the given length, offset and - * default seed. + * Generates 64-bit hash from a byte array with a default seed. * - * @param data - input byte array - * @param offset - offset of data - * @param length - length of array - * @return 64 bit hash + * <p>This is a Murmur3-like 64-bit variant. + * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong> + * It may be removed in a future release.</p> + * + * <p>This is a helper method that will produce the same result as:</p> + * + * <pre> + * int seed = 104729; + * long hash = hash64(data, 0, data.length, seed); + * <pre> + * + * @param data The input byte array + * @return The 64-bit hash + * @see #hash64(byte[], int, int, int) + */ + public static long hash64(final byte[] data) { + return hash64(data, 0, data.length, DEFAULT_SEED); + } + + /** + * Generates 64-bit hash from a byte array with the given offset and length and a default seed. + * + * <p>This is a Murmur3-like 64-bit variant. + * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong> + * It may be removed in a future release.</p> + * + * <p>This is a helper method that will produce the same result as:</p> + * + * <pre> + * int seed = 104729; + * long hash = hash64(data, offset, length, seed); + * <pre> + * + * @param data The input byte array + * @param offset The offset of data + * @param length The length of array + * @return The 64-bit hash + * @see #hash64(byte[], int, int, int) */ public static long hash64(final byte[] data, final int offset, final int length) { return hash64(data, offset, length, DEFAULT_SEED); } /** - * Generates 64 bit hash from byte array with the given length, offset and seed. + * Generates 64-bit hash from a byte array with the given offset, length and seed. + * + * <p>This is a Murmur3-like 64-bit variant. + * <strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong> + * It may be removed in a future release.</p> + * + * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks + * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash + * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return + * the same value as the first or second 64-bits of the function + * {@link #hash128(byte[], int, int, int)}.<p> * - * @param data - input byte array - * @param offset - offset of data - * @param length - length of array - * @param seed - seed. (default 0) - * @return 64 bit hash + * <p>Use of this method is not advised. Use the first long returned from + * {@link #hash128(byte[], int, int, int)}.<p> + * + * @param data The input byte array + * @param offset The offset of data + * @param length The length of array + * @param seed The initial seed value + * @return The 64-bit hash */ public static long hash64(final byte[] data, final int offset, final int length, final int seed) { + // ************ + // Note: This fails to apply masking using 0xffffffffL to the seed. + // ************ long hash = seed; final int nblocks = length >> 3; // body for (int i = 0; i < nblocks; i++) { - final int i8 = i << 3; - long k = ((long) data[offset + i8] & 0xff) | (((long) data[offset + i8 + 1] & 0xff) << 8) - | (((long) data[offset + i8 + 2] & 0xff) << 16) | (((long) data[offset + i8 + 3] & 0xff) << 24) - | (((long) data[offset + i8 + 4] & 0xff) << 32) | (((long) data[offset + i8 + 5] & 0xff) << 40) - | (((long) data[offset + i8 + 6] & 0xff) << 48) | (((long) data[offset + i8 + 7] & 0xff) << 56); + final int index = offset + (i << 3); + long k = getLittleEndianLong(data, index); // mix functions k *= C1; @@ -347,22 +538,22 @@ public final class MurmurHash3 { // tail long k1 = 0; - final int tailStart = nblocks << 3; - switch (length - tailStart) { + final int index = offset + (nblocks << 3); + switch (offset + length - index) { case 7: - k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48; + k1 ^= ((long) data[index + 6] & 0xff) << 48; case 6: - k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40; + k1 ^= ((long) data[index + 5] & 0xff) << 40; case 5: - k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32; + k1 ^= ((long) data[index + 4] & 0xff) << 32; case 4: - k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24; + k1 ^= ((long) data[index + 3] & 0xff) << 24; case 3: - k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16; + k1 ^= ((long) data[index + 2] & 0xff) << 16; case 2: - k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8; + k1 ^= ((long) data[index + 1] & 0xff) << 8; case 1: - k1 ^= ((long) data[offset + tailStart] & 0xff); + k1 ^= ((long) data[index] & 0xff); k1 *= C1; k1 = Long.rotateLeft(k1, R1); k1 *= C2; @@ -377,52 +568,76 @@ public final class MurmurHash3 { } /** - * Murmur3 128-bit variant. + * Generates 128-bit hash from the byte array with a default seed. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 104729; + * int hash = hash128(data, 0, data.length, seed); + * </pre> * - * @param data - input byte array - * @return - 128 bit hash (2 longs) + * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect + * this result as the default seed is positive.<p> + * + * @param data The input byte array + * @return The 128-bit hash (2 longs) + * @see #hash128(byte[], int, int, int) */ public static long[] hash128(final byte[] data) { return hash128(data, 0, data.length, DEFAULT_SEED); } /** - * Murmur3 128-bit variant. + * Generates 128-bit hash from a string with a default seed. + * The string is converted to bytes using the default encoding. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 104729; + * byte[] bytes = data.getBytes(); + * int hash = hash128(bytes, 0, bytes.length, seed); + * </pre> * - * @param data - input String - * @return - 128 bit hash (2 longs) + * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect + * this result as the default seed is positive.<p> + * + * @param data The input String + * @return The 128-bit hash (2 longs) + * @see #hash128(byte[], int, int, int) */ public static long[] hash128(final String data) { - final byte[] origin = data.getBytes(); - return hash128(origin, 0, origin.length, DEFAULT_SEED); + final byte[] bytes = data.getBytes(); + return hash128(bytes, 0, bytes.length, DEFAULT_SEED); } /** - * Murmur3 128-bit variant. + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} + * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> * - * @param data - input byte array - * @param offset - the first element of array - * @param length - length of array - * @param seed - seed. (default is 0) - * @return - 128 bit hash (2 longs) + * <p>This implementation contains a sign-extension bug in the seed initialisation. + * This manifests if the seed is negative.<p> + * + * @param data The input byte array + * @param offset The first element of array + * @param length The length of array + * @param seed The initial seed value + * @return The 128-bit hash (2 longs) */ public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) { + // ************ + // Note: This fails to apply masking using 0xffffffffL to the seed. + // ************ long h1 = seed; long h2 = seed; final int nblocks = length >> 4; // body for (int i = 0; i < nblocks; i++) { - final int i16 = i << 4; - long k1 = ((long) data[offset + i16] & 0xff) | (((long) data[offset + i16 + 1] & 0xff) << 8) - | (((long) data[offset + i16 + 2] & 0xff) << 16) | (((long) data[offset + i16 + 3] & 0xff) << 24) - | (((long) data[offset + i16 + 4] & 0xff) << 32) | (((long) data[offset + i16 + 5] & 0xff) << 40) - | (((long) data[offset + i16 + 6] & 0xff) << 48) | (((long) data[offset + i16 + 7] & 0xff) << 56); - - long k2 = ((long) data[offset + i16 + 8] & 0xff) | (((long) data[offset + i16 + 9] & 0xff) << 8) - | (((long) data[offset + i16 + 10] & 0xff) << 16) | (((long) data[offset + i16 + 11] & 0xff) << 24) - | (((long) data[offset + i16 + 12] & 0xff) << 32) | (((long) data[offset + i16 + 13] & 0xff) << 40) - | (((long) data[offset + i16 + 14] & 0xff) << 48) | (((long) data[offset + i16 + 15] & 0xff) << 56); + final int index = offset + (i << 4); + long k1 = getLittleEndianLong(data, index); + long k2 = getLittleEndianLong(data, index + 8); // mix functions for k1 k1 *= C1; @@ -446,43 +661,43 @@ public final class MurmurHash3 { // tail long k1 = 0; long k2 = 0; - final int tailStart = nblocks << 4; - switch (length - tailStart) { + final int index = offset + (nblocks << 4); + switch (offset + length - index) { case 15: - k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48; + k2 ^= ((long) data[index + 14] & 0xff) << 48; case 14: - k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40; + k2 ^= ((long) data[index + 13] & 0xff) << 40; case 13: - k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32; + k2 ^= ((long) data[index + 12] & 0xff) << 32; case 12: - k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24; + k2 ^= ((long) data[index + 11] & 0xff) << 24; case 11: - k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16; + k2 ^= ((long) data[index + 10] & 0xff) << 16; case 10: - k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8; + k2 ^= ((long) data[index + 9] & 0xff) << 8; case 9: - k2 ^= data[offset + tailStart + 8] & 0xff; + k2 ^= data[index + 8] & 0xff; k2 *= C2; k2 = Long.rotateLeft(k2, R3); k2 *= C1; h2 ^= k2; case 8: - k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56; + k1 ^= ((long) data[index + 7] & 0xff) << 56; case 7: - k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48; + k1 ^= ((long) data[index + 6] & 0xff) << 48; case 6: - k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40; + k1 ^= ((long) data[index + 5] & 0xff) << 40; case 5: - k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32; + k1 ^= ((long) data[index + 4] & 0xff) << 32; case 4: - k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24; + k1 ^= ((long) data[index + 3] & 0xff) << 24; case 3: - k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16; + k1 ^= ((long) data[index + 2] & 0xff) << 16; case 2: - k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8; + k1 ^= ((long) data[index + 1] & 0xff) << 8; case 1: - k1 ^= data[offset + tailStart] & 0xff; + k1 ^= data[index] & 0xff; k1 *= C1; k1 = Long.rotateLeft(k1, R1); k1 *= C2; @@ -505,6 +720,46 @@ public final class MurmurHash3 { return new long[] { h1, h2 }; } + + /** + * Gets the little-endian long from 8 bytes starting at the specified index. + * + * @param data The data + * @param index The index + * @return The little-endian long + */ + private static long getLittleEndianLong(final byte[] data, final int index) { + return (((long) data[index ] & 0xff) ) | + (((long) data[index + 1] & 0xff) << 8) | + (((long) data[index + 2] & 0xff) << 16) | + (((long) data[index + 3] & 0xff) << 24) | + (((long) data[index + 4] & 0xff) << 32) | + (((long) data[index + 5] & 0xff) << 40) | + (((long) data[index + 6] & 0xff) << 48) | + (((long) data[index + 7] & 0xff) << 56); + } + + /** + * Gets the little-endian int from 4 bytes starting at the specified index. + * + * @param data The data + * @param index The index + * @return The little-endian int + */ + private static int getLittleEndianInt(final byte[] data, final int index) { + return ((data[index ] & 0xff) ) | + ((data[index + 1] & 0xff) << 8) | + ((data[index + 2] & 0xff) << 16) | + ((data[index + 3] & 0xff) << 24); + } + + /** + * Perform the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. + * + * @param k The data to add to the hash + * @param hash The current hash + * @return The new hash + */ private static int mix32(int k, int hash) { k *= C1_32; k = Integer.rotateLeft(k, R1_32); @@ -513,104 +768,172 @@ public final class MurmurHash3 { return Integer.rotateLeft(hash, R2_32) * M_32 + N_32; } - private static int fmix32(final int length, int hash) { - hash ^= length; + /** + * Perform the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. + * + * @param hash The current hash + * @return The final hash + */ + private static int fmix32(int hash) { hash ^= (hash >>> 16); hash *= 0x85ebca6b; hash ^= (hash >>> 13); hash *= 0xc2b2ae35; hash ^= (hash >>> 16); - return hash; } - private static long fmix64(long h) { - h ^= (h >>> 33); - h *= 0xff51afd7ed558ccdL; - h ^= (h >>> 33); - h *= 0xc4ceb9fe1a85ec53L; - h ^= (h >>> 33); - return h; + /** + * Perform the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}. + * + * @param hash The current hash + * @return The final hash + */ + private static long fmix64(long hash) { + hash ^= (hash >>> 33); + hash *= 0xff51afd7ed558ccdL; + hash ^= (hash >>> 33); + hash *= 0xc4ceb9fe1a85ec53L; + hash ^= (hash >>> 33); + return hash; } + /** + * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new + * hash computed. + * + * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} + * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> + * + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + */ public static class IncrementalHash32 { - byte[] tail = new byte[3]; - int tailLen; - int totalLen; - int hash; - - public final void start(final int hash) { - tailLen = totalLen = 0; - this.hash = hash; + /** The size of byte blocks that are processed together. */ + private static final int BLOCK_SIZE = 4; + + /** Up to 3 unprocessed bytes from input data. */ + private final byte[] unprocessed = new byte[3]; + /** The number of unprocessed bytes in the tail data. */ + private int unprocessedLength; + /** The total number of input bytes added since the start. */ + private int totalLen; + /** + * The current running hash. + * This must be finalised to generate the 32-bit hash value. + */ + private int hash; + + /** + * Start a new incremental hash. + * + * @param seed The initial seed value + */ + public final void start(final int seed) { + // Reset + unprocessedLength = totalLen = 0; + this.hash = seed; } - public final void add(final byte[] data, int offset, final int length) { + /** + * Adds the byte array to the current incremental hash. + * + * @param data The input byte array + * @param offset The offset of data + * @param length The length of array + */ + public final void add(final byte[] data, final int offset, final int length) { if (length == 0) { + // Nothing to add return; } totalLen += length; - if (tailLen + length < 4) { - System.arraycopy(data, offset, tail, tailLen, length); - tailLen += length; + + // Process the bytes in blocks of 4. + // New bytes must be added to any current unprocessed bytes, + // then processed in blocks of 4 and the remaining bytes saved: + // + // |--|---------------------------|--| + // unprocessed + // main block + // remaining + + // Check if the unprocessed bytes and new bytes can fill a block of 4 + if (unprocessedLength + length < BLOCK_SIZE) { + // Not enough so add to the unprocessed bytes + System.arraycopy(data, offset, unprocessed, unprocessedLength, length); + unprocessedLength += length; return; } - int offset2 = 0; - if (tailLen > 0) { - offset2 = (4 - tailLen); + + // Combine unprocessed bytes with new bytes. + int newOffset; + int newLength; + if (unprocessedLength > 0) { int k = -1; - switch (tailLen) { + switch (unprocessedLength) { case 1: - k = orBytes(tail[0], data[offset], data[offset + 1], data[offset + 2]); + k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]); break; case 2: - k = orBytes(tail[0], tail[1], data[offset], data[offset + 1]); + k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]); break; case 3: - k = orBytes(tail[0], tail[1], tail[2], data[offset]); + k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]); break; default: - throw new AssertionError(tailLen); + throw new AssertionError("Unprocessed length should be 1, 2, or 3: " + unprocessedLength); } - // mix functions - k *= C1_32; - k = Integer.rotateLeft(k, R1_32); - k *= C2_32; - hash ^= k; - hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; + hash = mix32(k, hash); + // Update the offset and length + final int consumed = BLOCK_SIZE - unprocessedLength; + newOffset = offset + consumed; + newLength = length - consumed; + } else { + newOffset = offset; + newLength = length; } - final int length2 = length - offset2; - offset += offset2; - final int nblocks = length2 >> 2; - for (int i = 0; i < nblocks; i++) { - final int i_4 = (i << 2) + offset; - int k = orBytes(data[i_4], data[i_4 + 1], data[i_4 + 2], data[i_4 + 3]); + // Main processing of blocks of 4 bytes + final int nblocks = newLength >> 2; - // mix functions - k *= C1_32; - k = Integer.rotateLeft(k, R1_32); - k *= C2_32; - hash ^= k; - hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; + for (int i = 0; i < nblocks; i++) { + final int index = newOffset + (i << 2); + final int k = getLittleEndianInt(data, index); + hash = mix32(k, hash); } + // Save left-over unprocessed bytes final int consumed = (nblocks << 2); - tailLen = length2 - consumed; - if (consumed == length2) { - return; + unprocessedLength = newLength - consumed; + if (unprocessedLength != 0) { + System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength); } - System.arraycopy(data, offset + consumed, tail, 0, tailLen); } + /** + * Generate the 32-bit hash value. Repeat calls to this method with no additional data + * will generate the same hash value. + * + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @return The 32-bit hash + */ public final int end() { + // ************ + // Note: This fails to apply masking using 0xff to the 3 remaining bytes. + // ************ int k1 = 0; - switch (tailLen) { + switch (unprocessedLength) { case 3: - k1 ^= tail[2] << 16; + k1 ^= unprocessed[2] << 16; case 2: - k1 ^= tail[1] << 8; + k1 ^= unprocessed[1] << 8; case 1: - k1 ^= tail[0]; + k1 ^= unprocessed[0]; // mix functions k1 *= C1_32; @@ -621,16 +944,22 @@ public final class MurmurHash3 { // finalization hash ^= totalLen; - hash ^= (hash >>> 16); - hash *= 0x85ebca6b; - hash ^= (hash >>> 13); - hash *= 0xc2b2ae35; - hash ^= (hash >>> 16); - return hash; + return fmix32(hash); } - } - private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) { - return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24); + /** + * Combine the bytes using an Or operation ({@code | } in a little-endian representation + * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most + * significant. + * + * @param b1 The first byte + * @param b2 The second byte + * @param b3 The third byte + * @param b4 The fourth byte + * @return The 32-bit integer + */ + private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) { + return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24); + } } }
