[commons-codec] 01/07: Fully documented MurmurHash3.

aherbert Thu, 21 Nov 2019 07:55:04 -0800

This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git


commit b5ce7fae5bb6794d41216756f080a087cb956e90
Author: aherbert <[email protected]>
AuthorDate: Thu Nov 21 14:04:30 2019 +0000

    Fully documented MurmurHash3.
    
    This attempts no code changes.
    
    The highly modified class passes the current tests.
    
    mvn clirr:clirr report shows no changes.
    Default mvn goal passes:
    mvn clean verify apache-rat:check clirr:check javadoc:javadoc
    
    - Documented all public constants.
    - Documented IncrementalHash32 class.
    - Added <p> tags to paragraphs.
    - Renamed input variables for consistency across all method.
    - Renamed internal variables in the hash32, 64, 128 methods so the
    similarities in the methods are obvious.
    - prefix joavadoc @params and @returns with 'The' consistently
    - Added code examples for the methods that hash primitives for the
    equivalent using a ByteBuffer to encode the bytes.
    - Fix links in the class header.
    - Documented sign extension bugs in hash32 and hash128.
    - Documented hash64 is not from the reference code and does not match
    part of hash128.
    - Documented hash64 as may be released in a future version.
    - Refactored combination of bytes to little endian int/long into
    methods.
---
 .../apache/commons/codec/digest/MurmurHash3.java   | 807 +++++++++++++++------
 1 file changed, 568 insertions(+), 239 deletions(-)

diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java 
b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index 6a276f9..95ab8b1 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -20,28 +20,46 @@ package org.apache.commons.codec.digest;
 /**
  * MurmurHash3 yields a 32-bit or 128-bit value.
  *
- * MurmurHash is a non-cryptographic hash function suitable for general
+ * <p>MurmurHash is a non-cryptographic hash function suitable for general
  * hash-based lookup. The name comes from two basic operations, multiply (MU)
  * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
  * it is not specifically designed to be difficult to reverse by an adversary,
- * making it unsuitable for cryptographic purposes.
+ * making it unsuitable for cryptographic purposes.</p>
  *
- * 32-bit Java port of
- * https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
- * 128-bit Java port of
- * https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
+ * <p>This contains a Java port of the 32-bit hash function {@code 
MurmurHash3_x86_32}
+ * and the 128-bit hash function {@code MurmurHash3_x64_128} from Austin 
Applyby's
+ * original {@code c++} code in SMHasher.</p>
  *
- * This is a public domain code with no copyrights. From homepage of MurmurHash
- * (https://code.google.com/p/smhasher/), "All MurmurHash versions are public
- * domain software, and the author disclaims all copyright to their code."
+ * <p>This is public domain code with no copyrights. From homepage of
+ * <a href="https://github.com/aappleby/smhasher";>SMHasher</a>:</p>
  *
- * Copied from Apache Hive:
- * 
https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java
+ * <blockquote>
+ * "All MurmurHash versions are public domain software, and the author
+ * disclaims all copyright to their code."
+ * </blockquote>
+ *
+ * <p>Original adaption from Apache Hive. That adaption contains a {@code 
hash64} method
+ * that is not part of the original MurmurHash3 code.<p>
  *
  * @see <a href="https://en.wikipedia.org/wiki/MurmurHash";>MurmurHash</a>
+ * @see <a 
href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp";>
+ *   Original MurmurHash3 c++ code</a>
+ * @see <a 
href="https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java";>
+ *   Apache Hive Murmer3</a>
  * @since 1.13
  */
 public final class MurmurHash3 {
+    /**
+     * A random number to use for a hash code.
+     */
+    public static final long NULL_HASHCODE = 2862933555777941757L;
+
+    /**
+     * A default seed to use for the murmur hash algorithm.
+     * Has the value {@code 104729}.
+     *
+     */
+    public static final int DEFAULT_SEED = 104729;
 
     /** TODO Replace on Java 8 with Long.BYTES. */
     static final int LONG_BYTES = Long.SIZE / Byte.SIZE;
@@ -52,10 +70,7 @@ public final class MurmurHash3 {
     /** TODO Replace on Java 8 with Short.BYTES. */
     static final int SHORT_BYTES = Short.SIZE / Byte.SIZE;
 
-    // from 64-bit linear congruential generator
-    public static final long NULL_HASHCODE = 2862933555777941757L;
-
-    // Constants for 32 bit variant
+    // Constants for 32-bit variant
     private static final int C1_32 = 0xcc9e2d51;
     private static final int C2_32 = 0x1b873593;
     private static final int R1_32 = 15;
@@ -63,7 +78,7 @@ public final class MurmurHash3 {
     private static final int M_32 = 5;
     private static final int N_32 = 0xe6546b64;
 
-    // Constants for 128 bit variant
+    // Constants for 128-bit variant
     private static final long C1 = 0x87c37b91114253d5L;
     private static final long C2 = 0x4cf5ad432745937fL;
     private static final int R1 = 31;
@@ -73,123 +88,223 @@ public final class MurmurHash3 {
     private static final int N1 = 0x52dce729;
     private static final int N2 = 0x38495ab5;
 
-    public static final int DEFAULT_SEED = 104729;
-
-    // all methods static; private constructor.
+    /** No instance methods. */
     private MurmurHash3() {
     }
 
     /**
-     * Generates 32 bit hash from two longs with default seed value.
+     * Generates 32-bit hash from two longs with a default seed value.
+     * This is a helper method that will produce the same result as:
      *
-     * @param l0 long to hash
-     * @param l1 long to hash
-     * @return 32 bit hash
-     */
-    public static int hash32(final long l0, final long l1) {
-        return hash32(l0, l1, DEFAULT_SEED);
-    }
-
-    /**
-     * Generates 32 bit hash from a long with default seed value.
+     * <pre>
+     * int seed = 104729;
+     * int hash = hash32(ByteBuffer.allocate(16)
+     *                             .putLong(data1)
+     *                             .putLong(data2)
+     *                             .array(), 16, seed);
+     * </pre>
      *
-     * @param l0 long to hash
-     * @return 32 bit hash
+     * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, 
int)} does not affect
+     * this result as there are no bytes left over from dividing the length by 
4.<p>
+     *
+     * @param data1 The first long to hash
+     * @param data2 The second long to hash
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int)
      */
-    public static int hash32(final long l0) {
-        return hash32(l0, DEFAULT_SEED);
+    public static int hash32(final long data1, final long data2) {
+        return hash32(data1, data2, DEFAULT_SEED);
     }
 
     /**
-     * Generates 32 bit hash from a long with the given seed.
+     * Generates 32-bit hash from two longs with the given seed.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int hash = hash32(ByteBuffer.allocate(16)
+     *                             .putLong(data1)
+     *                             .putLong(data2)
+     *                             .array(), 16, seed);
+     * </pre>
      *
-     * @param l0   long to hash
-     * @param seed initial seed value
-     * @return 32 bit hash
+     * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, 
int)} does not affect
+     * this result as there are no bytes left over from dividing the length by 
4.<p>
+     *
+     * @param data1 The first long to hash
+     * @param data2 The second long to hash
+     * @param seed The initial seed value
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int)
      */
-    public static int hash32(final long l0, final int seed) {
+    public static int hash32(final long data1, final long data2, final int 
seed) {
         int hash = seed;
-        final long r0 = Long.reverseBytes(l0);
+        final long r0 = Long.reverseBytes(data1);
+        final long r1 = Long.reverseBytes(data2);
 
         hash = mix32((int) r0, hash);
         hash = mix32((int) (r0 >>> 32), hash);
+        hash = mix32((int) (r1), hash);
+        hash = mix32((int) (r1 >>> 32), hash);
 
-        return fmix32(LONG_BYTES, hash);
+        hash ^= LONG_BYTES * 2;
+        return fmix32(hash);
     }
 
     /**
-     * Generates 32 bit hash from two longs with the given seed.
+     * Generates 32-bit hash from a long with a default seed value.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 104729;
+     * int hash = hash32(ByteBuffer.allocate(8)
+     *                             .putLong(data)
+     *                             .array(), 8, seed);
+     * </pre>
      *
-     * @param l0   long to hash
-     * @param l1   long to hash
-     * @param seed initial seed value
-     * @return 32 bit hash
+     * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, 
int)} does not affect
+     * this result as there are no bytes left over from dividing the length by 
4.<p>
+     *
+     * @param data The long to hash
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int)
      */
-    public static int hash32(final long l0, final long l1, final int seed) {
+    public static int hash32(final long data) {
+        return hash32(data, DEFAULT_SEED);
+    }
+
+    /**
+     * Generates 32-bit hash from a long with the given seed.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int hash = hash32(ByteBuffer.allocate(8)
+     *                             .putLong(data)
+     *                             .array(), 8, seed);
+     * </pre>
+     *
+     * <p>Note: The sign extension bug in {@link #hash32(byte[], int, int, 
int)} does not affect
+     * this result as there are no bytes left over from dividing the length by 
4.<p>
+     *
+     * @param data The long to hash
+     * @param seed The initial seed value
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int)
+     */
+    public static int hash32(final long data, final int seed) {
         int hash = seed;
-        final long r0 = Long.reverseBytes(l0);
-        final long r1 = Long.reverseBytes(l1);
+        final long r0 = Long.reverseBytes(data);
 
         hash = mix32((int) r0, hash);
         hash = mix32((int) (r0 >>> 32), hash);
-        hash = mix32((int) (r1), hash);
-        hash = mix32((int) (r1 >>> 32), hash);
 
-        return fmix32(LONG_BYTES * 2, hash);
+        hash ^= LONG_BYTES;
+        return fmix32(hash);
     }
 
     /**
-     * Generates 32 bit hash from byte array with the default seed.
+     * Generates 32-bit hash from the byte array with a default seed.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 104729;
+     * int hash = hash32(data, 0, data.length, seed);
+     * </pre>
      *
-     * @param data - input byte array
-     * @return 32 bit hash
+     * <p>This implementation contains a sign-extension bug in the 
finalisation step of
+     * any bytes left over from dividing the length by 4. This manifests if 
any of these
+     * bytes are negative.<p>
+     *
+     * @param data The input byte array
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int, int)
      */
     public static int hash32(final byte[] data) {
         return hash32(data, 0, data.length, DEFAULT_SEED);
     }
 
     /**
-     * Generates 32 bit hash from a string with the default seed.
+     * Generates 32-bit hash from a string with a default seed.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 104729;
+     * byte[] bytes = data.getBytes();
+     * int hash = hash32(bytes, 0, bytes.length, seed);
+     * </pre>
      *
-     * @param data - input string
-     * @return 32 bit hash
+     * <p>This implementation contains a sign-extension bug in the 
finalisation step of
+     * any bytes left over from dividing the length by 4. This manifests if 
any of these
+     * bytes are negative.<p>
+     *
+     * @param data The input string
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int, int)
      */
     public static int hash32(final String data) {
-        final byte[] origin = data.getBytes();
-        return hash32(origin, 0, origin.length, DEFAULT_SEED);
+        final byte[] bytes = data.getBytes();
+        return hash32(bytes, 0, bytes.length, DEFAULT_SEED);
     }
 
     /**
-     * Generates 32 bit hash from byte array with the default seed.
+     * Generates 32-bit hash from the byte array with the given length and a 
default seed.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 104729;
+     * int hash = hash32(data, 0, data.length, seed);
+     * </pre>
      *
-     * @param data   - input byte array
-     * @param length - length of array
-     * @return 32 bit hash
+     * <p>This implementation contains a sign-extension bug in the 
finalisation step of
+     * any bytes left over from dividing the length by 4. This manifests if 
any of these
+     * bytes are negative.<p>
+     *
+     * @param data The input byte array
+     * @param length The length of array
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int, int)
      */
     public static int hash32(final byte[] data, final int length) {
         return hash32(data, length, DEFAULT_SEED);
     }
 
     /**
-     * Generates 32 bit hash from byte array with the given length and seed.
+     * Generates 32-bit hash from the byte array with the given length and 
seed. This is a
+     * helper method that will produce the same result as:
+     *
+     * <pre>
+     * int hash = hash32(data, 0, data.length, seed);
+     * </pre>
      *
-     * @param data   - input byte array
-     * @param length - length of array
-     * @param seed   - seed. (default 0)
-     * @return 32 bit hash
+     * <p>This implementation contains a sign-extension bug in the 
finalisation step of
+     * any bytes left over from dividing the length by 4. This manifests if 
any of these
+     * bytes are negative.<p>
+     *
+     * @param data The input byte array
+     * @param length The length of array
+     * @param seed The initial seed value
+     * @return The 32-bit hash
+     * @see #hash32(byte[], int, int, int)
      */
     public static int hash32(final byte[] data, final int length, final int 
seed) {
         return hash32(data, 0, length, seed);
     }
 
     /**
-     * Generates 32 bit hash from byte array with the given length, offset and 
seed.
+     * Generates 32-bit hash from the byte array with the given offset, length 
and seed.
+     *
+     * <p>This is an implementation of the 32-bit hash function {@code 
MurmurHash3_x86_32}
+     * from from Austin Applyby's original MurmurHash3 {@code c++} code in 
SMHasher.</p>
      *
-     * @param data   - input byte array
-     * @param offset - offset of data
-     * @param length - length of array
-     * @param seed   - seed. (default 0)
-     * @return 32 bit hash
+     * <p>This implementation contains a sign-extension bug in the 
finalisation step of
+     * any bytes left over from dividing the length by 4. This manifests if 
any of these
+     * bytes are negative.<p>
+     *
+     * @param data The input byte array
+     * @param offset The offset of data
+     * @param length The length of array
+     * @param seed The initial seed value
+     * @return The 32-bit hash
      */
     public static int hash32(final byte[] data, final int offset, final int 
length, final int seed) {
         int hash = seed;
@@ -197,23 +312,24 @@ public final class MurmurHash3 {
 
         // body
         for (int i = 0; i < nblocks; i++) {
-            final int i_4 = i << 2;
-            final int k = (data[offset + i_4] & 0xff) | ((data[offset + i_4 + 
1] & 0xff) << 8)
-                    | ((data[offset + i_4 + 2] & 0xff) << 16) | ((data[offset 
+ i_4 + 3] & 0xff) << 24);
-
+            final int index = offset + (i << 2);
+            final int k = getLittleEndianInt(data, index);
             hash = mix32(k, hash);
         }
 
         // tail
-        final int idx = nblocks << 2;
+        // ************
+        // Note: This fails to apply masking using 0xff to the 3 remaining 
bytes.
+        // ************
+        final int index = offset + (nblocks << 2);
         int k1 = 0;
-        switch (length - idx) {
+        switch (offset + length - index) {
         case 3:
-            k1 ^= data[offset + idx + 2] << 16;
+            k1 ^= data[index + 2] << 16;
         case 2:
-            k1 ^= data[offset + idx + 1] << 8;
+            k1 ^= data[index + 1] << 8;
         case 1:
-            k1 ^= data[offset + idx];
+            k1 ^= data[index];
 
             // mix functions
             k1 *= C1_32;
@@ -222,26 +338,29 @@ public final class MurmurHash3 {
             hash ^= k1;
         }
 
-        return fmix32(length, hash);
+        hash ^= length;
+        return fmix32(hash);
     }
 
     /**
-     * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit
-     * variant.
+     * Generates 64-bit hash from a long with a default seed.
      *
-     * @param data - input byte array
-     * @return 64 bit hash
-     */
-    public static long hash64(final byte[] data) {
-        return hash64(data, 0, data.length, DEFAULT_SEED);
-    }
-
-    /**
-     * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit
-     * variant.
+     * <p>This is a Murmur3-like 64-bit variant.
+     * <strong>This is not part of the original MurmurHash3 {@code c++} 
implementation.</strong>
+     * It may be removed in a future release.</p>
+     *
+     * <p>This is a helper method that will produce the same result as:</p>
      *
-     * @param data - input long
-     * @return 64 bit hash
+     * <pre>
+     * int seed = 104729;
+     * long hash = hash64(ByteBuffer.allocate(8)
+     *                              .putLong(data)
+     *                              .array(), 0, 8, seed);
+     * <pre>
+     *
+     * @param data The long to hash
+     * @return The 64-bit hash
+     * @see #hash64(byte[], int, int, int)
      */
     public static long hash64(final long data) {
         long hash = DEFAULT_SEED;
@@ -260,11 +379,24 @@ public final class MurmurHash3 {
     }
 
     /**
-     * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit
-     * variant.
+     * Generates 64-bit hash from an int with a default seed.
+     *
+     * <p>This is a Murmur3-like 64-bit variant.
+     * <strong>This is not part of the original MurmurHash3 {@code c++} 
implementation.</strong>
+     * It may be removed in a future release.</p>
+     *
+     * <p>This is a helper method that will produce the same result as:</p>
+     *
+     * <pre>
+     * int seed = 104729;
+     * long hash = hash64(ByteBuffer.allocate(4)
+     *                              .putInt(data)
+     *                              .array(), 0, 4, seed);
+     * <pre>
      *
-     * @param data - input int
-     * @return 64 bit hash
+     * @param data The int to hash
+     * @return The 64-bit hash
+     * @see #hash64(byte[], int, int, int)
      */
     public static long hash64(final int data) {
         long k1 = Integer.reverseBytes(data) & (-1L >>> 32);
@@ -281,11 +413,24 @@ public final class MurmurHash3 {
     }
 
     /**
-     * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 
128-bit
-     * variant.
+     * Generates 64-bit hash from a short with a default seed.
+     *
+     * <p>This is a Murmur3-like 64-bit variant.
+     * <strong>This is not part of the original MurmurHash3 {@code c++} 
implementation.</strong>
+     * It may be removed in a future release.</p>
+     *
+     * <p>This is a helper method that will produce the same result as:</p>
+     *
+     * <pre>
+     * int seed = 104729;
+     * long hash = hash64(ByteBuffer.allocate(2)
+     *                              .putShort(data)
+     *                              .array(), 0, 2, seed);
+     * <pre>
      *
-     * @param data - input short
-     * @return 64 bit hash
+     * @param data The short to hash
+     * @return The 64-bit hash
+     * @see #hash64(byte[], int, int, int)
      */
     public static long hash64(final short data) {
         long hash = DEFAULT_SEED;
@@ -304,38 +449,84 @@ public final class MurmurHash3 {
     }
 
     /**
-     * Generates 64 bit hash from byte array with the given length, offset and
-     * default seed.
+     * Generates 64-bit hash from a byte array with a default seed.
      *
-     * @param data   - input byte array
-     * @param offset - offset of data
-     * @param length - length of array
-     * @return 64 bit hash
+     * <p>This is a Murmur3-like 64-bit variant.
+     * <strong>This is not part of the original MurmurHash3 {@code c++} 
implementation.</strong>
+     * It may be removed in a future release.</p>
+     *
+     * <p>This is a helper method that will produce the same result as:</p>
+     *
+     * <pre>
+     * int seed = 104729;
+     * long hash = hash64(data, 0, data.length, seed);
+     * <pre>
+     *
+     * @param data The input byte array
+     * @return The 64-bit hash
+     * @see #hash64(byte[], int, int, int)
+     */
+    public static long hash64(final byte[] data) {
+        return hash64(data, 0, data.length, DEFAULT_SEED);
+    }
+
+    /**
+     * Generates 64-bit hash from a byte array with the given offset and 
length and a default seed.
+     *
+     * <p>This is a Murmur3-like 64-bit variant.
+     * <strong>This is not part of the original MurmurHash3 {@code c++} 
implementation.</strong>
+     * It may be removed in a future release.</p>
+     *
+     * <p>This is a helper method that will produce the same result as:</p>
+     *
+     * <pre>
+     * int seed = 104729;
+     * long hash = hash64(data, offset, length, seed);
+     * <pre>
+     *
+     * @param data The input byte array
+     * @param offset The offset of data
+     * @param length The length of array
+     * @return The 64-bit hash
+     * @see #hash64(byte[], int, int, int)
      */
     public static long hash64(final byte[] data, final int offset, final int 
length) {
         return hash64(data, offset, length, DEFAULT_SEED);
     }
 
     /**
-     * Generates 64 bit hash from byte array with the given length, offset and 
seed.
+     * Generates 64-bit hash from a byte array with the given offset, length 
and seed.
+     *
+     * <p>This is a Murmur3-like 64-bit variant.
+     * <strong>This is not part of the original MurmurHash3 {@code c++} 
implementation.</strong>
+     * It may be removed in a future release.</p>
+     *
+     * <p>This algorithm processes 8 bytes chunks of data in a manner similar 
to the 16 byte chunks
+     * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} 
method. However the hash
+     * is not mixed with a hash chunk from the next 8 bytes of data. The 
method will not return
+     * the same value as the first or second 64-bits of the function
+     * {@link #hash128(byte[], int, int, int)}.<p>
      *
-     * @param data   - input byte array
-     * @param offset - offset of data
-     * @param length - length of array
-     * @param seed   - seed. (default 0)
-     * @return 64 bit hash
+     * <p>Use of this method is not advised. Use the first long returned from
+     * {@link #hash128(byte[], int, int, int)}.<p>
+     *
+     * @param data The input byte array
+     * @param offset The offset of data
+     * @param length The length of array
+     * @param seed The initial seed value
+     * @return The 64-bit hash
      */
     public static long hash64(final byte[] data, final int offset, final int 
length, final int seed) {
+        // ************
+        // Note: This fails to apply masking using 0xffffffffL to the seed.
+        // ************
         long hash = seed;
         final int nblocks = length >> 3;
 
         // body
         for (int i = 0; i < nblocks; i++) {
-            final int i8 = i << 3;
-            long k = ((long) data[offset + i8] & 0xff) | (((long) data[offset 
+ i8 + 1] & 0xff) << 8)
-                    | (((long) data[offset + i8 + 2] & 0xff) << 16) | (((long) 
data[offset + i8 + 3] & 0xff) << 24)
-                    | (((long) data[offset + i8 + 4] & 0xff) << 32) | (((long) 
data[offset + i8 + 5] & 0xff) << 40)
-                    | (((long) data[offset + i8 + 6] & 0xff) << 48) | (((long) 
data[offset + i8 + 7] & 0xff) << 56);
+            final int index = offset + (i << 3);
+            long k = getLittleEndianLong(data, index);
 
             // mix functions
             k *= C1;
@@ -347,22 +538,22 @@ public final class MurmurHash3 {
 
         // tail
         long k1 = 0;
-        final int tailStart = nblocks << 3;
-        switch (length - tailStart) {
+        final int index = offset + (nblocks << 3);
+        switch (offset + length - index) {
         case 7:
-            k1 ^= ((long) data[offset + tailStart + 6] & 0xff) << 48;
+            k1 ^= ((long) data[index + 6] & 0xff) << 48;
         case 6:
-            k1 ^= ((long) data[offset + tailStart + 5] & 0xff) << 40;
+            k1 ^= ((long) data[index + 5] & 0xff) << 40;
         case 5:
-            k1 ^= ((long) data[offset + tailStart + 4] & 0xff) << 32;
+            k1 ^= ((long) data[index + 4] & 0xff) << 32;
         case 4:
-            k1 ^= ((long) data[offset + tailStart + 3] & 0xff) << 24;
+            k1 ^= ((long) data[index + 3] & 0xff) << 24;
         case 3:
-            k1 ^= ((long) data[offset + tailStart + 2] & 0xff) << 16;
+            k1 ^= ((long) data[index + 2] & 0xff) << 16;
         case 2:
-            k1 ^= ((long) data[offset + tailStart + 1] & 0xff) << 8;
+            k1 ^= ((long) data[index + 1] & 0xff) << 8;
         case 1:
-            k1 ^= ((long) data[offset + tailStart] & 0xff);
+            k1 ^= ((long) data[index] & 0xff);
             k1 *= C1;
             k1 = Long.rotateLeft(k1, R1);
             k1 *= C2;
@@ -377,52 +568,76 @@ public final class MurmurHash3 {
     }
 
     /**
-     * Murmur3 128-bit variant.
+     * Generates 128-bit hash from the byte array with a default seed.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 104729;
+     * int hash = hash128(data, 0, data.length, seed);
+     * </pre>
      *
-     * @param data - input byte array
-     * @return - 128 bit hash (2 longs)
+     * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, 
int)} does not effect
+     * this result as the default seed is positive.<p>
+     *
+     * @param data The input byte array
+     * @return The 128-bit hash (2 longs)
+     * @see #hash128(byte[], int, int, int)
      */
     public static long[] hash128(final byte[] data) {
         return hash128(data, 0, data.length, DEFAULT_SEED);
     }
 
     /**
-     * Murmur3 128-bit variant.
+     * Generates 128-bit hash from a string with a default seed.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 104729;
+     * byte[] bytes = data.getBytes();
+     * int hash = hash128(bytes, 0, bytes.length, seed);
+     * </pre>
      *
-     * @param data - input String
-     * @return - 128 bit hash (2 longs)
+     * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, 
int)} does not effect
+     * this result as the default seed is positive.<p>
+     *
+     * @param data The input String
+     * @return The 128-bit hash (2 longs)
+     * @see #hash128(byte[], int, int, int)
      */
     public static long[] hash128(final String data) {
-        final byte[] origin = data.getBytes();
-        return hash128(origin, 0, origin.length, DEFAULT_SEED);
+        final byte[] bytes = data.getBytes();
+        return hash128(bytes, 0, bytes.length, DEFAULT_SEED);
     }
 
     /**
-     * Murmur3 128-bit variant.
+     * Generates 128-bit hash from the byte array with the given offset, 
length and seed.
+     *
+     * <p>This is an implementation of the 128-bit hash function {@code 
MurmurHash3_x64_128}
+     * from from Austin Applyby's original MurmurHash3 {@code c++} code in 
SMHasher.</p>
      *
-     * @param data   - input byte array
-     * @param offset - the first element of array
-     * @param length - length of array
-     * @param seed   - seed. (default is 0)
-     * @return - 128 bit hash (2 longs)
+     * <p>This implementation contains a sign-extension bug in the seed 
initialisation.
+     * This manifests if the seed is negative.<p>
+     *
+     * @param data The input byte array
+     * @param offset The first element of array
+     * @param length The length of array
+     * @param seed The initial seed value
+     * @return The 128-bit hash (2 longs)
      */
     public static long[] hash128(final byte[] data, final int offset, final 
int length, final int seed) {
+        // ************
+        // Note: This fails to apply masking using 0xffffffffL to the seed.
+        // ************
         long h1 = seed;
         long h2 = seed;
         final int nblocks = length >> 4;
 
         // body
         for (int i = 0; i < nblocks; i++) {
-            final int i16 = i << 4;
-            long k1 = ((long) data[offset + i16] & 0xff) | (((long) 
data[offset + i16 + 1] & 0xff) << 8)
-                    | (((long) data[offset + i16 + 2] & 0xff) << 16) | 
(((long) data[offset + i16 + 3] & 0xff) << 24)
-                    | (((long) data[offset + i16 + 4] & 0xff) << 32) | 
(((long) data[offset + i16 + 5] & 0xff) << 40)
-                    | (((long) data[offset + i16 + 6] & 0xff) << 48) | 
(((long) data[offset + i16 + 7] & 0xff) << 56);
-
-            long k2 = ((long) data[offset + i16 + 8] & 0xff) | (((long) 
data[offset + i16 + 9] & 0xff) << 8)
-                    | (((long) data[offset + i16 + 10] & 0xff) << 16) | 
(((long) data[offset + i16 + 11] & 0xff) << 24)
-                    | (((long) data[offset + i16 + 12] & 0xff) << 32) | 
(((long) data[offset + i16 + 13] & 0xff) << 40)
-                    | (((long) data[offset + i16 + 14] & 0xff) << 48) | 
(((long) data[offset + i16 + 15] & 0xff) << 56);
+            final int index = offset + (i << 4);
+            long k1 = getLittleEndianLong(data, index);
+            long k2 = getLittleEndianLong(data, index + 8);
 
             // mix functions for k1
             k1 *= C1;
@@ -446,43 +661,43 @@ public final class MurmurHash3 {
         // tail
         long k1 = 0;
         long k2 = 0;
-        final int tailStart = nblocks << 4;
-        switch (length - tailStart) {
+        final int index = offset + (nblocks << 4);
+        switch (offset + length - index) {
         case 15:
-            k2 ^= (long) (data[offset + tailStart + 14] & 0xff) << 48;
+            k2 ^= ((long) data[index + 14] & 0xff) << 48;
         case 14:
-            k2 ^= (long) (data[offset + tailStart + 13] & 0xff) << 40;
+            k2 ^= ((long) data[index + 13] & 0xff) << 40;
         case 13:
-            k2 ^= (long) (data[offset + tailStart + 12] & 0xff) << 32;
+            k2 ^= ((long) data[index + 12] & 0xff) << 32;
         case 12:
-            k2 ^= (long) (data[offset + tailStart + 11] & 0xff) << 24;
+            k2 ^= ((long) data[index + 11] & 0xff) << 24;
         case 11:
-            k2 ^= (long) (data[offset + tailStart + 10] & 0xff) << 16;
+            k2 ^= ((long) data[index + 10] & 0xff) << 16;
         case 10:
-            k2 ^= (long) (data[offset + tailStart + 9] & 0xff) << 8;
+            k2 ^= ((long) data[index + 9] & 0xff) << 8;
         case 9:
-            k2 ^= data[offset + tailStart + 8] & 0xff;
+            k2 ^= data[index + 8] & 0xff;
             k2 *= C2;
             k2 = Long.rotateLeft(k2, R3);
             k2 *= C1;
             h2 ^= k2;
 
         case 8:
-            k1 ^= (long) (data[offset + tailStart + 7] & 0xff) << 56;
+            k1 ^= ((long) data[index + 7] & 0xff) << 56;
         case 7:
-            k1 ^= (long) (data[offset + tailStart + 6] & 0xff) << 48;
+            k1 ^= ((long) data[index + 6] & 0xff) << 48;
         case 6:
-            k1 ^= (long) (data[offset + tailStart + 5] & 0xff) << 40;
+            k1 ^= ((long) data[index + 5] & 0xff) << 40;
         case 5:
-            k1 ^= (long) (data[offset + tailStart + 4] & 0xff) << 32;
+            k1 ^= ((long) data[index + 4] & 0xff) << 32;
         case 4:
-            k1 ^= (long) (data[offset + tailStart + 3] & 0xff) << 24;
+            k1 ^= ((long) data[index + 3] & 0xff) << 24;
         case 3:
-            k1 ^= (long) (data[offset + tailStart + 2] & 0xff) << 16;
+            k1 ^= ((long) data[index + 2] & 0xff) << 16;
         case 2:
-            k1 ^= (long) (data[offset + tailStart + 1] & 0xff) << 8;
+            k1 ^= ((long) data[index + 1] & 0xff) << 8;
         case 1:
-            k1 ^= data[offset + tailStart] & 0xff;
+            k1 ^= data[index] & 0xff;
             k1 *= C1;
             k1 = Long.rotateLeft(k1, R1);
             k1 *= C2;
@@ -505,6 +720,46 @@ public final class MurmurHash3 {
         return new long[] { h1, h2 };
     }
 
+
+    /**
+     * Gets the little-endian long from 8 bytes starting at the specified 
index.
+     *
+     * @param data The data
+     * @param index The index
+     * @return The little-endian long
+     */
+    private static long getLittleEndianLong(final byte[] data, final int 
index) {
+        return (((long) data[index    ] & 0xff)      ) |
+               (((long) data[index + 1] & 0xff) <<  8) |
+               (((long) data[index + 2] & 0xff) << 16) |
+               (((long) data[index + 3] & 0xff) << 24) |
+               (((long) data[index + 4] & 0xff) << 32) |
+               (((long) data[index + 5] & 0xff) << 40) |
+               (((long) data[index + 6] & 0xff) << 48) |
+               (((long) data[index + 7] & 0xff) << 56);
+    }
+
+    /**
+     * Gets the little-endian int from 4 bytes starting at the specified index.
+     *
+     * @param data The data
+     * @param index The index
+     * @return The little-endian int
+     */
+    private static int getLittleEndianInt(final byte[] data, final int index) {
+        return ((data[index    ] & 0xff)      ) |
+               ((data[index + 1] & 0xff) <<  8) |
+               ((data[index + 2] & 0xff) << 16) |
+               ((data[index + 3] & 0xff) << 24);
+    }
+
+    /**
+     * Perform the intermediate mix step of the 32-bit hash function {@code 
MurmurHash3_x86_32}.
+     *
+     * @param k The data to add to the hash
+     * @param hash The current hash
+     * @return The new hash
+     */
     private static int mix32(int k, int hash) {
         k *= C1_32;
         k = Integer.rotateLeft(k, R1_32);
@@ -513,104 +768,172 @@ public final class MurmurHash3 {
         return Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
     }
 
-    private static int fmix32(final int length, int hash) {
-        hash ^= length;
+    /**
+     * Perform the final avalanche mix step of the 32-bit hash function {@code 
MurmurHash3_x86_32}.
+     *
+     * @param hash The current hash
+     * @return The final hash
+     */
+    private static int fmix32(int hash) {
         hash ^= (hash >>> 16);
         hash *= 0x85ebca6b;
         hash ^= (hash >>> 13);
         hash *= 0xc2b2ae35;
         hash ^= (hash >>> 16);
-
         return hash;
     }
 
-    private static long fmix64(long h) {
-        h ^= (h >>> 33);
-        h *= 0xff51afd7ed558ccdL;
-        h ^= (h >>> 33);
-        h *= 0xc4ceb9fe1a85ec53L;
-        h ^= (h >>> 33);
-        return h;
+    /**
+     * Perform the final avalanche mix step of the 64-bit hash function {@code 
MurmurHash3_x64_128}.
+     *
+     * @param hash The current hash
+     * @return The final hash
+     */
+    private static long fmix64(long hash) {
+        hash ^= (hash >>> 33);
+        hash *= 0xff51afd7ed558ccdL;
+        hash ^= (hash >>> 33);
+        hash *= 0xc4ceb9fe1a85ec53L;
+        hash ^= (hash >>> 33);
+        return hash;
     }
 
+    /**
+     * Generates 32-bit hash from input bytes. Bytes can be added 
incrementally and the new
+     * hash computed.
+     *
+     * <p>This is an implementation of the 32-bit hash function {@code 
MurmurHash3_x86_32}
+     * from from Austin Applyby's original MurmurHash3 {@code c++} code in 
SMHasher.</p>
+     *
+     * <p>This implementation contains a sign-extension bug in the 
finalisation step of
+     * any bytes left over from dividing the length by 4. This manifests if 
any of these
+     * bytes are negative.<p>
+     */
     public static class IncrementalHash32 {
-        byte[] tail = new byte[3];
-        int tailLen;
-        int totalLen;
-        int hash;
-
-        public final void start(final int hash) {
-            tailLen = totalLen = 0;
-            this.hash = hash;
+        /** The size of byte blocks that are processed together. */
+        private static final int BLOCK_SIZE = 4;
+
+        /** Up to 3 unprocessed bytes from input data. */
+        private final byte[] unprocessed = new byte[3];
+        /** The number of unprocessed bytes in the tail data. */
+        private int unprocessedLength;
+        /** The total number of input bytes added since the start. */
+        private int totalLen;
+        /**
+         * The current running hash. 
+         * This must be finalised to generate the 32-bit hash value.
+         */
+        private int hash;
+
+        /**
+         * Start a new incremental hash.
+         *
+         * @param seed The initial seed value
+         */
+        public final void start(final int seed) {
+            // Reset
+            unprocessedLength = totalLen = 0;
+            this.hash = seed;
         }
 
-        public final void add(final byte[] data, int offset, final int length) 
{
+        /**
+         * Adds the byte array to the current incremental hash.
+         *
+         * @param data The input byte array
+         * @param offset The offset of data
+         * @param length The length of array
+         */
+        public final void add(final byte[] data, final int offset, final int 
length) {
             if (length == 0) {
+                // Nothing to add
                 return;
             }
             totalLen += length;
-            if (tailLen + length < 4) {
-                System.arraycopy(data, offset, tail, tailLen, length);
-                tailLen += length;
+
+            // Process the bytes in blocks of 4.
+            // New bytes must be added to any current unprocessed bytes,
+            // then processed in blocks of 4 and the remaining bytes saved:
+            //
+            //    |--|---------------------------|--|
+            // unprocessed
+            //                main block
+            //                                remaining
+
+            // Check if the unprocessed bytes and new bytes can fill a block 
of 4
+            if (unprocessedLength + length < BLOCK_SIZE) {
+                // Not enough so add to the unprocessed bytes
+                System.arraycopy(data, offset, unprocessed, unprocessedLength, 
length);
+                unprocessedLength += length;
                 return;
             }
-            int offset2 = 0;
-            if (tailLen > 0) {
-                offset2 = (4 - tailLen);
+
+            // Combine unprocessed bytes with new bytes.
+            int newOffset;
+            int newLength;
+            if (unprocessedLength > 0) {
                 int k = -1;
-                switch (tailLen) {
+                switch (unprocessedLength) {
                 case 1:
-                    k = orBytes(tail[0], data[offset], data[offset + 1], 
data[offset + 2]);
+                    k = orBytes(unprocessed[0], data[offset], data[offset + 
1], data[offset + 2]);
                     break;
                 case 2:
-                    k = orBytes(tail[0], tail[1], data[offset], data[offset + 
1]);
+                    k = orBytes(unprocessed[0], unprocessed[1], data[offset], 
data[offset + 1]);
                     break;
                 case 3:
-                    k = orBytes(tail[0], tail[1], tail[2], data[offset]);
+                    k = orBytes(unprocessed[0], unprocessed[1], 
unprocessed[2], data[offset]);
                     break;
                 default:
-                    throw new AssertionError(tailLen);
+                    throw new AssertionError("Unprocessed length should be 1, 
2, or 3: " + unprocessedLength);
                 }
-                // mix functions
-                k *= C1_32;
-                k = Integer.rotateLeft(k, R1_32);
-                k *= C2_32;
-                hash ^= k;
-                hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+                hash = mix32(k, hash);
+                // Update the offset and length
+                final int consumed = BLOCK_SIZE - unprocessedLength;
+                newOffset = offset + consumed;
+                newLength = length - consumed;
+            } else {
+                newOffset = offset;
+                newLength = length;
             }
-            final int length2 = length - offset2;
-            offset += offset2;
-            final int nblocks = length2 >> 2;
 
-            for (int i = 0; i < nblocks; i++) {
-                final int i_4 = (i << 2) + offset;
-                int k = orBytes(data[i_4], data[i_4 + 1], data[i_4 + 2], 
data[i_4 + 3]);
+            // Main processing of blocks of 4 bytes
+            final int nblocks = newLength >> 2;
 
-                // mix functions
-                k *= C1_32;
-                k = Integer.rotateLeft(k, R1_32);
-                k *= C2_32;
-                hash ^= k;
-                hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
+            for (int i = 0; i < nblocks; i++) {
+                final int index = newOffset + (i << 2);
+                final int k = getLittleEndianInt(data, index);
+                hash = mix32(k, hash);
             }
 
+            // Save left-over unprocessed bytes
             final int consumed = (nblocks << 2);
-            tailLen = length2 - consumed;
-            if (consumed == length2) {
-                return;
+            unprocessedLength = newLength - consumed;
+            if (unprocessedLength != 0) {
+                System.arraycopy(data, newOffset + consumed, unprocessed, 0, 
unprocessedLength);
             }
-            System.arraycopy(data, offset + consumed, tail, 0, tailLen);
         }
 
+        /**
+         * Generate the 32-bit hash value. Repeat calls to this method with no 
additional data
+         * will generate the same hash value.
+         *
+         * <p>This implementation contains a sign-extension bug in the 
finalisation step of
+         * any bytes left over from dividing the length by 4. This manifests 
if any of these
+         * bytes are negative.<p>
+         *
+         * @return The 32-bit hash
+         */
         public final int end() {
+            // ************
+            // Note: This fails to apply masking using 0xff to the 3 remaining 
bytes.
+            // ************
             int k1 = 0;
-            switch (tailLen) {
+            switch (unprocessedLength) {
             case 3:
-                k1 ^= tail[2] << 16;
+                k1 ^= unprocessed[2] << 16;
             case 2:
-                k1 ^= tail[1] << 8;
+                k1 ^= unprocessed[1] << 8;
             case 1:
-                k1 ^= tail[0];
+                k1 ^= unprocessed[0];
 
                 // mix functions
                 k1 *= C1_32;
@@ -621,16 +944,22 @@ public final class MurmurHash3 {
 
             // finalization
             hash ^= totalLen;
-            hash ^= (hash >>> 16);
-            hash *= 0x85ebca6b;
-            hash ^= (hash >>> 13);
-            hash *= 0xc2b2ae35;
-            hash ^= (hash >>> 16);
-            return hash;
+            return fmix32(hash);
         }
-    }
 
-    private static int orBytes(final byte b1, final byte b2, final byte b3, 
final byte b4) {
-        return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 
0xff) << 24);
+        /**
+         * Combine the bytes using an Or operation ({@code | } in a 
little-endian representation
+         * of a 32-bit integer; byte 1 will be the least significant byte, 
byte 4 the most
+         * significant.
+         *
+         * @param b1 The first byte
+         * @param b2 The second byte
+         * @param b3 The third byte
+         * @param b4 The fourth byte
+         * @return The 32-bit integer
+         */
+        private static int orBytes(final byte b1, final byte b2, final byte 
b3, final byte b4) {
+            return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | 
((b4 & 0xff) << 24);
+        }
     }
 }

[commons-codec] 01/07: Fully documented MurmurHash3.

Reply via email to