Author: gsingers
Date: Wed Nov 2 14:57:44 2011
New Revision: 1196616
URL: http://svn.apache.org/viewvc?rev=1196616&view=rev
Log:
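Lop off some stop words (seq2sparse --maxDFPercent 85 in build-reuters.sh) so that we get better clusters; also add MurmurHash3 and a MURMUR3 hash type for MinHash clustering.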
Added:
mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
mahout/trunk/examples/bin/build-reuters.sh
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java?rev=1196616&r1=1196615&r2=1196616&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
Wed Nov 2 14:57:44 2011
@@ -18,6 +18,7 @@ package org.apache.mahout.clustering.min
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.MurmurHash;
+import org.apache.mahout.math.MurmurHash3;
import java.util.Random;
@@ -27,7 +28,7 @@ public final class HashFactory {
}
public enum HashType {
- LINEAR, POLYNOMIAL, MURMUR
+ LINEAR, POLYNOMIAL, MURMUR, MURMUR3
}
public static HashFunction[] createHashFunctions(HashType type, int
numFunctions) {
@@ -49,6 +50,11 @@ public final class HashFactory {
hashFunction[i] = new MurmurHashWrapper(seed.nextInt());
}
break;
+ case MURMUR3:
+ for (int i = 0; i < numFunctions; i++) {
+ hashFunction[i] = new MurmurHash3Wrapper(seed.nextInt());
+ }
+ break;
default:
throw new IllegalStateException("Unknown type: " + type);
}
@@ -111,4 +117,18 @@ public final class HashFactory {
return Math.abs((int) (hashValue %
RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
}
}
+
+  /** Adapts {@link MurmurHash3#murmurhash3_x86_32} to {@link HashFunction} using a fixed seed. */
+  static class MurmurHash3Wrapper implements HashFunction {
+    private final int seed;
+    MurmurHash3Wrapper(int seed) {
+      this.seed = seed;
+    }
+    /** Hashes all of {@code bytes} with the seed, then applies the modulus and {@code Math.abs}. */
+    @Override
+    public int hash(byte[] bytes) {
+      long hashValue = MurmurHash3.murmurhash3_x86_32(bytes, 0, bytes.length, seed);
+      return Math.abs((int) (hashValue % RandomUtils.MAX_INT_SMALLER_TWIN_PRIME));
+    }
+  }
}
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java?rev=1196616&r1=1196615&r2=1196616&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
Wed Nov 2 14:57:44 2011
@@ -170,5 +170,13 @@ public class TestMinHashClustering exten
assertEquals("Minhash MR Job failed for " + HashType.MURMUR, 0, ret);
verify(output, 0.3, "Hash Type: MURMUR");
}
+
+  @Test // runs MinHashDriver with the MURMUR3 hash type, then passes the output to verify()
+  public void testMurmur3MinHashMRJob() throws Exception {
+    String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR3.toString());
+    int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
+    assertEquals("Minhash MR Job failed for " + HashType.MURMUR3, 0, ret);
+    verify(output, 0.3, "Hash Type: MURMUR3"); // label must name the hash type under test
+  }
}
\ No newline at end of file
Modified: mahout/trunk/examples/bin/build-reuters.sh
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1196616&r1=1196615&r2=1196616&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Wed Nov 2 14:57:44 2011
@@ -93,7 +93,7 @@ fi
if [ "x$clustertype" == "xkmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans --maxDFPercent 85 \
&& \
$MAHOUT kmeans \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
@@ -109,7 +109,7 @@ if [ "x$clustertype" == "xkmeans" ]; the
elif [ "x$clustertype" == "xfuzzykmeans" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans --maxDFPercent 85 \
&& \
$MAHOUT fkmeans \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-fkmeans/tfidf-vectors/ \
@@ -139,7 +139,7 @@ elif [ "x$clustertype" == "xlda" ]; then
elif [ "x$clustertype" == "xdirichlet" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet --maxDFPercent 85 \
&& \
$MAHOUT dirichlet \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-dirichlet/tfidf-vectors \
@@ -155,7 +155,7 @@ elif [ "x$clustertype" == "xdirichlet" ]
elif [ "x$clustertype" == "xminhash" ]; then
$MAHOUT seq2sparse \
-i ${WORK_DIR}/reuters-out-seqdir/ \
- -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash \
+ -o ${WORK_DIR}/reuters-out-seqdir-sparse-minhash --maxDFPercent 85 \
&& \
$MAHOUT org.apache.mahout.clustering.minhash.MinHashDriver \
-i ${WORK_DIR}/reuters-out-seqdir-sparse-minhash/tfidf-vectors \
Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java?rev=1196616&view=auto
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java
(added)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/MurmurHash3.java Wed
Nov 2 14:57:44 2011
@@ -0,0 +1,80 @@
+package org.apache.mahout.math;
+/**
+ * This code is public domain.
+ *
+ * The MurmurHash3 algorithm was created by Austin Appleby and put into the public domain. See http://code.google.com/p/smhasher/
+ *
+ * This Java port was authored by
+ * Yonik Seeley and was placed into the public domain per https://github.com/yonik/java_util/blob/master/src/util/hash/MurmurHash3.java.
+ */
+
+//
+
+/**
+ * Java port of the MurmurHash3 hash. This produces exactly the same hash values as the final
+ * C++ version of MurmurHash3 and is thus suitable for producing the same hash values across
+ * platforms.
+ * <p>
+ * The 32 bit x86 version of this hash should be the fastest variant for relatively short keys
+ * like ids. Note - The x86 and x64 versions do _not_ produce the same results, as the
+ * algorithms are optimized for their respective platforms.
+ * <p>
+ * See also http://github.com/yonik/java_util for future updates to this file.
+ */
+public class MurmurHash3 {
+
+  /**
+   * Returns the MurmurHash3_x86_32 hash.
+   *
+   * @param data   array containing the bytes to hash
+   * @param offset index in {@code data} of the first byte to hash
+   * @param len    number of bytes to hash, starting at {@code offset}
+   * @param seed   starting value of the hash state
+   * @return the 32-bit hash of the selected bytes
+   */
+  public static int murmurhash3_x86_32(byte[] data, int offset, int len, int seed) {
+    final int c1 = 0xcc9e2d51;
+    final int c2 = 0x1b873593;
+
+    int h1 = seed;
+    int roundedEnd = offset + (len & 0xfffffffc); // round down to 4 byte block
+
+    for (int i = offset; i < roundedEnd; i += 4) {
+      // little endian load order; the top byte needs no mask, << 24 shifts out its sign extension
+      int k1 = (data[i] & 0xff) | ((data[i + 1] & 0xff) << 8) | ((data[i + 2] & 0xff) << 16) | (data[i + 3] << 24);
+      k1 *= c1;
+      k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
+      k1 *= c2;
+      h1 ^= k1;
+      h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13);
+      h1 = h1 * 5 + 0xe6546b64;
+    }
+
+    // tail: mix in the 1-3 bytes that remain after the last full 4 byte block
+    int k1 = 0;
+    switch (len & 0x03) {
+      case 3:
+        k1 = (data[roundedEnd + 2] & 0xff) << 16;
+        // fallthrough
+      case 2:
+        k1 |= (data[roundedEnd + 1] & 0xff) << 8;
+        // fallthrough
+      case 1:
+        k1 |= (data[roundedEnd] & 0xff);
+        k1 *= c1;
+        k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15);
+        k1 *= c2;
+        h1 ^= k1;
+    }
+
+    // finalization
+    h1 ^= len;
+    // fmix(h1);
+    h1 ^= h1 >>> 16;
+    h1 *= 0x85ebca6b;
+    h1 ^= h1 >>> 13;
+    h1 *= 0xc2b2ae35;
+    h1 ^= h1 >>> 16;
+    return h1;
+  }
+}
\ No newline at end of file