Author: srowen
Date: Thu Dec 16 14:20:00 2010
New Revision: 1049984
URL: http://svn.apache.org/viewvc?rev=1049984&view=rev
Log:
MAHOUT-565
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
Thu Dec 16 14:20:00 2010
@@ -32,7 +32,7 @@ public class HashFactory {
public static HashFunction[] createHashFunctions(HashType type, int
numFunctions) {
HashFunction[] hashFunction = new HashFunction[numFunctions];
- Random seed = new Random(11);
+ Random seed = RandomUtils.getRandom(11);
switch (type) {
case LINEAR:
for (int i = 0; i < numFunctions; i++) {
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
Thu Dec 16 14:20:00 2010
@@ -30,10 +30,10 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
-public class MinHashMapper extends Mapper<Text, Writable, Text, Writable> {
-
+public class MinHashMapper extends Mapper<Text,Writable,Text,Writable> {
+
private static final Logger log =
LoggerFactory.getLogger(MinHashMapper.class);
-
+
private HashFunction[] hashFunction;
private int numHashFunctions;
private int keyGroups;
@@ -41,9 +41,9 @@ public class MinHashMapper extends Mappe
private boolean debugOutput;
private int[] minHashValues;
private byte[] bytesToHash;
-
+
@Override
- protected void setup(Context context) throws IOException,
InterruptedException {
+ protected void setup(Context context) throws IOException,
InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
this.numHashFunctions =
conf.getInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, 10);
@@ -53,7 +53,7 @@ public class MinHashMapper extends Mappe
this.minVectorSize = conf.getInt(MinhashOptionCreator.MIN_VECTOR_SIZE, 5);
String htype = conf.get(MinhashOptionCreator.HASH_TYPE, "linear");
this.debugOutput = conf.getBoolean(MinhashOptionCreator.DEBUG_OUTPUT,
false);
-
+
HashType hashType;
try {
hashType = HashType.valueOf(htype);
@@ -63,13 +63,13 @@ public class MinHashMapper extends Mappe
}
hashFunction = HashFactory.createHashFunctions(hashType, numHashFunctions);
}
-
+
/**
- * Hash all items with each function and retain min. value for each
iteration.
- * We up with X number of minhash signatures.
+ * Hash all items with each function and retain min. value for each
iteration. We up with X number of
+ * minhash signatures.
*
- * Now depending upon the number of key-groups (1 - 4) concatenate that many
- * minhash values to form cluster-id as 'key' and item-id as 'value'
+ * Now depending upon the number of key-groups (1 - 4) concatenate that many
minhash values to form
+ * cluster-id as 'key' and item-id as 'value'
*/
@Override
public void map(Text item, Writable features, Context context) throws
IOException, InterruptedException {
@@ -81,6 +81,7 @@ public class MinHashMapper extends Mappe
for (int i = 0; i < numHashFunctions; i++) {
minHashValues[i] = Integer.MAX_VALUE;
}
+
for (int i = 0; i < numHashFunctions; i++) {
for (Vector.Element ele : featureVector) {
int value = (int) ele.get();
@@ -95,10 +96,10 @@ public class MinHashMapper extends Mappe
}
}
// output the cluster information
- for (int i = 0; i < numHashFunctions; i += keyGroups) {
+ for (int i = 0; i < numHashFunctions; i++) {
StringBuilder clusterIdBuilder = new StringBuilder();
- for (int j = 0; j < keyGroups && (i + j) < numHashFunctions; j++) {
- clusterIdBuilder.append(minHashValues[i + j]).append('-');
+ for (int j = 0; j < keyGroups; j++) {
+ clusterIdBuilder.append(minHashValues[(i + j) %
numHashFunctions]).append('-');
}
String clusterId = clusterIdBuilder.toString();
clusterId = clusterId.substring(0, clusterId.lastIndexOf('-'));
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
Thu Dec 16 14:20:00 2010
@@ -29,13 +29,14 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
-public class MinHashReducer extends Reducer<Text, Writable, Text, Writable> {
-
+public class MinHashReducer extends Reducer<Text,Writable,Text,Writable> {
+
private int minClusterSize;
private boolean debugOutput;
-
+
enum Clusters {
- Accepted, Discarded
+ Accepted,
+ Discarded
}
@Override
@@ -45,13 +46,13 @@ public class MinHashReducer extends Redu
this.minClusterSize = conf.getInt(MinhashOptionCreator.MIN_CLUSTER_SIZE,
5);
this.debugOutput = conf.getBoolean(MinhashOptionCreator.DEBUG_OUTPUT,
false);
}
-
+
/**
* output the items clustered
*/
@Override
- protected void reduce(Text cluster, Iterable<Writable> points, Context
context)
- throws IOException, InterruptedException {
+ protected void reduce(Text cluster, Iterable<Writable> points, Context
context) throws IOException,
+
InterruptedException {
Collection<Writable> pointList = new ArrayList<Writable>();
for (Writable point : points) {
if (debugOutput) {
@@ -72,5 +73,5 @@ public class MinHashReducer extends Redu
context.getCounter(Clusters.Discarded).increment(1);
}
}
-
+
}
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
Thu Dec 16 14:20:00 2010
@@ -40,16 +40,16 @@ import java.util.List;
import java.util.Set;
public class TestMinHashClustering extends MahoutTestCase {
-
- public static final double[][] REFERENCE = { { 1, 2, 3, 4, 5 },
- { 2, 1, 3, 6, 7 }, { 3, 7, 6, 11, 8, 9 }, { 4, 7, 8, 9, 6, 1 },
- { 5, 8, 10, 4, 1 }, { 6, 17, 14, 15 }, { 8, 9, 11, 6, 12, 1, 7 },
- { 10, 13, 9, 7, 4, 6, 3 }, { 3, 5, 7, 9, 2, 11 }, { 13, 7, 6, 8, 5 } };
-
+
+ public static final double[][] REFERENCE = { {1, 2, 3, 4, 5}, {2, 1, 3, 6,
7}, {3, 7, 6, 11, 8, 9},
+ {4, 7, 8, 9, 6, 1}, {5, 8, 10,
4, 1}, {6, 17, 14, 15},
+ {8, 9, 11, 6, 12, 1, 7}, {10,
13, 9, 7, 4, 6, 3},
+ {3, 5, 7, 9, 2, 11}, {13, 7, 6,
8, 5}};
+
private FileSystem fs;
private Path input;
private Path output;
-
+
public static List<VectorWritable> getPointsWritable(double[][] raw) {
List<VectorWritable> points = new ArrayList<VectorWritable>();
for (double[] fr : raw) {
@@ -59,7 +59,7 @@ public class TestMinHashClustering exten
}
return points;
}
-
+
@Override
public void setUp() throws Exception {
super.setUp();
@@ -69,31 +69,31 @@ public class TestMinHashClustering exten
input = getTestTempDirPath("points");
output = new Path(getTestTempDirPath(), "output");
Path pointFile = new Path(input, "file1");
- SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, pointFile,
Text.class, VectorWritable.class);
+ SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, pointFile,
Text.class,
+ VectorWritable.class);
int id = 0;
for (VectorWritable point : points) {
writer.append(new Text("Id-" + id++), point);
}
writer.close();
}
-
+
private String[] makeArguments(int minClusterSize,
int minVectorSize,
int numHashFunctions,
int keyGroups,
- String hashType){
- return new String[] {
- optKey(DefaultOptionCreator.INPUT_OPTION), input.toString(),
- optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
- optKey(MinhashOptionCreator.MIN_CLUSTER_SIZE),
String.valueOf(minClusterSize),
- optKey(MinhashOptionCreator.MIN_VECTOR_SIZE),
String.valueOf(minVectorSize),
- optKey(MinhashOptionCreator.HASH_TYPE), hashType,
- optKey(MinhashOptionCreator.NUM_HASH_FUNCTIONS),
String.valueOf(numHashFunctions),
- optKey(MinhashOptionCreator.KEY_GROUPS), String.valueOf(keyGroups),
- optKey(MinhashOptionCreator.NUM_REDUCERS), "1",
- optKey(MinhashOptionCreator.DEBUG_OUTPUT), "true"};
+ String hashType) {
+ return new String[] {optKey(DefaultOptionCreator.INPUT_OPTION),
input.toString(),
+ optKey(DefaultOptionCreator.OUTPUT_OPTION),
output.toString(),
+ optKey(MinhashOptionCreator.MIN_CLUSTER_SIZE),
String.valueOf(minClusterSize),
+ optKey(MinhashOptionCreator.MIN_VECTOR_SIZE),
String.valueOf(minVectorSize),
+ optKey(MinhashOptionCreator.HASH_TYPE), hashType,
+ optKey(MinhashOptionCreator.NUM_HASH_FUNCTIONS),
String.valueOf(numHashFunctions),
+ optKey(MinhashOptionCreator.KEY_GROUPS),
String.valueOf(keyGroups),
+ optKey(MinhashOptionCreator.NUM_REDUCERS), "1",
+ optKey(MinhashOptionCreator.DEBUG_OUTPUT), "true"};
}
-
+
private static Set<Integer> getValues(Vector vector) {
Iterator<Vector.Element> itr = vector.iterator();
Set<Integer> values = new HashSet<Integer>();
@@ -102,8 +102,28 @@ public class TestMinHashClustering exten
}
return values;
}
-
- private void verify(Path output) throws Exception {
+
+ private void runPairwiseSimilarity(List<Vector> clusteredItems, double
simThreshold, String msg) {
+ if (clusteredItems.size() > 1) {
+ for (int i = 0; i < clusteredItems.size(); i++) {
+ Set<Integer> itemSet1 = getValues(clusteredItems.get(i));
+ for (int j = i + 1; j < clusteredItems.size(); j++) {
+ Set<Integer> itemSet2 = getValues(clusteredItems.get(j));
+ Set<Integer> union = new HashSet<Integer>();
+ union.addAll(itemSet1);
+ union.addAll(itemSet2);
+ Collection<Integer> intersect = new HashSet<Integer>();
+ intersect.addAll(itemSet1);
+ intersect.retainAll(itemSet2);
+ double similarity = intersect.size() / (double) union.size();
+ assertTrue(msg + " - Sets failed min similarity test, Set1: " +
itemSet1 + " Set2: " + itemSet2
+ + ", similarity:" + similarity, similarity >=
simThreshold);
+ }
+ }
+ }
+ }
+
+ private void verify(Path output, double simThreshold, String msg) throws
Exception {
Configuration conf = new Configuration();
Path outputFile = new Path(output, "part-r-00000");
SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, conf);
@@ -115,55 +135,37 @@ public class TestMinHashClustering exten
if (prevClusterId.equals(clusterId.toString())) {
clusteredItems.add(point.get().clone());
} else {
- if (clusteredItems.size() > 1) {
- // run pair-wise similarity test on items in a cluster
- for (int i = 0; i < clusteredItems.size(); i++) {
- Set<Integer> itemSet1 = getValues(clusteredItems.get(i));
- for (int j = i + 1; j < clusteredItems.size(); j++) {
- Set<Integer> itemSet2 = getValues(clusteredItems.get(j));
- Set<Integer> union = new HashSet<Integer>();
- union.addAll(itemSet1);
- union.addAll(itemSet2);
- Collection<Integer> intersect = new HashSet<Integer>();
- intersect.addAll(itemSet1);
- intersect.retainAll(itemSet2);
- double similarity = intersect.size() / (double) union.size();
- assertTrue("Sets failed min similarity test, Set1: "
- + itemSet1 + " Set2: " + itemSet2 + ", similarity:" +
similarity, similarity > 0.4);
- }
- }
- }
+ runPairwiseSimilarity(clusteredItems, simThreshold, msg);
clusteredItems.clear();
prevClusterId = clusterId.toString();
+ clusteredItems.add(point.get().clone());
}
}
+ runPairwiseSimilarity(clusteredItems, simThreshold, msg);
}
-
+
@Test
public void testLinearMinHashMRJob() throws Exception {
- String[] args = makeArguments(2, 3, 20, 4, HashType.LINEAR.toString());
+ String[] args = makeArguments(2, 3, 20, 3, HashType.LINEAR.toString());
int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
assertEquals("Minhash MR Job failed for " + HashType.LINEAR.toString(), 0,
ret);
- System.out.println("Verifying linear hash results");
- verify(output);
+ verify(output, 0.2, "Hash Type: LINEAR");
}
-
+
@Test
public void testPolynomialMinHashMRJob() throws Exception {
- String[] args = makeArguments(2, 3, 20, 4, HashType.POLYNOMIAL.toString());
+ String[] args = makeArguments(2, 3, 20, 3, HashType.POLYNOMIAL.toString());
int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
assertEquals("Minhash MR Job failed for " +
HashType.POLYNOMIAL.toString(), 0, ret);
- System.out.println("Verifying linear hash results");
- verify(output);
+ verify(output, 0.3, "Hash Type: POLYNOMIAL");
}
-
+
@Test
public void testMurmurMinHashMRJob() throws Exception {
String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR.toString());
int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
assertEquals("Minhash MR Job failed for " + HashType.MURMUR.toString(), 0,
ret);
- System.out.println("verifying murmur hash results");
- verify(output);
+ verify(output, 0.3, "Hash Type: MURMUR");
}
-
-}
+
+}
\ No newline at end of file