Author: srowen
Date: Thu Dec 16 14:20:00 2010
New Revision: 1049984

URL: http://svn.apache.org/viewvc?rev=1049984&view=rev
Log:
MAHOUT-565

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/HashFactory.java
 Thu Dec 16 14:20:00 2010
@@ -32,7 +32,7 @@ public class HashFactory {
 
   public static HashFunction[] createHashFunctions(HashType type, int 
numFunctions) {
     HashFunction[] hashFunction = new HashFunction[numFunctions];
-    Random seed = new Random(11);
+    Random seed = RandomUtils.getRandom(11);
     switch (type) {
       case LINEAR:
         for (int i = 0; i < numFunctions; i++) {

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashMapper.java
 Thu Dec 16 14:20:00 2010
@@ -30,10 +30,10 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 
-public class MinHashMapper extends Mapper<Text, Writable, Text, Writable> {
-
+public class MinHashMapper extends Mapper<Text,Writable,Text,Writable> {
+  
   private static final Logger log = 
LoggerFactory.getLogger(MinHashMapper.class);
-
+  
   private HashFunction[] hashFunction;
   private int numHashFunctions;
   private int keyGroups;
@@ -41,9 +41,9 @@ public class MinHashMapper extends Mappe
   private boolean debugOutput;
   private int[] minHashValues;
   private byte[] bytesToHash;
-
+  
   @Override
-  protected void setup(Context context) throws IOException,  
InterruptedException {
+  protected void setup(Context context) throws IOException, 
InterruptedException {
     super.setup(context);
     Configuration conf = context.getConfiguration();
     this.numHashFunctions = 
conf.getInt(MinhashOptionCreator.NUM_HASH_FUNCTIONS, 10);
@@ -53,7 +53,7 @@ public class MinHashMapper extends Mappe
     this.minVectorSize = conf.getInt(MinhashOptionCreator.MIN_VECTOR_SIZE, 5);
     String htype = conf.get(MinhashOptionCreator.HASH_TYPE, "linear");
     this.debugOutput = conf.getBoolean(MinhashOptionCreator.DEBUG_OUTPUT, 
false);
-
+    
     HashType hashType;
     try {
       hashType = HashType.valueOf(htype);
@@ -63,13 +63,13 @@ public class MinHashMapper extends Mappe
     }
     hashFunction = HashFactory.createHashFunctions(hashType, numHashFunctions);
   }
-
+  
   /**
-   * Hash all items with each function and retain min. value for each 
iteration.
-   * We up with X number of minhash signatures.
+   * Hash all items with each function and retain min. value for each 
iteration. We up with X number of
+   * minhash signatures.
    * 
-   * Now depending upon the number of key-groups (1 - 4) concatenate that many
-   * minhash values to form cluster-id as 'key' and item-id as 'value'
+   * Now depending upon the number of key-groups (1 - 4) concatenate that many 
minhash values to form
+   * cluster-id as 'key' and item-id as 'value'
    */
   @Override
   public void map(Text item, Writable features, Context context) throws 
IOException, InterruptedException {
@@ -81,6 +81,7 @@ public class MinHashMapper extends Mappe
     for (int i = 0; i < numHashFunctions; i++) {
       minHashValues[i] = Integer.MAX_VALUE;
     }
+    
     for (int i = 0; i < numHashFunctions; i++) {
       for (Vector.Element ele : featureVector) {
         int value = (int) ele.get();
@@ -95,10 +96,10 @@ public class MinHashMapper extends Mappe
       }
     }
     // output the cluster information
-    for (int i = 0; i < numHashFunctions; i += keyGroups) {
+    for (int i = 0; i < numHashFunctions; i++) {
       StringBuilder clusterIdBuilder = new StringBuilder();
-      for (int j = 0; j < keyGroups && (i + j) < numHashFunctions; j++) {
-        clusterIdBuilder.append(minHashValues[i + j]).append('-');
+      for (int j = 0; j < keyGroups; j++) {
+        clusterIdBuilder.append(minHashValues[(i + j) % 
numHashFunctions]).append('-');
       }
       String clusterId = clusterIdBuilder.toString();
       clusterId = clusterId.substring(0, clusterId.lastIndexOf('-'));

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/minhash/MinHashReducer.java
 Thu Dec 16 14:20:00 2010
@@ -29,13 +29,14 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 
-public class MinHashReducer extends Reducer<Text, Writable, Text, Writable> {
-
+public class MinHashReducer extends Reducer<Text,Writable,Text,Writable> {
+  
   private int minClusterSize;
   private boolean debugOutput;
-
+  
   enum Clusters {
-    Accepted, Discarded
+    Accepted,
+    Discarded
   }
   
   @Override
@@ -45,13 +46,13 @@ public class MinHashReducer extends Redu
     this.minClusterSize = conf.getInt(MinhashOptionCreator.MIN_CLUSTER_SIZE, 
5);
     this.debugOutput = conf.getBoolean(MinhashOptionCreator.DEBUG_OUTPUT, 
false);
   }
-
+  
   /**
    * output the items clustered
    */
   @Override
-  protected void reduce(Text cluster, Iterable<Writable> points, Context 
context)
-    throws IOException, InterruptedException {
+  protected void reduce(Text cluster, Iterable<Writable> points, Context 
context) throws IOException,
+                                                                               
  InterruptedException {
     Collection<Writable> pointList = new ArrayList<Writable>();
     for (Writable point : points) {
       if (debugOutput) {
@@ -72,5 +73,5 @@ public class MinHashReducer extends Redu
       context.getCounter(Clusters.Discarded).increment(1);
     }
   }
-
+  
 }

Modified: 
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java?rev=1049984&r1=1049983&r2=1049984&view=diff
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
 (original)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/minhash/TestMinHashClustering.java
 Thu Dec 16 14:20:00 2010
@@ -40,16 +40,16 @@ import java.util.List;
 import java.util.Set;
 
 public class TestMinHashClustering extends MahoutTestCase {
-
-  public static final double[][] REFERENCE = { { 1, 2, 3, 4, 5 },
-      { 2, 1, 3, 6, 7 }, { 3, 7, 6, 11, 8, 9 }, { 4, 7, 8, 9, 6, 1 },
-      { 5, 8, 10, 4, 1 }, { 6, 17, 14, 15 }, { 8, 9, 11, 6, 12, 1, 7 },
-      { 10, 13, 9, 7, 4, 6, 3 }, { 3, 5, 7, 9, 2, 11 }, { 13, 7, 6, 8, 5 } };
-
+  
+  public static final double[][] REFERENCE = { {1, 2, 3, 4, 5}, {2, 1, 3, 6, 
7}, {3, 7, 6, 11, 8, 9},
+                                              {4, 7, 8, 9, 6, 1}, {5, 8, 10, 
4, 1}, {6, 17, 14, 15},
+                                              {8, 9, 11, 6, 12, 1, 7}, {10, 
13, 9, 7, 4, 6, 3},
+                                              {3, 5, 7, 9, 2, 11}, {13, 7, 6, 
8, 5}};
+  
   private FileSystem fs;
   private Path input;
   private Path output;
-
+  
   public static List<VectorWritable> getPointsWritable(double[][] raw) {
     List<VectorWritable> points = new ArrayList<VectorWritable>();
     for (double[] fr : raw) {
@@ -59,7 +59,7 @@ public class TestMinHashClustering exten
     }
     return points;
   }
-
+  
   @Override
   public void setUp() throws Exception {
     super.setUp();
@@ -69,31 +69,31 @@ public class TestMinHashClustering exten
     input = getTestTempDirPath("points");
     output = new Path(getTestTempDirPath(), "output");
     Path pointFile = new Path(input, "file1");
-    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, pointFile, 
Text.class, VectorWritable.class);
+    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, pointFile, 
Text.class,
+        VectorWritable.class);
     int id = 0;
     for (VectorWritable point : points) {
       writer.append(new Text("Id-" + id++), point);
     }
     writer.close();
   }
-
+  
   private String[] makeArguments(int minClusterSize,
                                  int minVectorSize,
                                  int numHashFunctions,
                                  int keyGroups,
-                                 String hashType){
-    return new String[] {
-        optKey(DefaultOptionCreator.INPUT_OPTION), input.toString(),
-        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
-        optKey(MinhashOptionCreator.MIN_CLUSTER_SIZE), 
String.valueOf(minClusterSize),
-        optKey(MinhashOptionCreator.MIN_VECTOR_SIZE), 
String.valueOf(minVectorSize),
-        optKey(MinhashOptionCreator.HASH_TYPE), hashType,
-        optKey(MinhashOptionCreator.NUM_HASH_FUNCTIONS), 
String.valueOf(numHashFunctions),
-        optKey(MinhashOptionCreator.KEY_GROUPS), String.valueOf(keyGroups),
-        optKey(MinhashOptionCreator.NUM_REDUCERS), "1",
-        optKey(MinhashOptionCreator.DEBUG_OUTPUT), "true"};
+                                 String hashType) {
+    return new String[] {optKey(DefaultOptionCreator.INPUT_OPTION), 
input.toString(),
+                         optKey(DefaultOptionCreator.OUTPUT_OPTION), 
output.toString(),
+                         optKey(MinhashOptionCreator.MIN_CLUSTER_SIZE), 
String.valueOf(minClusterSize),
+                         optKey(MinhashOptionCreator.MIN_VECTOR_SIZE), 
String.valueOf(minVectorSize),
+                         optKey(MinhashOptionCreator.HASH_TYPE), hashType,
+                         optKey(MinhashOptionCreator.NUM_HASH_FUNCTIONS), 
String.valueOf(numHashFunctions),
+                         optKey(MinhashOptionCreator.KEY_GROUPS), 
String.valueOf(keyGroups),
+                         optKey(MinhashOptionCreator.NUM_REDUCERS), "1",
+                         optKey(MinhashOptionCreator.DEBUG_OUTPUT), "true"};
   }
-
+  
   private static Set<Integer> getValues(Vector vector) {
     Iterator<Vector.Element> itr = vector.iterator();
     Set<Integer> values = new HashSet<Integer>();
@@ -102,8 +102,28 @@ public class TestMinHashClustering exten
     }
     return values;
   }
-
-  private void verify(Path output) throws Exception {
+  
+  private void runPairwiseSimilarity(List<Vector> clusteredItems, double 
simThreshold, String msg) {
+    if (clusteredItems.size() > 1) {
+      for (int i = 0; i < clusteredItems.size(); i++) {
+        Set<Integer> itemSet1 = getValues(clusteredItems.get(i));
+        for (int j = i + 1; j < clusteredItems.size(); j++) {
+          Set<Integer> itemSet2 = getValues(clusteredItems.get(j));
+          Set<Integer> union = new HashSet<Integer>();
+          union.addAll(itemSet1);
+          union.addAll(itemSet2);
+          Collection<Integer> intersect = new HashSet<Integer>();
+          intersect.addAll(itemSet1);
+          intersect.retainAll(itemSet2);
+          double similarity = intersect.size() / (double) union.size();
+          assertTrue(msg + " - Sets failed min similarity test, Set1: " + 
itemSet1 + " Set2: " + itemSet2
+                     + ", similarity:" + similarity, similarity >= 
simThreshold);
+        }
+      }
+    }
+  }
+  
+  private void verify(Path output, double simThreshold, String msg) throws 
Exception {
     Configuration conf = new Configuration();
     Path outputFile = new Path(output, "part-r-00000");
     SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, conf);
@@ -115,55 +135,37 @@ public class TestMinHashClustering exten
       if (prevClusterId.equals(clusterId.toString())) {
         clusteredItems.add(point.get().clone());
       } else {
-        if (clusteredItems.size() > 1) {
-          // run pair-wise similarity test on items in a cluster
-          for (int i = 0; i < clusteredItems.size(); i++) {
-            Set<Integer> itemSet1 = getValues(clusteredItems.get(i));
-            for (int j = i + 1; j < clusteredItems.size(); j++) {
-              Set<Integer> itemSet2 = getValues(clusteredItems.get(j));
-              Set<Integer> union = new HashSet<Integer>();
-              union.addAll(itemSet1);
-              union.addAll(itemSet2);
-              Collection<Integer> intersect = new HashSet<Integer>();
-              intersect.addAll(itemSet1);
-              intersect.retainAll(itemSet2);
-              double similarity = intersect.size() / (double) union.size();
-              assertTrue("Sets failed min similarity test, Set1: "
-                  + itemSet1 + " Set2: " + itemSet2 + ", similarity:" + 
similarity, similarity > 0.4);
-            }
-          }
-        }
+        runPairwiseSimilarity(clusteredItems, simThreshold, msg);
         clusteredItems.clear();
         prevClusterId = clusterId.toString();
+        clusteredItems.add(point.get().clone());
       }
     }
+    runPairwiseSimilarity(clusteredItems, simThreshold, msg);
   }
-
+  
   @Test
   public void testLinearMinHashMRJob() throws Exception {
-    String[] args = makeArguments(2, 3, 20, 4, HashType.LINEAR.toString());
+    String[] args = makeArguments(2, 3, 20, 3, HashType.LINEAR.toString());
     int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
     assertEquals("Minhash MR Job failed for " + HashType.LINEAR.toString(), 0, 
ret);
-    System.out.println("Verifying linear hash results");
-    verify(output);
+    verify(output, 0.2, "Hash Type: LINEAR");
   }
-
+  
   @Test
   public void testPolynomialMinHashMRJob() throws Exception {
-    String[] args = makeArguments(2, 3, 20, 4, HashType.POLYNOMIAL.toString());
+    String[] args = makeArguments(2, 3, 20, 3, HashType.POLYNOMIAL.toString());
     int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
     assertEquals("Minhash MR Job failed for " + 
HashType.POLYNOMIAL.toString(), 0, ret);
-    System.out.println("Verifying linear hash results");
-    verify(output);
+    verify(output, 0.3, "Hash Type: POLYNOMIAL");
   }
-
+  
   @Test
   public void testMurmurMinHashMRJob() throws Exception {
     String[] args = makeArguments(2, 3, 20, 4, HashType.MURMUR.toString());
     int ret = ToolRunner.run(new Configuration(), new MinHashDriver(), args);
     assertEquals("Minhash MR Job failed for " + HashType.MURMUR.toString(), 0, 
ret);
-    System.out.println("verifying murmur hash results");
-    verify(output);
+    verify(output, 0.3, "Hash Type: MURMUR");
   }
-
-}
+  
+}
\ No newline at end of file


Reply via email to