Author: jeastman
Date: Sat Sep 25 16:33:05 2010
New Revision: 1001258

URL: http://svn.apache.org/viewvc?rev=1001258&view=rev
Log:
MAHOUT-236:
- Modified TestClusterEvaluator to use the same dataset as the clustering
Display examples.
- Switched evaluator to run sequential versions of the clustering jobs to 
reduce execution time.
- Fixed a clusteredPoints path bug in sequential Mean Shift clustering.
All tests run.

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1001258&r1=1001257&r2=1001258&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
 Sat Sep 25 16:33:05 2010
@@ -426,13 +426,12 @@ public class MeanShiftCanopyDriver exten
     // iterate over all points, assigning each to the closest canopy and 
outputting that clustering
     fs = FileSystem.get(input.toUri(), conf);
     status = fs.listStatus(input, new OutputLogFilter());
-    Path outPath = new Path(output, 
CanopyDriver.DEFAULT_CLUSTERED_POINTS_DIRECTORY);
     int part = 0;
     for (FileStatus s : status) {
       SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(), 
conf);
       SequenceFile.Writer writer = new SequenceFile.Writer(fs,
                                                            conf,
-                                                           new Path(outPath, 
"part-m-" + part++),
+                                                           new Path(output, 
"part-m-" + part++),
                                                            IntWritable.class,
                                                            
WeightedVectorWritable.class);
       try {

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java?rev=1001258&r1=1001257&r2=1001258&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
 Sat Sep 25 16:33:05 2010
@@ -116,7 +116,7 @@ public final class RepresentativePointsD
         SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, 
IntWritable.class, VectorWritable.class);
         while (reader.next(key, value)) {
           Cluster cluster = (Cluster) value;
-          log.debug("C-" + cluster.getId() + ": " + 
AbstractCluster.formatVector(cluster.getCenter(), null));
+          log.info("C-" + cluster.getId() + ": " + 
AbstractCluster.formatVector(cluster.getCenter(), null));
           writer.append(new IntWritable(cluster.getId()), new 
VectorWritable(cluster.getCenter()));
         }
         writer.close();

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1001258&r1=1001257&r2=1001258&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
 Sat Sep 25 16:33:05 2010
@@ -33,6 +33,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.mahout.clustering.canopy.Canopy;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.dirichlet.DirichletDriver;
+import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
 import 
org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
 import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
 import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
@@ -47,25 +48,70 @@ import org.apache.mahout.math.DenseVecto
 import org.apache.mahout.math.VectorWritable;
 import org.junit.Before;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public final class TestClusterEvaluator extends MahoutTestCase {
 
   private static final double[][] REFERENCE = { { 1, 1 }, { 2, 1 }, { 1, 2 }, 
{ 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
       { 5, 5 } };
 
+  private List<VectorWritable> referenceData = new ArrayList<VectorWritable>();
+
+  private List<VectorWritable> sampleData = new ArrayList<VectorWritable>();
+
   private Map<Integer, List<VectorWritable>> representativePoints;
 
   private List<Cluster> clusters;
 
+  private static final Logger log = 
LoggerFactory.getLogger(TestClusterEvaluator.class);
+
+  private Configuration conf;
+
+  private FileSystem fs;
+
+  private Path testdata;
+
+  private Path output;
+
   @Override
   @Before
   public void setUp() throws Exception {
     super.setUp();
-    Configuration conf = new Configuration();
-    FileSystem fs = FileSystem.get(conf);
-    // Create test data
-    List<VectorWritable> sampleData = 
TestKmeansClustering.getPointsWritable(REFERENCE);
-    ClusteringTestUtils.writePointsToFile(sampleData, 
getTestTempFilePath("testdata/file1"), fs, conf);
+    conf = new Configuration();
+    fs = FileSystem.get(conf);
+    testdata = getTestTempDirPath("testdata");
+    output = getTestTempDirPath("output");
+    // Create small reference data set
+    referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
+    // generate larger test data set for the clustering tests to chew on
+    generateSamples();
+  }
+
+  /**
+   * Generate random samples and add them to the sampleData
+   * 
+   * @param num
+   *          int number of samples to generate
+   * @param mx
+   *          double x-value of the sample mean
+   * @param my
+   *          double y-value of the sample mean
+   * @param sd
+   *          double standard deviation of the samples
+   */
+  private void generateSamples(int num, double mx, double my, double sd) {
+    log.info("Generating {} samples m=[{}, {}] sd={}", new Object[] { num, mx, 
my, sd });
+    for (int i = 0; i < num; i++) {
+      sampleData.add(new VectorWritable(new DenseVector(new double[] { 
UncommonDistributions.rNorm(mx, sd),
+          UncommonDistributions.rNorm(my, sd) })));
+    }
+  }
+
+  private void generateSamples() {
+    generateSamples(500, 1, 1, 3);
+    generateSamples(300, 1, 0, 0.5);
+    generateSamples(300, 0, 2, 0.1);
   }
 
   private void checkRefPoints(int numIterations) throws IOException {
@@ -115,7 +161,8 @@ public final class TestClusterEvaluator 
   }
 
   @Test
-  public void testCluster0() {
+  public void testCluster0() throws IOException {
+    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.25, measure);
     ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, 
clusters, measure);
@@ -124,7 +171,8 @@ public final class TestClusterEvaluator 
   }
 
   @Test
-  public void testCluster1() {
+  public void testCluster1() throws IOException {
+    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.5, measure);
     ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, 
clusters, measure);
@@ -133,7 +181,8 @@ public final class TestClusterEvaluator 
   }
 
   @Test
-  public void testCluster2() {
+  public void testCluster2() throws IOException {
+    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.75, measure);
     ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints, 
clusters, measure);
@@ -142,7 +191,8 @@ public final class TestClusterEvaluator 
   }
 
   @Test
-  public void testEmptyCluster() {
+  public void testEmptyCluster() throws IOException {
+    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.25, measure);
     Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19, 
measure);
@@ -155,7 +205,8 @@ public final class TestClusterEvaluator 
   }
 
   @Test
-  public void testSingleValueCluster() {
+  public void testSingleValueCluster() throws IOException {
+    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.25, measure);
     Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, 
measure);
@@ -171,9 +222,11 @@ public final class TestClusterEvaluator 
   /**
    * Representative points extraction will duplicate the cluster center if the 
cluster has no 
    * assigned points. These clusters should be ignored like empty clusters 
above
+   * @throws IOException 
    */
   @Test
-  public void testAllSameValueCluster() {
+  public void testAllSameValueCluster() throws IOException {
+    ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.25, measure);
     Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19, 
measure);
@@ -189,12 +242,12 @@ public final class TestClusterEvaluator 
   }
 
   @Test
-  public void testCanopy() throws Exception { // now run the Job
+  public void testCanopy() throws Exception {
+    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     Configuration conf = new Configuration();
-    CanopyDriver.run(conf, getTestTempDirPath("testdata"), 
getTestTempDirPath("output"), measure, 3.1, 2.1, true, false);
-    int numIterations = 2;
-    Path output = getTestTempDirPath("output");
+    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, true);
+    int numIterations = 10;
     Path clustersIn = new Path(output, "clusters-0");
     RepresentativePointsDriver.run(conf, clustersIn, new Path(output, 
"clusteredPoints"), output, measure, numIterations);
     ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
@@ -207,28 +260,16 @@ public final class TestClusterEvaluator 
 
   @Test
   public void testKmeans() throws Exception {
+    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
     Configuration conf = new Configuration();
-    CanopyDriver.run(conf,
-                     getTestTempDirPath("testdata"),
-                     getTestTempDirPath("output"),
-                     measure,
-                     3.1,
-                     2.1,
-                     false,
-                     false);
+    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, true);
     // now run the KMeans job
-    Path output = getTestTempDirPath("output");
-    KMeansDriver.run(getTestTempDirPath("testdata"), new Path(output, 
"clusters-0"), output, measure, 0.001, 10, true, false);
-    int numIterations = 2;
+    KMeansDriver.run(testdata, new Path(output, "clusters-0"), output, 
measure, 0.001, 10, true, true);
+    int numIterations = 10;
     Path clustersIn = new Path(output, "clusters-2");
-    RepresentativePointsDriver.run(conf,
-                                   clustersIn,
-                                   new Path(output, "clusteredPoints"),
-                                   output,
-                                   measure,
-                                   numIterations);
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, 
"clusteredPoints"), output, measure, numIterations);
     ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
     // now print out the Results
     System.out.println("Intra-cluster density = " + 
evaluator.intraClusterDensity());
@@ -238,38 +279,16 @@ public final class TestClusterEvaluator 
 
   @Test
   public void testFuzzyKmeans() throws Exception {
+    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
     Configuration conf = new Configuration();
-    CanopyDriver.run(conf,
-                     getTestTempDirPath("testdata"),
-                     getTestTempDirPath("output"),
-                     measure,
-                     3.1,
-                     2.1,
-                     false,
-                     false);
+    CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, true);
     // now run the KMeans job
-    Path output = getTestTempDirPath("output");
-    FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
-                          new Path(output, "clusters-0"),
-                          output,
-                          measure,
-                          0.001,
-                          10,
-                          2,
-                          true,
-                          true,
-                          0,
-                          false);
-    int numIterations = 2;
+    FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0"), output, 
measure, 0.001, 10, 2, true, true, 0, true);
+    int numIterations = 10;
     Path clustersIn = new Path(output, "clusters-4");
-    RepresentativePointsDriver.run(conf,
-                                   clustersIn,
-                                   new Path(output, "clusteredPoints"),
-                                   output,
-                                   measure,
-                                   numIterations);
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, 
"clusteredPoints"), output, measure, numIterations);
     ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
     // now print out the Results
     System.out.println("Intra-cluster density = " + 
evaluator.intraClusterDensity());
@@ -279,27 +298,13 @@ public final class TestClusterEvaluator 
 
   @Test
   public void testMeanShift() throws Exception {
+    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, 
"file1"), fs, conf);
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     Configuration conf = new Configuration();
-    new MeanShiftCanopyDriver().run(conf,
-                                    getTestTempDirPath("testdata"),
-                                    getTestTempDirPath("output"),
-                                    measure,
-                                    2.1,
-                                    1.0,
-                                    0.001,
-                                    10,
-                                    false,
-                                    true, false);
-    int numIterations = 2;
-    Path output = getTestTempDirPath("output");
-    Path clustersIn = new Path(output, "clusters-2");
-    RepresentativePointsDriver.run(conf,
-                                   clustersIn,
-                                   new Path(output, "clusteredPoints"),
-                                   output,
-                                   measure,
-                                   numIterations);
+    new MeanShiftCanopyDriver().run(conf, testdata, output, measure, 2.1, 1.0, 
0.001, 10, false, true, true);
+    int numIterations = 10;
+    Path clustersIn = new Path(output, "clusters-10");
+    RepresentativePointsDriver.run(conf, clustersIn, new Path(output, 
"clusteredPoints"), output, measure, numIterations);
     ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
     // now print out the Results
     System.out.println("Intra-cluster density = " + 
evaluator.intraClusterDensity());
@@ -309,19 +314,10 @@ public final class TestClusterEvaluator 
 
   @Test
   public void testDirichlet() throws Exception {
+    ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata, 
"file1"), fs, conf);
     ModelDistribution<VectorWritable> modelDistribution = new 
GaussianClusterDistribution(new VectorWritable(new DenseVector(2)));
-    DirichletDriver.run(getTestTempDirPath("testdata"),
-                        getTestTempDirPath("output"),
-                        modelDistribution,
-                        15,
-                        5,
-                        1.0,
-                        true,
-                        true,
-                        0,
-                        true);
-    int numIterations = 2;
-    Path output = getTestTempDirPath("output");
+    DirichletDriver.run(testdata, output, modelDistribution, 15, 5, 1.0, true, 
true, 0, true);
+    int numIterations = 10;
     Configuration conf = new Configuration();
     Path clustersIn = new Path(output, "clusters-5");
     RepresentativePointsDriver.run(conf,


Reply via email to