Author: jeastman
Date: Fri Sep 24 19:18:46 2010
New Revision: 1001034

URL: http://svn.apache.org/viewvc?rev=1001034&view=rev
Log:
MAHOUT-414: Added Configuration arguments to MeanShift and LDA that I 
overlooked in the last commit. 
Added code to find the last clusters directory to TestClusterDumper.
All tests run.

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java 
(original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java 
Fri Sep 24 19:18:46 2010
@@ -76,36 +76,6 @@ public final class LDADriver extends Abs
     new LDADriver().run(args);
   }
 
-  /**
-   * Run the job using supplied arguments
-   * 
-   * @param input
-   *          the directory pathname for input points
-   * @param output
-   *          the directory pathname for output points
-   * @param numTopics
-   *          the number of topics
-   * @param numWords
-   *          the number of words
-   * @param topicSmoothing
-   *          pseudocounts for each topic, typically small < .5
-   * @param maxIterations
-   *          the maximum number of iterations
-   * @param numReducers
-   *          the number of Reducers desired
-   * @throws IOException
-   */
-  public static void runJob(Path input,
-                            Path output,
-                            int numTopics,
-                            int numWords,
-                            double topicSmoothing,
-                            int maxIterations,
-                            int numReducers) throws IOException, 
InterruptedException, ClassNotFoundException {
-
-    new LDADriver().job(input, output, numTopics, numWords, topicSmoothing, 
maxIterations, numReducers);
-  }
-
   static LDAState createState(Configuration job) throws IOException {
     String statePath = job.get(STATE_IN_KEY);
     int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
@@ -165,7 +135,6 @@ public final class LDADriver extends Abs
               "The total number of words in the corpus (can be approximate, 
needs to exceed the actual value)");
     addOption(TOPIC_SMOOTHING_OPTION, "a", "Topic smoothing parameter. Default 
is 50/numTopics.", "-1.0");
     
addOption(DefaultOptionCreator.maxIterationsOption().withRequired(false).create());
-    addOption(DefaultOptionCreator.numReducersOption().create());
 
     if (parseArguments(args) == null) {
       return -1;
@@ -177,7 +146,6 @@ public final class LDADriver extends Abs
       HadoopUtil.overwriteOutput(output);
     }
     int maxIterations = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
-    int numReduceTasks = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_REDUCERS_OPTION));
     int numTopics = Integer.parseInt(getOption(NUM_TOPICS_OPTION));
     int numWords = Integer.parseInt(getOption(NUM_WORDS_OPTION));
     double topicSmoothing = 
Double.parseDouble(getOption(TOPIC_SMOOTHING_OPTION));
@@ -185,25 +153,30 @@ public final class LDADriver extends Abs
       topicSmoothing = 50.0 / numTopics;
     }
 
-    job(input, output, numTopics, numWords, topicSmoothing, maxIterations, 
numReduceTasks);
+    run(getConf(), input, output, numTopics, numWords, topicSmoothing, 
maxIterations);
 
     return 0;
   }
 
   /**
+   * @param conf the Configuration to use; it is passed on to each iteration
    * @param input
    * @param output
    * @param numTopics
    * @param numWords
    * @param topicSmoothing
    * @param maxIterations
-   * @param numReducers
    * @throws IOException
    * @throws InterruptedException
    * @throws ClassNotFoundException
    */
-  private void job(Path input, Path output, int numTopics, int numWords, 
double topicSmoothing, int maxIterations, int numReducers)
-      throws IOException, InterruptedException, ClassNotFoundException {
+  private void run(Configuration conf,
+                   Path input,
+                   Path output,
+                   int numTopics,
+                   int numWords,
+                   double topicSmoothing,
+                   int maxIterations) throws IOException, 
InterruptedException, ClassNotFoundException {
     Path stateIn = new Path(output, "state-0");
     writeInitialState(stateIn, numTopics, numWords);
     double oldLL = Double.NEGATIVE_INFINITY;
@@ -213,7 +186,7 @@ public final class LDADriver extends Abs
       log.info("Iteration {}", iteration);
       // point the output to a new directory per iteration
       Path stateOut = new Path(output, "state-" + iteration);
-      double ll = runIteration(input, stateIn, stateOut, numTopics, numWords, 
topicSmoothing, numReducers);
+      double ll = runIteration(conf, input, stateIn, stateOut, numTopics, 
numWords, topicSmoothing);
       double relChange = (oldLL - ll) / oldLL;
 
       // now point the input to the old output directory
@@ -280,7 +253,7 @@ public final class LDADriver extends Abs
 
   /**
    * Run the job using supplied arguments
-   * 
+   * @param conf the Configuration to use (the state and topic settings are set on it)
    * @param input
    *          the directory pathname for input points
    * @param stateIn
@@ -289,17 +262,14 @@ public final class LDADriver extends Abs
    *          the directory pathname for output state
    * @param numTopics
    *          the number of clusters
-   * @param numReducers
-   *          the number of Reducers desired
    */
-  private double runIteration(Path input,
+  private double runIteration(Configuration conf,
+                              Path input,
                               Path stateIn,
                               Path stateOut,
                               int numTopics,
                               int numWords,
-                              double topicSmoothing,
-                              int numReducers) throws IOException, 
InterruptedException, ClassNotFoundException {
-    Configuration conf = new Configuration();
+                              double topicSmoothing) throws IOException, 
InterruptedException, ClassNotFoundException {
     conf.set(STATE_IN_KEY, stateIn.toString());
     conf.set(NUM_TOPICS_KEY, Integer.toString(numTopics));
     conf.set(NUM_WORDS_KEY, Integer.toString(numWords));
@@ -315,7 +285,6 @@ public final class LDADriver extends Abs
     job.setMapperClass(LDAMapper.class);
     job.setReducerClass(LDAReducer.class);
     job.setCombinerClass(LDAReducer.class);
-    job.setNumReduceTasks(numReducers);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     job.setInputFormatClass(SequenceFileInputFormat.class);
     job.setJarByClass(LDADriver.class);

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
 Fri Sep 24 19:18:46 2010
@@ -35,6 +35,7 @@ import org.apache.hadoop.mapreduce.lib.i
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.AbstractCluster;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.WeightedVectorWritable;
@@ -60,7 +61,7 @@ public class MeanShiftCanopyDriver exten
   private static final String CONTROL_CONVERGED = "control/converged";
 
   public static void main(String[] args) throws Exception {
-    new MeanShiftCanopyDriver().run(args);
+    ToolRunner.run(new Configuration(), new MeanShiftCanopyDriver(), args);
   }
 
   @Override
@@ -95,27 +96,17 @@ public class MeanShiftCanopyDriver exten
     double convergenceDelta = 
Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
     int maxIterations = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     boolean inputIsCanopies = hasOption(INPUT_IS_CANOPIES_OPTION);
-    boolean runSequential =
-        
getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
+    boolean runSequential = 
getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     DistanceMeasure measure = 
ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
 
-    run(input,
-        output,
-        measure,
-        t1,
-        t2,
-        convergenceDelta,
-        maxIterations,
-        inputIsCanopies,
-        runClustering,
-        runSequential);
+    run(getConf(), input, output, measure, t1, t2, convergenceDelta, 
maxIterations, inputIsCanopies, runClustering, runSequential);
     return 0;
   }
 
   /**
    * Run an iteration
-   * 
+   * @param conf the Configuration to use (the mean shift settings are set on it)
    * @param input
    *          the input pathname String
    * @param output
@@ -131,16 +122,15 @@ public class MeanShiftCanopyDriver exten
    * @param convergenceDelta
    *          the double convergence criteria
    */
-  private static void runIteration(Path input,
+  private static void runIteration(Configuration conf,
+                                   Path input,
                                    Path output,
                                    Path control,
                                    String measureClassName,
                                    double t1,
                                    double t2,
-                                   double convergenceDelta)
-    throws IOException, InterruptedException, ClassNotFoundException {
+                                   double convergenceDelta) throws 
IOException, InterruptedException, ClassNotFoundException {
 
-    Configuration conf = new Configuration();
     conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
     conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY, 
String.valueOf(convergenceDelta));
     conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
@@ -167,7 +157,7 @@ public class MeanShiftCanopyDriver exten
   /**
    * Run the job where the input format can be either Vectors or Canopies.
    * If requested, cluster the input data using the computed Canopies
-   * 
+   * @param conf the Configuration to use
    * @param input
    *          the input pathname String
    * @param output
@@ -188,7 +178,8 @@ public class MeanShiftCanopyDriver exten
    *          true if the input points are to be clustered once the iterations 
complete
    * @param runSequential if true run in sequential execution mode
    */
-  public void run(Path input,
+  public void run(Configuration conf,
+                  Path input,
                   Path output,
                   DistanceMeasure measure,
                   double t1,
@@ -197,31 +188,35 @@ public class MeanShiftCanopyDriver exten
                   int maxIterations,
                   boolean inputIsCanopies,
                   boolean runClustering,
-                  boolean runSequential)
-    throws IOException, InterruptedException, ClassNotFoundException, 
InstantiationException, IllegalAccessException {
+                  boolean runSequential) throws IOException, 
InterruptedException, ClassNotFoundException, InstantiationException,
+      IllegalAccessException {
     Path clustersIn = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
     if (inputIsCanopies) {
       clustersIn = input;
     } else {
-      createCanopyFromVectors(input, clustersIn, measure, runSequential);
+      createCanopyFromVectors(conf, input, clustersIn, measure, runSequential);
     }
 
-    Path clustersOut =
-        buildClusters(clustersIn, output, measure, t1, t2, convergenceDelta, 
maxIterations, runSequential);
+    Path clustersOut = buildClusters(conf, clustersIn, output, measure, t1, 
t2, convergenceDelta, maxIterations, runSequential);
     if (runClustering) {
-      clusterData(inputIsCanopies ? input : new Path(output, 
Cluster.INITIAL_CLUSTERS_DIR),
+      clusterData(conf,
+                  inputIsCanopies ? input : new Path(output, 
Cluster.INITIAL_CLUSTERS_DIR),
                   clustersOut,
                   new Path(output, Cluster.CLUSTERED_POINTS_DIR),
                   runSequential);
     }
   }
 
-  public static void createCanopyFromVectors(Path input, Path output, 
DistanceMeasure measure, boolean runSequential)
-    throws IOException, InterruptedException, ClassNotFoundException, 
InstantiationException, IllegalAccessException {
+  public static void createCanopyFromVectors(Configuration conf,
+                                             Path input,
+                                             Path output,
+                                             DistanceMeasure measure,
+                                             boolean runSequential) throws 
IOException, InterruptedException,
+      ClassNotFoundException, InstantiationException, IllegalAccessException {
     if (runSequential) {
       createCanopyFromVectorsSeq(input, output, measure);
     } else {
-      createCanopyFromVectorsMR(input, output, measure);
+      createCanopyFromVectorsMR(conf, input, output, measure);
     }
   }
 
@@ -230,8 +225,8 @@ public class MeanShiftCanopyDriver exten
    * @param output the Path to the initial clusters directory
    * @param measure the DistanceMeasure
    */
-  private static void createCanopyFromVectorsSeq(Path input, Path output, 
DistanceMeasure measure)
-    throws IOException, InstantiationException, IllegalAccessException {
+  private static void createCanopyFromVectorsSeq(Path input, Path output, 
DistanceMeasure measure) throws IOException,
+      InstantiationException, IllegalAccessException {
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(input.toUri(), conf);
     FileStatus[] status = fs.listStatus(input, new OutputLogFilter());
@@ -258,9 +253,8 @@ public class MeanShiftCanopyDriver exten
     }
   }
 
-  private static void createCanopyFromVectorsMR(Path input, Path output, 
DistanceMeasure measure)
-    throws IOException, InterruptedException, ClassNotFoundException {
-    Configuration conf = new Configuration();
+  private static void createCanopyFromVectorsMR(Configuration conf, Path 
input, Path output, DistanceMeasure measure)
+      throws IOException, InterruptedException, ClassNotFoundException {
     conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, 
measure.getClass().getName());
     Job job = new Job(conf);
     job.setJarByClass(MeanShiftCanopyDriver.class);
@@ -279,7 +273,7 @@ public class MeanShiftCanopyDriver exten
 
   /**
    * Iterate over the input clusters to produce the next cluster directories 
for each iteration
-   * 
+   * @param conf the Configuration to use (passed only to the MapReduce implementation)
    * @param clustersIn
    *          the input directory Path
    * @param output
@@ -296,19 +290,20 @@ public class MeanShiftCanopyDriver exten
    *          an int number of iterations
    * @param runSequential if true run in sequential execution mode
    */
-  public Path buildClusters(Path clustersIn,
-                                    Path output,
-                                   DistanceMeasure measure,
-                                   double t1,
-                                   double t2,
-                                   double convergenceDelta,
-                                   int maxIterations,
-                                   boolean runSequential)
-    throws IOException, InterruptedException, ClassNotFoundException, 
InstantiationException, IllegalAccessException {
+  public Path buildClusters(Configuration conf,
+                            Path clustersIn,
+                            Path output,
+                            DistanceMeasure measure,
+                            double t1,
+                            double t2,
+                            double convergenceDelta,
+                            int maxIterations,
+                            boolean runSequential) throws IOException, 
InterruptedException, ClassNotFoundException,
+      InstantiationException, IllegalAccessException {
     if (runSequential) {
       return buildClustersSeq(clustersIn, output, measure, t1, t2, 
convergenceDelta, maxIterations);
     } else {
-      return buildClustersMR(clustersIn, output, measure, t1, t2, 
convergenceDelta, maxIterations);
+      return buildClustersMR(conf, clustersIn, output, measure, t1, t2, 
convergenceDelta, maxIterations);
     }
   }
 
@@ -318,8 +313,7 @@ public class MeanShiftCanopyDriver exten
                                        double t1,
                                        double t2,
                                        double convergenceDelta,
-                                       int maxIterations)
-    throws IOException, InstantiationException, IllegalAccessException {
+                                       int maxIterations) throws IOException, 
InstantiationException, IllegalAccessException {
     MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(measure, 
t1, t2, convergenceDelta);
     List<MeanShiftCanopy> clusters = new ArrayList<MeanShiftCanopy>();
     Configuration conf = new Configuration();
@@ -338,7 +332,7 @@ public class MeanShiftCanopyDriver exten
         reader.close();
       }
     }
-    boolean[] converged = {false};
+    boolean[] converged = { false };
     int iteration = 1;
     while (!converged[0] && iteration <= maxIterations) {
       log.info("Iteration: {}", iteration);
@@ -351,12 +345,9 @@ public class MeanShiftCanopyDriver exten
                                                            
MeanShiftCanopy.class);
       try {
         for (MeanShiftCanopy cluster : clusters) {
-          log.info("Writing Cluster:{} center:{} numPoints:{} radius:{} to: 
{}",
-                   new Object[] { cluster.getId(),
-                                  
AbstractCluster.formatVector(cluster.getCenter(), null),
-                                  cluster.getNumPoints(),
-                                  
AbstractCluster.formatVector(cluster.getRadius(), null),
-                                  clustersOut.getName() });
+          log.info("Writing Cluster:{} center:{} numPoints:{} radius:{} to: 
{}", new Object[] { cluster.getId(),
+              AbstractCluster.formatVector(cluster.getCenter(), null), 
cluster.getNumPoints(),
+              AbstractCluster.formatVector(cluster.getRadius(), null), 
clustersOut.getName() });
           writer.append(new Text(cluster.getIdentifier()), cluster);
         }
       } finally {
@@ -368,14 +359,14 @@ public class MeanShiftCanopyDriver exten
     return clustersIn;
   }
 
-  private static Path buildClustersMR(Path clustersIn,
+  private static Path buildClustersMR(Configuration conf,
+                                      Path clustersIn,
                                       Path output,
                                       DistanceMeasure measure,
                                       double t1,
                                       double t2,
                                       double convergenceDelta,
-                                      int maxIterations)
-    throws IOException, InterruptedException, ClassNotFoundException {
+                                      int maxIterations) throws IOException, 
InterruptedException, ClassNotFoundException {
     // iterate until the clusters converge
     boolean converged = false;
     int iteration = 1;
@@ -384,7 +375,7 @@ public class MeanShiftCanopyDriver exten
       // point the output to a new directory per iteration
       Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
       Path controlOut = new Path(output, CONTROL_CONVERGED);
-      runIteration(clustersIn, clustersOut, controlOut, 
measure.getClass().getName(), t1, t2, convergenceDelta);
+      runIteration(conf, clustersIn, clustersOut, controlOut, 
measure.getClass().getName(), t1, t2, convergenceDelta);
       converged = FileSystem.get(new Configuration()).exists(controlOut);
       // now point the input to the old output directory
       clustersIn = clustersOut;
@@ -395,7 +386,7 @@ public class MeanShiftCanopyDriver exten
 
   /**
    * Run the job using supplied arguments
-   * 
+   * @param conf the Configuration (NOTE: clusterDataSeq and clusterDataMR create their own)
    * @param input
    *          the directory pathname for input points
    * @param clustersIn
@@ -404,11 +395,8 @@ public class MeanShiftCanopyDriver exten
    *          the directory pathname for output clustered points
    * @param runSequential if true run in sequential execution mode
    */
-  public static void clusterData(Path input,
-                                 Path clustersIn,
-                                 Path output,
-                                 boolean runSequential)
-    throws IOException, InterruptedException, ClassNotFoundException, 
InstantiationException, IllegalAccessException {
+  public static void clusterData(Configuration conf, Path input, Path 
clustersIn, Path output, boolean runSequential)
+      throws IOException, InterruptedException, ClassNotFoundException, 
InstantiationException, IllegalAccessException {
     if (runSequential) {
       clusterDataSeq(input, clustersIn, output);
     } else {
@@ -416,8 +404,8 @@ public class MeanShiftCanopyDriver exten
     }
   }
 
-  private static void clusterDataSeq(Path input, Path clustersIn, Path output)
-    throws IOException, InstantiationException, IllegalAccessException {
+  private static void clusterDataSeq(Path input, Path clustersIn, Path output) 
throws IOException, InstantiationException,
+      IllegalAccessException {
     Collection<MeanShiftCanopy> clusters = new ArrayList<MeanShiftCanopy>();
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(clustersIn.toUri(), conf);
@@ -452,8 +440,7 @@ public class MeanShiftCanopyDriver exten
         MeanShiftCanopy canopy = 
reader.getValueClass().asSubclass(MeanShiftCanopy.class).newInstance();
         while (reader.next(key, canopy)) {
           MeanShiftCanopy closest = 
MeanShiftCanopyClusterer.findCoveringCanopy(canopy, clusters);
-          writer.append(new IntWritable(closest.getId()),
-                        new WeightedVectorWritable(1, canopy.getCenter()));
+          writer.append(new IntWritable(closest.getId()), new 
WeightedVectorWritable(1, canopy.getCenter()));
           canopy = 
reader.getValueClass().asSubclass(MeanShiftCanopy.class).newInstance();
         }
       } finally {
@@ -463,8 +450,8 @@ public class MeanShiftCanopyDriver exten
     }
   }
 
-  private static void clusterDataMR(Path input, Path clustersIn, Path output)
-    throws IOException, InterruptedException, ClassNotFoundException {
+  private static void clusterDataMR(Path input, Path clustersIn, Path output) 
throws IOException, InterruptedException,
+      ClassNotFoundException {
     Configuration conf = new Configuration();
     conf.set(STATE_IN_KEY, clustersIn.toString());
     Job job = new Job(conf);

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
 Fri Sep 24 19:18:46 2010
@@ -24,6 +24,7 @@ import java.awt.geom.AffineTransform;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
@@ -108,7 +109,7 @@ final class DisplayMeanShift extends Dis
     writeSampleData(samples);
     boolean b = true;
     if (b) {
-      new MeanShiftCanopyDriver().run(samples, output, measure, t1, t2, 0.005, 
20, false, true, true);
+      new MeanShiftCanopyDriver().run(new Configuration(), samples, output, 
measure, t1, t2, 0.005, 20, false, true, true);
       loadClusters(output);
     } else {
       List<Vector> points = new ArrayList<Vector>();

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
 Fri Sep 24 19:18:46 2010
@@ -127,7 +127,8 @@ public final class Job extends AbstractJ
       IllegalAccessException {
     Path directoryContainingConvertedInput = new Path(output, 
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     InputDriver.runJob(input, directoryContainingConvertedInput);
-    new MeanShiftCanopyDriver().run(directoryContainingConvertedInput,
+    new MeanShiftCanopyDriver().run(conf,
+                                    directoryContainingConvertedInput,
                                     output,
                                     measure,
                                     t1,
@@ -135,8 +136,7 @@ public final class Job extends AbstractJ
                                     convergenceDelta,
                                     maxIterations,
                                     true,
-                                    true,
-                                    false);
+                                    true, false);
     // run ClusterDumper
     ClusterDumper clusterDumper = new ClusterDumper(new Path(output, 
"clusters-" + maxIterations), new Path(output,
                                                                                
                             "clusteredPoints"));

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
 Fri Sep 24 19:18:46 2010
@@ -150,6 +150,26 @@ public final class TestClusterDumper ext
     }
   }
 
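+  /**
+   * Return the path to the final iteration's clusters
+   * 
+   * @param conf the Configuration used to obtain the FileSystem
+   * @param output the Path that contains the "clusters-N" directories
+   * @param maxIterations the highest iteration number to look for; the search counts down to 0
+   * @return the existing "clusters-N" Path with the largest N not above maxIterations, or null if none exists
+   * @throws IOException if thrown while accessing the FileSystem
+   */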
+  private Path finalClusterPath(Configuration conf, Path output, int 
maxIterations) throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    for (int i = maxIterations; i >= 0; i--) {
+      Path clusters = new Path(output, "clusters-" + i);
+      if (fs.exists(clusters)) {
+        return clusters;
+      }
+    }
+    return null;
+  }
+
   @Test
   public void testCanopy() throws Exception { // now run the Job
     DistanceMeasure measure = new EuclideanDistanceMeasure();
@@ -166,18 +186,12 @@ public final class TestClusterDumper ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
     Path output = getTestTempDirPath("output");
-    CanopyDriver.run(new Configuration(), getTestTempDirPath("testdata"), 
output, measure, 8, 4, false, false);
+    Configuration conf = new Configuration();
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8, 
4, false, false);
     // now run the KMeans job
-    KMeansDriver.run(getTestTempDirPath("testdata"),
-    new Path(output, "clusters-0"),
-    output,
-    measure,
-    0.001,
-    10,
-    true,
-    false);
+    KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output, 
"clusters-0"), output, measure, 0.001, 10, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, 
"clusters-2"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, 
output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -186,21 +200,23 @@ public final class TestClusterDumper ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     // now run the Canopy job to prime kMeans canopies
     Path output = getTestTempDirPath("output");
-    CanopyDriver.run(new Configuration(), getTestTempDirPath("testdata"), 
output, measure, 8, 4, false, false);
+    Configuration conf = new Configuration();
+    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8, 
4, false, false);
     // now run the Fuzzy KMeans job
-    FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
-    new Path(output, "clusters-0"),
-    output,
-    measure,
-    0.001,
-    10,
-    ((float) 1.1),
-    true,
-    true,
-    0,
-    false);
+    FuzzyKMeansDriver.run(conf,
+                          getTestTempDirPath("testdata"),
+                          new Path(output, "clusters-0"),
+                          output,
+                          measure,
+                          0.001,
+                          10,
+                          ((float) 1.1),
+                          true,
+                          true,
+                          0,
+                          false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, 
"clusters-3"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, 
output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -208,18 +224,10 @@ public final class TestClusterDumper ext
   public void testMeanShift() throws Exception {
     DistanceMeasure measure = new CosineDistanceMeasure();
     Path output = getTestTempDirPath("output");
-    new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
-    output,
-    measure,
-    0.5,
-    0.01,
-    0.05,
-    10,
-    false,
-    true,
-    false);
+    Configuration conf = new Configuration();
+    new MeanShiftCanopyDriver().run(conf, getTestTempDirPath("testdata"), 
output, measure, 0.5, 0.01, 0.05, 10, false, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, 
"clusters-1"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, 
output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -228,9 +236,10 @@ public final class TestClusterDumper ext
     Path output = getTestTempDirPath("output");
     NamedVector prototype = (NamedVector) sampleData.get(0).get();
     AbstractVectorModelDistribution modelDistribution = new SampledNormalDistribution(new VectorWritable(prototype));
-    DirichletDriver.run(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, false);
+    Configuration conf = new Configuration();
+    DirichletDriver.run(conf, getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -239,9 +248,10 @@ public final class TestClusterDumper ext
     Path output = getTestTempDirPath("output");
     NamedVector prototype = (NamedVector) sampleData.get(0).get();
     AbstractVectorModelDistribution modelDistribution = new GaussianClusterDistribution(new VectorWritable(prototype));
-    DirichletDriver.run(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, true);
+    Configuration conf = new Configuration();
+    DirichletDriver.run(conf, getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, true);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -250,9 +260,10 @@ public final class TestClusterDumper ext
     Path output = getTestTempDirPath("output");
     NamedVector prototype = (NamedVector) sampleData.get(0).get();
     AbstractVectorModelDistribution modelDistribution = new DistanceMeasureClusterDistribution(new VectorWritable(prototype));
-    DirichletDriver.run(getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, true);
+    Configuration conf = new Configuration();
+    DirichletDriver.run(conf, getTestTempDirPath("testdata"), output, modelDistribution, 15, 10, 1.0, true, true, 0, true);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-10"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -315,16 +326,9 @@ public final class TestClusterDumper ext
     // now run the Canopy job to prime kMeans canopies
     CanopyDriver.run(conf, svdData, output, measure, 8, 4, false, false);
     // now run the KMeans job
-    KMeansDriver.run(svdData,
-    new Path(output, "clusters-0"),
-    output,
-    measure,
-    0.001,
-    10,
-    true,
-    false);
+    KMeansDriver.run(svdData, new Path(output, "clusters-0"), output, measure, 0.001, 10, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -355,16 +359,9 @@ public final class TestClusterDumper ext
     // now run the Canopy job to prime kMeans canopies
     CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false, false);
     // now run the KMeans job
-    KMeansDriver.run(sData.getRowPath(),
-    new Path(output, "clusters-0"),
-    output,
-    measure,
-    0.001,
-    10,
-    true,
-    false);
+    KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"), output, measure, 0.001, 10, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 
@@ -397,16 +394,9 @@ public final class TestClusterDumper ext
     // now run the Canopy job to prime kMeans canopies
     CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false, false);
     // now run the KMeans job
-    KMeansDriver.run(sData.getRowPath(),
-    new Path(output, "clusters-0"),
-    output,
-    measure,
-    0.001,
-    10,
-    true,
-    false);
+    KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"), output, measure, 0.001, 10, true, false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-2"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, 10), new Path(output, "clusteredPoints"));
     clusterDumper.printClusters(termDictionary);
   }
 }

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
 Fri Sep 24 19:18:46 2010
@@ -280,7 +280,9 @@ public final class TestClusterEvaluator 
   @Test
   public void testMeanShift() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
-    new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
+    Configuration conf = new Configuration();
+    new MeanShiftCanopyDriver().run(conf,
+                                    getTestTempDirPath("testdata"),
                                     getTestTempDirPath("output"),
                                     measure,
                                     2.1,
@@ -288,11 +290,9 @@ public final class TestClusterEvaluator 
                                     0.001,
                                     10,
                                     false,
-                                    true,
-                                    false);
+                                    true, false);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    Configuration conf = new Configuration();
     Path clustersIn = new Path(output, "clusters-2");
     RepresentativePointsDriver.run(conf,
                                    clustersIn,

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
 Fri Sep 24 19:18:46 2010
@@ -297,7 +297,9 @@ public final class TestCDbwEvaluator ext
   @Test
   public void testMeanShift() throws Exception {
     DistanceMeasure measure = new EuclideanDistanceMeasure();
-    new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
+    Configuration conf = new Configuration();
+    new MeanShiftCanopyDriver().run(conf,
+                                    getTestTempDirPath("testdata"),
                                     getTestTempDirPath("output"),
                                     measure,
                                     2.1,
@@ -305,11 +307,9 @@ public final class TestCDbwEvaluator ext
                                     0.001,
                                     10,
                                     false,
-                                    true,
-                                    false);
+                                    true, false);
     int numIterations = 2;
     Path output = getTestTempDirPath("output");
-    Configuration conf = new Configuration();
     Path clustersIn = new Path(output, "clusters-2");
     RepresentativePointsDriver.run(conf, clustersIn, new Path(output, "clusteredPoints"), output, measure, numIterations);
     CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);


Reply via email to