Author: jeastman
Date: Fri Sep 24 18:33:55 2010
New Revision: 1001015

URL: http://svn.apache.org/viewvc?rev=1001015&view=rev
Log:
MAHOUT-504. Fixed CLI arguments and did other refactoring of synthetic control
example. Tested CLI invocation with explicit arguments, which was the source of
the problems cited in this issue. All tests run.

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
 Fri Sep 24 18:33:55 2010
@@ -71,11 +71,11 @@ public class DirichletDriver extends Abs
 
   public static final String THRESHOLD_KEY = 
"org.apache.mahout.clustering.dirichlet.threshold";
 
-  protected static final String MODEL_PROTOTYPE_CLASS_OPTION = 
"modelPrototype";
+  public static final String MODEL_PROTOTYPE_CLASS_OPTION = "modelPrototype";
 
-  protected static final String MODEL_DISTRIBUTION_CLASS_OPTION = "modelDist";
+  public static final String MODEL_DISTRIBUTION_CLASS_OPTION = "modelDist";
 
-  protected static final String ALPHA_OPTION = "alpha";
+  public static final String ALPHA_OPTION = "alpha";
 
   private static final Logger log = 
LoggerFactory.getLogger(DirichletDriver.class);
 

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
 Fri Sep 24 18:33:55 2010
@@ -55,9 +55,7 @@ import org.slf4j.LoggerFactory;
 
 public class FuzzyKMeansDriver extends AbstractJob {
 
-  protected static final String M_OPTION = "m";
-
-  public static final String M_OPTION_KEY = "--" + M_OPTION;
+  public static final String M_OPTION = "m";
 
   private static final Logger log = 
LoggerFactory.getLogger(FuzzyKMeansDriver.class);
 

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
 Fri Sep 24 18:33:55 2010
@@ -51,7 +51,7 @@ import org.slf4j.LoggerFactory;
 
 public class MeanShiftCanopyDriver extends AbstractJob {
 
-  protected static final String INPUT_IS_CANOPIES_OPTION = "inputIsCanopies";
+  public static final String INPUT_IS_CANOPIES_OPTION = "inputIsCanopies";
 
   private static final Logger log = 
LoggerFactory.getLogger(MeanShiftCanopyDriver.class);
 
@@ -63,52 +63,6 @@ public class MeanShiftCanopyDriver exten
     new MeanShiftCanopyDriver().run(args);
   }
 
-  /**
-   * Run the job on a new driver instance (convenience)
-   * 
-   * @param input
-   *          the input pathname String
-   * @param output
-   *          the output pathname String
-   * @param measure
-   *          the DistanceMeasure
-   * @param t1
-   *          the T1 distance threshold
-   * @param t2
-   *          the T2 distance threshold
-   * @param convergenceDelta
-   *          the double convergence criteria
-   * @param maxIterations
-   *          an int number of iterations
-   * @param inputIsCanopies 
-              true if the input path already contains MeanShiftCanopies and 
does not need to be converted from Vectors
-   * @param runClustering 
-   *          true if the input points are to be clustered once the iterations 
complete
-   * @param runSequential if true run in sequential execution mode
-   */
-  public static void runJob(Path input,
-                            Path output,
-                            DistanceMeasure measure,
-                            double t1,
-                            double t2,
-                            double convergenceDelta,
-                            int maxIterations,
-                            boolean inputIsCanopies,
-                            boolean runClustering,
-                            boolean runSequential)
-    throws IOException, InterruptedException, ClassNotFoundException, 
InstantiationException, IllegalAccessException {
-    new MeanShiftCanopyDriver().job(input,
-                                    output,
-                                    measure,
-                                    t1,
-                                    t2,
-                                    convergenceDelta,
-                                    maxIterations,
-                                    inputIsCanopies,
-                                    runClustering,
-                                    runSequential);
-  }
-
   @Override
   public int run(String[] args) throws Exception {
     addInputOption();
@@ -146,7 +100,7 @@ public class MeanShiftCanopyDriver exten
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     DistanceMeasure measure = 
ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
 
-    job(input,
+    run(input,
         output,
         measure,
         t1,
@@ -234,7 +188,7 @@ public class MeanShiftCanopyDriver exten
    *          true if the input points are to be clustered once the iterations 
complete
    * @param runSequential if true run in sequential execution mode
    */
-  public void job(Path input,
+  public void run(Path input,
                   Path output,
                   DistanceMeasure measure,
                   double t1,

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
 Fri Sep 24 18:33:55 2010
@@ -73,7 +73,7 @@ final class DisplayMeanShift extends Dis
       DisplayClustering.plotRectangle(g2, v.get(), dv);
     }
     int i = 0;
-    for (Cluster cluster : CLUSTERS.get(CLUSTERS.size()-1)) {
+    for (Cluster cluster : CLUSTERS.get(CLUSTERS.size() - 1)) {
       MeanShiftCanopy canopy = (MeanShiftCanopy) cluster;
       if (canopy.getBoundPoints().toList().size() >= significance * 
DisplayClustering.SAMPLE_DATA.size()) {
         g2.setColor(COLORS[Math.min(i++, DisplayClustering.COLORS.length - 
1)]);
@@ -108,7 +108,7 @@ final class DisplayMeanShift extends Dis
     writeSampleData(samples);
     boolean b = true;
     if (b) {
-      MeanShiftCanopyDriver.runJob(samples, output, measure, t1, t2, 0.005, 
20, false, true, true);
+      new MeanShiftCanopyDriver().run(samples, output, measure, t1, t2, 0.005, 
20, false, true, true);
       loadClusters(output);
     } else {
       List<Vector> points = new ArrayList<Vector>();

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
 Fri Sep 24 18:33:55 2010
@@ -25,6 +25,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.DistanceMeasure;
@@ -33,7 +34,7 @@ import org.apache.mahout.utils.clusterin
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public final class Job extends CanopyDriver {
+public final class Job extends AbstractJob {
 
   private Job() {
   }

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
 Fri Sep 24 18:33:55 2010
@@ -39,6 +39,7 @@ import org.apache.mahout.clustering.diri
 import org.apache.mahout.clustering.dirichlet.models.NormalModelDistribution;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
 import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.math.RandomAccessSparseVector;
@@ -47,7 +48,7 @@ import org.apache.mahout.utils.clusterin
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public final class Job extends DirichletDriver {
+public final class Job extends AbstractJob {
 
   private static final Logger log = LoggerFactory.getLogger(Job.class);
 
@@ -75,16 +76,16 @@ public final class Job extends Dirichlet
     addOption(DefaultOptionCreator.maxIterationsOption().create());
     
addOption(DefaultOptionCreator.numClustersOption().withRequired(true).create());
     addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption(new 
DefaultOptionBuilder().withLongName(ALPHA_OPTION).withRequired(false).withShortName("m")
-        .withArgument(new 
ArgumentBuilder().withName(ALPHA_OPTION).withDefault("1.0").withMinimum(1).withMaximum(1).create())
-        .withDescription("The alpha0 value for the DirichletDistribution. 
Defaults to 1.0").create());
-    addOption(new 
DefaultOptionBuilder().withLongName(MODEL_DISTRIBUTION_CLASS_OPTION).withRequired(false).withShortName("md")
-        .withArgument(new 
ArgumentBuilder().withName(MODEL_DISTRIBUTION_CLASS_OPTION).withDefault(NormalModelDistribution.class
-            
.getName()).withMinimum(1).withMaximum(1).create()).withDescription("The 
ModelDistribution class name. "
-            + "Defaults to NormalModelDistribution").create());
-    addOption(new 
DefaultOptionBuilder().withLongName(MODEL_PROTOTYPE_CLASS_OPTION).withRequired(false).withShortName("mp")
-        .withArgument(new 
ArgumentBuilder().withName("prototypeClass").withDefault(RandomAccessSparseVector.class.getName())
-            .withMinimum(1).withMaximum(1).create())
+    addOption(new 
DefaultOptionBuilder().withLongName(DirichletDriver.ALPHA_OPTION).withRequired(false).withShortName("m")
+        .withArgument(new 
ArgumentBuilder().withName(DirichletDriver.ALPHA_OPTION).withDefault("1.0").withMinimum(1).withMaximum(1)
+            .create()).withDescription("The alpha0 value for the 
DirichletDistribution. Defaults to 1.0").create());
+    addOption(new 
DefaultOptionBuilder().withLongName(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION).withRequired(false)
+        .withShortName("md").withArgument(new 
ArgumentBuilder().withName(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION)
+            
.withDefault(NormalModelDistribution.class.getName()).withMinimum(1).withMaximum(1).create())
+        .withDescription("The ModelDistribution class name. " + "Defaults to 
NormalModelDistribution").create());
+    addOption(new 
DefaultOptionBuilder().withLongName(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION).withRequired(false)
+        .withShortName("mp").withArgument(new 
ArgumentBuilder().withName("prototypeClass")
+            
.withDefault(RandomAccessSparseVector.class.getName()).withMinimum(1).withMaximum(1).create())
         .withDescription("The ModelDistribution prototype Vector class name. 
Defaults to RandomAccessSparseVector").create());
     
addOption(DefaultOptionCreator.distanceMeasureOption().withRequired(false).create());
     addOption(DefaultOptionCreator.emitMostLikelyOption().create());
@@ -100,19 +101,18 @@ public final class Job extends Dirichlet
     if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
       HadoopUtil.overwriteOutput(output);
     }
-    String modelFactory = getOption(MODEL_DISTRIBUTION_CLASS_OPTION);
-    String modelPrototype = getOption(MODEL_PROTOTYPE_CLASS_OPTION);
+    String modelFactory = 
getOption(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION);
+    String modelPrototype = 
getOption(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION);
     String distanceMeasure = 
getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
     int numModels = 
Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
     int maxIterations = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     boolean emitMostLikely = 
Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION));
     double threshold = 
Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION));
-    double alpha0 = Double.parseDouble(getOption(ALPHA_OPTION));
-    int prototypeSize = DirichletDriver.readPrototypeSize(input);
+    double alpha0 = 
Double.parseDouble(getOption(DirichletDriver.ALPHA_OPTION));
     AbstractVectorModelDistribution modelDistribution = 
DirichletDriver.createModelDistribution(modelFactory,
                                                                                
                 modelPrototype,
                                                                                
                 distanceMeasure,
-                                                                               
                 prototypeSize);
+                                                                               
                 60);
 
     run(input, output, modelDistribution, numModels, maxIterations, alpha0, 
emitMostLikely, threshold);
     return 0;
@@ -134,14 +134,14 @@ public final class Job extends Dirichlet
    * @param alpha0
    *          the alpha0 value for the DirichletDistribution
    */
-  private void run(Path input,
-                   Path output,
-                   ModelDistribution<VectorWritable> modelDistribution,
-                   int numModels,
-                   int maxIterations,
-                   double alpha0,
-                   boolean emitMostLikely,
-                   double threshold) throws IOException, 
ClassNotFoundException, InstantiationException, IllegalAccessException,
+  public void run(Path input,
+                  Path output,
+                  ModelDistribution<VectorWritable> modelDistribution,
+                  int numModels,
+                  int maxIterations,
+                  double alpha0,
+                  boolean emitMostLikely,
+                  double threshold) throws IOException, 
ClassNotFoundException, InstantiationException, IllegalAccessException,
       NoSuchMethodException, InvocationTargetException, SecurityException, 
InterruptedException {
     Path directoryContainingConvertedInput = new Path(output, 
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     InputDriver.runJob(input, directoryContainingConvertedInput, 
"org.apache.mahout.math.RandomAccessSparseVector");
@@ -182,11 +182,11 @@ public final class Job extends Dirichlet
                                   double alpha0) throws NoSuchMethodException, 
InvocationTargetException {
     Collection<List<DirichletCluster>> clusters = new 
ArrayList<List<DirichletCluster>>();
     Configuration conf = new Configuration();
-    conf.set(MODEL_DISTRIBUTION_KEY, modelDistribution.asJsonString());
-    conf.set(NUM_CLUSTERS_KEY, Integer.toString(numModels));
-    conf.set(ALPHA_0_KEY, Double.toString(alpha0));
+    conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY, 
modelDistribution.asJsonString());
+    conf.set(DirichletDriver.NUM_CLUSTERS_KEY, Integer.toString(numModels));
+    conf.set(DirichletDriver.ALPHA_0_KEY, Double.toString(alpha0));
     for (int i = 0; i < numIterations; i++) {
-      conf.set(STATE_IN_KEY, output + "/clusters-" + i);
+      conf.set(DirichletDriver.STATE_IN_KEY, output + "/clusters-" + i);
       clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
     }
     printClusters(clusters, 0);

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
 Fri Sep 24 18:33:55 2010
@@ -23,14 +23,15 @@ import java.util.Map;
 import org.apache.commons.cli2.builder.ArgumentBuilder;
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
 import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.DistanceMeasure;
@@ -40,8 +41,9 @@ import org.apache.mahout.utils.clusterin
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public final class Job extends FuzzyKMeansDriver {
+public final class Job extends AbstractJob {
 
+  private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
   private static final Logger log = LoggerFactory.getLogger(Job.class);
 
   private Job() {
@@ -55,30 +57,21 @@ public final class Job extends FuzzyKMea
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80, 
55, 10, 1, (float) 2, 0.5);
+      new Job().run(new Configuration(), new Path("testdata"), output, new 
EuclideanDistanceMeasure(), 80, 55, 10, (float) 2, 0.5);
     }
   }
 
   @Override
   public int run(String[] args) throws Exception {
-
     addInputOption();
     addOutputOption();
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
-    addOption(DefaultOptionCreator.clustersInOption()
-        .withDescription("The input centroids, as Vectors.  Must be a 
SequenceFile of Writable, Cluster/Canopy.  "
-            + "If k is also specified, then a random set of vectors will be 
selected" + " and written out to this path first")
-        .create());
-    addOption(DefaultOptionCreator.numClustersOption()
-        .withDescription("The k in k-Means.  If specified, then a random 
selection of k Vectors will be chosen"
-            + " as the Centroid and written to the clusters input 
path.").create());
     addOption(DefaultOptionCreator.convergenceOption().create());
     addOption(DefaultOptionCreator.maxIterationsOption().create());
     addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption(DefaultOptionCreator.numReducersOption().create());
-    addOption(DefaultOptionCreator.clusteringOption().create());
     addOption(DefaultOptionCreator.t1Option().create());
     addOption(DefaultOptionCreator.t2Option().create());
+    addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be 
greater than 1", true);
 
     Map<String, String> argMap = parseArguments(args);
     if (argMap == null) {
@@ -86,45 +79,58 @@ public final class Job extends FuzzyKMea
     }
 
     Path input = getInputPath();
-    Path clusters = new 
Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
     Path output = getOutputPath();
     String measureClass = 
getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
     if (measureClass == null) {
       measureClass = SquaredEuclideanDistanceMeasure.class.getName();
     }
     double convergenceDelta = 
Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
-    int numReduceTasks = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_REDUCERS_OPTION));
     int maxIterations = 
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
     float fuzziness = Float.parseFloat(getOption(M_OPTION));
 
-    addOption(new 
DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true).withArgument(new
 ArgumentBuilder()
-        .withName(M_OPTION).withMinimum(1).withMaximum(1).create())
-        .withDescription("coefficient normalization factor, must be greater 
than 1").withShortName(M_OPTION).create());
+    addOption(new 
DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
+        .withArgument(new 
ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
+        .withDescription("coefficient normalization factor, must be greater 
than 1").withShortName(M_OPTION)
+        .create());
     if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
       HadoopUtil.overwriteOutput(output);
     }
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     DistanceMeasure measure = 
ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
-
-    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
-      clusters = RandomSeedGenerator.buildRandom(input, clusters, 
Integer.parseInt(argMap
-          .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
-    }
-    //boolean runClustering = 
hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
     double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
     double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    run(input, output, measure, t1, t2, maxIterations, numReduceTasks, 
fuzziness, convergenceDelta);
+    run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness, 
convergenceDelta);
     return 0;
   }
 
   /**
+   * Return the path to the final iteration's clusters
+   * 
+   * @param conf 
+   * @param output
+   * @param maxIterations
+   * @return
+   * @throws IOException 
+   */
+  private Path finalClusterPath(Configuration conf, Path output, int 
maxIterations) throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    for (int i = maxIterations; i >= 0; i--) {
+      Path clusters = new Path(output, "clusters-" + i);
+      if (fs.exists(clusters)) {
+        return clusters;
+      }
+    }
+    return null;
+  }
+
+  /**
    * Run the kmeans clustering job on an input dataset using the given 
distance measure, t1, t2 and iteration
    * parameters. All output data will be written to the output directory, 
which will be initially deleted if
    * it exists. The clustered points will reside in the path 
<output>/clustered-points. By default, the job
    * expects the a file containing synthetic_control.data as obtained from
    * 
http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series 
resides in a directory named
    * "testdata", and writes output to a directory named "output".
-   * 
+   * @param conf TODO
    * @param input
    *          the String denoting the input directory path
    * @param output
@@ -135,24 +141,21 @@ public final class Job extends FuzzyKMea
    *          the canopy T2 threshold
    * @param maxIterations 
    *          the int maximum number of iterations
-   * @param numReducerTasks 
-   *          the int number of reducer tasks
    * @param fuzziness 
    *          the float "m" fuzziness coefficient
    * @param convergenceDelta
    *          the double convergence criteria for iterations
    */
-  private static void run(Path input,
-                          Path output,
-                          DistanceMeasure measure,
-                          double t1,
-                          double t2,
-                          int maxIterations,
-                          int numReducerTasks,
-                          float fuzziness,
-                          double convergenceDelta) throws IOException, 
InstantiationException, IllegalAccessException,
+  public void run(Configuration conf,
+                  Path input,
+                  Path output,
+                  DistanceMeasure measure,
+                  double t1,
+                  double t2,
+                  int maxIterations,
+                  float fuzziness,
+                  double convergenceDelta) throws IOException, 
InstantiationException, IllegalAccessException,
       InterruptedException, ClassNotFoundException {
-
     Path directoryContainingConvertedInput = new Path(output, 
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     log.info("Preparing Input");
     InputDriver.runJob(input, directoryContainingConvertedInput, 
"org.apache.mahout.math.RandomAccessSparseVector");
@@ -160,18 +163,19 @@ public final class Job extends FuzzyKMea
     CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, 
output, measure, t1, t2, false, false);
     log.info("Running FuzzyKMeans");
     FuzzyKMeansDriver.run(directoryContainingConvertedInput,
-    new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
-    output,
-    measure,
-    convergenceDelta,
-    maxIterations,
-    fuzziness,
-    true,
-    true,
-    0.0,
-    false);
+                          new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
+                          output,
+                          measure,
+                          convergenceDelta,
+                          maxIterations,
+                          fuzziness,
+                          true,
+                          true,
+                          0.0,
+                          false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, 
"clusters-3"), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, 
output, maxIterations), new Path(output,
+                                                                               
                             "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
 }

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
 Fri Sep 24 18:33:55 2010
@@ -21,14 +21,15 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.canopy.CanopyDriver;
 import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
 import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.DistanceMeasure;
@@ -38,7 +39,7 @@ import org.apache.mahout.utils.clusterin
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public final class Job extends KMeansDriver {
+public final class Job extends AbstractJob {
 
   private static final Logger log = LoggerFactory.getLogger(Job.class);
 
@@ -53,27 +54,20 @@ public final class Job extends KMeansDri
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      new Job().run(new Path("testdata"), output, new 
EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
+      new Job().run(new Configuration(), new Path("testdata"), output, new 
EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
     }
   }
 
   @Override
   public int run(String[] args) throws Exception {
-
     addInputOption();
     addOutputOption();
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
-    addOption(DefaultOptionCreator.clustersInOption()
-        .withDescription("The input centroids, as Vectors.  Must be a 
SequenceFile of Writable, Cluster/Canopy.  "
-            + "If k is also specified, then a random set of vectors will be 
selected" + " and written out to this path first")
-        .create());
-    addOption(DefaultOptionCreator.numClustersOption()
-        .withDescription("The k in k-Means.  If specified, then a random 
selection of k Vectors will be chosen"
-            + " as the Centroid and written to the clusters input 
path.").create());
+    addOption(DefaultOptionCreator.t1Option().create());
+    addOption(DefaultOptionCreator.t2Option().create());
     addOption(DefaultOptionCreator.convergenceOption().create());
     addOption(DefaultOptionCreator.maxIterationsOption().create());
     addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption(DefaultOptionCreator.clusteringOption().create());
 
     Map<String, String> argMap = parseArguments(args);
     if (argMap == null) {
@@ -81,7 +75,6 @@ public final class Job extends KMeansDri
     }
 
     Path input = getInputPath();
-    Path clusters = new Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
     Path output = getOutputPath();
     String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
     if (measureClass == null) {
@@ -95,19 +88,9 @@ public final class Job extends KMeansDri
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     Class<?> cl = ccl.loadClass(measureClass);
     DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
-    if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
-      clusters = RandomSeedGenerator.buildRandom(input, clusters, Integer.parseInt(argMap
-          .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
-    }
-    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
-    run(input,
-    clusters,
-    output,
-    measure,
-    convergenceDelta,
-    maxIterations,
-    runClustering,
-    false);
+    double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+    double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+    run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
     return 0;
   }
 
@@ -118,7 +101,7 @@ public final class Job extends KMeansDri
    * expects the a file containing synthetic_control.data as obtained from
    * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named
    * "testdata", and writes output to a directory named "output".
-   * 
+   * @param conf the Configuration to use
    * @param input
    *          the String denoting the input directory path
    * @param output
@@ -133,38 +116,59 @@ public final class Job extends KMeansDri
    *          the double convergence criteria for iterations
    * @param maxIterations
    *          the int maximum number of iterations
+   * 
    * @throws IllegalAccessException 
    * @throws InstantiationException 
    * @throws ClassNotFoundException 
    * @throws InterruptedException 
    */
-  private void run(Path input,
-                   Path output,
-                   DistanceMeasure measure,
-                   double t1,
-                   double t2,
-                   double convergenceDelta,
-                   int maxIterations) throws IOException, InstantiationException, IllegalAccessException, InterruptedException,
+  public void run(Configuration conf,
+                  Path input,
+                  Path output,
+                  DistanceMeasure measure,
+                  double t1,
+                  double t2,
+                  double convergenceDelta,
+                  int maxIterations) throws IOException, InstantiationException, IllegalAccessException, InterruptedException,
       ClassNotFoundException {
-    HadoopUtil.overwriteOutput(output);
-
     Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     log.info("Preparing Input");
     InputDriver.runJob(input, directoryContainingConvertedInput, "org.apache.mahout.math.RandomAccessSparseVector");
     log.info("Running Canopy to get initial clusters");
-    CanopyDriver.run(new Configuration(), directoryContainingConvertedInput, output, measure, t1, t2, false, false);
+    CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure, t1, t2, false, false);
     log.info("Running KMeans");
-    KMeansDriver.run(directoryContainingConvertedInput,
-    new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
-    output,
-    measure,
-    convergenceDelta,
-    maxIterations,
-    true,
-    false);
+    KMeansDriver.run(conf,
+                     directoryContainingConvertedInput,
+                     new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
+                     output,
+                     measure,
+                     convergenceDelta,
+                     maxIterations,
+                     true,
+                     false);
     // run ClusterDumper
-    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-" + maxIterations), new Path(output,
+    ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf, output, maxIterations), new Path(output,
                                                                                
                             "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
+
+  /**
+   * Return the path to the final iteration's clusters
+   * 
+   * @param conf 
+   * @param output
+   * @param maxIterations
+   * @return
+   * @throws IOException 
+   */
+  private Path finalClusterPath(Configuration conf, Path output, int maxIterations) throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    for (int i = maxIterations; i >= 0; i--) {
+      Path clusters = new Path(output, "clusters-" + i);
+      if (fs.exists(clusters)) {
+        return clusters;
+      }
+    }
+    return null;
+  }
 }

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
 Fri Sep 24 18:33:55 2010
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
 import org.apache.mahout.clustering.syntheticcontrol.Constants;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
 import org.apache.mahout.common.distance.DistanceMeasure;
@@ -35,7 +36,7 @@ import org.apache.mahout.utils.clusterin
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public final class Job extends MeanShiftCanopyDriver {
+public final class Job extends AbstractJob {
 
   private static final Logger log = LoggerFactory.getLogger(Job.class);
 
@@ -50,7 +51,7 @@ public final class Job extends MeanShift
       log.info("Running with default arguments");
       Path output = new Path("output");
       HadoopUtil.overwriteOutput(output);
-      run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 47.6, 1, 0.5, 10);
+      new Job().run(new Configuration(), new Path("testdata"), output, new EuclideanDistanceMeasure(), 47.6, 1, 0.5, 10);
     }
   }
 
@@ -61,8 +62,9 @@ public final class Job extends MeanShift
     addOption(DefaultOptionCreator.convergenceOption().create());
     addOption(DefaultOptionCreator.maxIterationsOption().create());
     addOption(DefaultOptionCreator.overwriteOption().create());
-    addOption(new DefaultOptionBuilder().withLongName(INPUT_IS_CANOPIES_OPTION).withRequired(false).withShortName("ic")
-        .withArgument(new ArgumentBuilder().withName(INPUT_IS_CANOPIES_OPTION).withMinimum(1).withMaximum(1).create())
+    addOption(new DefaultOptionBuilder().withLongName(MeanShiftCanopyDriver.INPUT_IS_CANOPIES_OPTION).withRequired(false)
+        .withShortName("ic").withArgument(new ArgumentBuilder().withName(MeanShiftCanopyDriver.INPUT_IS_CANOPIES_OPTION)
+            .withMinimum(1).withMaximum(1).create())
         .withDescription("If present, the input directory already contains MeanShiftCanopies").create());
     addOption(DefaultOptionCreator.distanceMeasureOption().create());
     addOption(DefaultOptionCreator.t1Option().create());
@@ -82,14 +84,12 @@ public final class Job extends MeanShift
     String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
     double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
     double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
-    boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
     double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
     int maxIterations = Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
-    boolean inputIsCanopies = hasOption(INPUT_IS_CANOPIES_OPTION);
     ClassLoader ccl = Thread.currentThread().getContextClassLoader();
     DistanceMeasure measure = (DistanceMeasure) ((Class<?>) ccl.loadClass(measureClass)).newInstance();
 
-    runJob(input, output, measure, t1, t2, convergenceDelta, maxIterations, inputIsCanopies, runClustering, false);
+    run(getConf(), input, output, measure, t1, t2, convergenceDelta, maxIterations);
     return 0;
   }
 
@@ -100,7 +100,7 @@ public final class Job extends MeanShift
    * the job expects the a file containing synthetic_control.data as obtained from
    * http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series resides in a directory named
    * "testdata", and writes output to a directory named "output".
-   * 
+   * @param conf TODO
    * @param input
    *          the String denoting the input directory path
    * @param output
@@ -116,29 +116,30 @@ public final class Job extends MeanShift
    * @param maxIterations
    *          the int maximum number of iterations
    */
-  private static void run(Path input,
-                          Path output,
-                          DistanceMeasure measure,
-                          double t1,
-                          double t2,
-                          double convergenceDelta,
-                          int maxIterations)
-    throws IOException, InterruptedException, ClassNotFoundException, InstantiationException, IllegalAccessException {
+  public void run(Configuration conf,
+                  Path input,
+                  Path output,
+                  DistanceMeasure measure,
+                  double t1,
+                  double t2,
+                  double convergenceDelta,
+                  int maxIterations) throws IOException, InterruptedException, ClassNotFoundException, InstantiationException,
+      IllegalAccessException {
     Path directoryContainingConvertedInput = new Path(output, Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
     InputDriver.runJob(input, directoryContainingConvertedInput);
-    MeanShiftCanopyDriver.runJob(directoryContainingConvertedInput,
-                                 output,
-                                 measure,
-                                 t1,
-                                 t2,
-                                 convergenceDelta,
-                                 maxIterations,
-                                 true,
-                                 true,
-                                 false);
+    new MeanShiftCanopyDriver().run(directoryContainingConvertedInput,
+                                    output,
+                                    measure,
+                                    t1,
+                                    t2,
+                                    convergenceDelta,
+                                    maxIterations,
+                                    true,
+                                    true,
+                                    false);
     // run ClusterDumper
-    ClusterDumper clusterDumper =
-        new ClusterDumper(new Path(output, "clusters-" + maxIterations), new Path(output, "clusteredPoints"));
+    ClusterDumper clusterDumper = new ClusterDumper(new Path(output, "clusters-" + maxIterations), new Path(output,
+                                                                               
                             "clusteredPoints"));
     clusterDumper.printClusters(null);
   }
 


Reply via email to