Author: jeastman
Date: Fri Sep 24 18:33:55 2010
New Revision: 1001015
URL: http://svn.apache.org/viewvc?rev=1001015&view=rev
Log:
MAHOUT-504. Fixed CLI arguments and did other refactoring of synthetic control
example. Tested CLI invocation with explicit arguments, which was the source of
the problems cited in this issue. All tests run.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
Fri Sep 24 18:33:55 2010
@@ -71,11 +71,11 @@ public class DirichletDriver extends Abs
public static final String THRESHOLD_KEY =
"org.apache.mahout.clustering.dirichlet.threshold";
- protected static final String MODEL_PROTOTYPE_CLASS_OPTION =
"modelPrototype";
+ public static final String MODEL_PROTOTYPE_CLASS_OPTION = "modelPrototype";
- protected static final String MODEL_DISTRIBUTION_CLASS_OPTION = "modelDist";
+ public static final String MODEL_DISTRIBUTION_CLASS_OPTION = "modelDist";
- protected static final String ALPHA_OPTION = "alpha";
+ public static final String ALPHA_OPTION = "alpha";
private static final Logger log =
LoggerFactory.getLogger(DirichletDriver.class);
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
Fri Sep 24 18:33:55 2010
@@ -55,9 +55,7 @@ import org.slf4j.LoggerFactory;
public class FuzzyKMeansDriver extends AbstractJob {
- protected static final String M_OPTION = "m";
-
- public static final String M_OPTION_KEY = "--" + M_OPTION;
+ public static final String M_OPTION = "m";
private static final Logger log =
LoggerFactory.getLogger(FuzzyKMeansDriver.class);
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
Fri Sep 24 18:33:55 2010
@@ -51,7 +51,7 @@ import org.slf4j.LoggerFactory;
public class MeanShiftCanopyDriver extends AbstractJob {
- protected static final String INPUT_IS_CANOPIES_OPTION = "inputIsCanopies";
+ public static final String INPUT_IS_CANOPIES_OPTION = "inputIsCanopies";
private static final Logger log =
LoggerFactory.getLogger(MeanShiftCanopyDriver.class);
@@ -63,52 +63,6 @@ public class MeanShiftCanopyDriver exten
new MeanShiftCanopyDriver().run(args);
}
- /**
- * Run the job on a new driver instance (convenience)
- *
- * @param input
- * the input pathname String
- * @param output
- * the output pathname String
- * @param measure
- * the DistanceMeasure
- * @param t1
- * the T1 distance threshold
- * @param t2
- * the T2 distance threshold
- * @param convergenceDelta
- * the double convergence criteria
- * @param maxIterations
- * an int number of iterations
- * @param inputIsCanopies
- true if the input path already contains MeanShiftCanopies and
does not need to be converted from Vectors
- * @param runClustering
- * true if the input points are to be clustered once the iterations
complete
- * @param runSequential if true run in sequential execution mode
- */
- public static void runJob(Path input,
- Path output,
- DistanceMeasure measure,
- double t1,
- double t2,
- double convergenceDelta,
- int maxIterations,
- boolean inputIsCanopies,
- boolean runClustering,
- boolean runSequential)
- throws IOException, InterruptedException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
- new MeanShiftCanopyDriver().job(input,
- output,
- measure,
- t1,
- t2,
- convergenceDelta,
- maxIterations,
- inputIsCanopies,
- runClustering,
- runSequential);
- }
-
@Override
public int run(String[] args) throws Exception {
addInputOption();
@@ -146,7 +100,7 @@ public class MeanShiftCanopyDriver exten
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
DistanceMeasure measure =
ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
- job(input,
+ run(input,
output,
measure,
t1,
@@ -234,7 +188,7 @@ public class MeanShiftCanopyDriver exten
* true if the input points are to be clustered once the iterations
complete
* @param runSequential if true run in sequential execution mode
*/
- public void job(Path input,
+ public void run(Path input,
Path output,
DistanceMeasure measure,
double t1,
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
Fri Sep 24 18:33:55 2010
@@ -73,7 +73,7 @@ final class DisplayMeanShift extends Dis
DisplayClustering.plotRectangle(g2, v.get(), dv);
}
int i = 0;
- for (Cluster cluster : CLUSTERS.get(CLUSTERS.size()-1)) {
+ for (Cluster cluster : CLUSTERS.get(CLUSTERS.size() - 1)) {
MeanShiftCanopy canopy = (MeanShiftCanopy) cluster;
if (canopy.getBoundPoints().toList().size() >= significance *
DisplayClustering.SAMPLE_DATA.size()) {
g2.setColor(COLORS[Math.min(i++, DisplayClustering.COLORS.length -
1)]);
@@ -108,7 +108,7 @@ final class DisplayMeanShift extends Dis
writeSampleData(samples);
boolean b = true;
if (b) {
- MeanShiftCanopyDriver.runJob(samples, output, measure, t1, t2, 0.005,
20, false, true, true);
+ new MeanShiftCanopyDriver().run(samples, output, measure, t1, t2, 0.005,
20, false, true, true);
loadClusters(output);
} else {
List<Vector> points = new ArrayList<Vector>();
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/Job.java
Fri Sep 24 18:33:55 2010
@@ -25,6 +25,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
@@ -33,7 +34,7 @@ import org.apache.mahout.utils.clusterin
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public final class Job extends CanopyDriver {
+public final class Job extends AbstractJob {
private Job() {
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/dirichlet/Job.java
Fri Sep 24 18:33:55 2010
@@ -39,6 +39,7 @@ import org.apache.mahout.clustering.diri
import org.apache.mahout.clustering.dirichlet.models.NormalModelDistribution;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.math.RandomAccessSparseVector;
@@ -47,7 +48,7 @@ import org.apache.mahout.utils.clusterin
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public final class Job extends DirichletDriver {
+public final class Job extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(Job.class);
@@ -75,16 +76,16 @@ public final class Job extends Dirichlet
addOption(DefaultOptionCreator.maxIterationsOption().create());
addOption(DefaultOptionCreator.numClustersOption().withRequired(true).create());
addOption(DefaultOptionCreator.overwriteOption().create());
- addOption(new
DefaultOptionBuilder().withLongName(ALPHA_OPTION).withRequired(false).withShortName("m")
- .withArgument(new
ArgumentBuilder().withName(ALPHA_OPTION).withDefault("1.0").withMinimum(1).withMaximum(1).create())
- .withDescription("The alpha0 value for the DirichletDistribution.
Defaults to 1.0").create());
- addOption(new
DefaultOptionBuilder().withLongName(MODEL_DISTRIBUTION_CLASS_OPTION).withRequired(false).withShortName("md")
- .withArgument(new
ArgumentBuilder().withName(MODEL_DISTRIBUTION_CLASS_OPTION).withDefault(NormalModelDistribution.class
-
.getName()).withMinimum(1).withMaximum(1).create()).withDescription("The
ModelDistribution class name. "
- + "Defaults to NormalModelDistribution").create());
- addOption(new
DefaultOptionBuilder().withLongName(MODEL_PROTOTYPE_CLASS_OPTION).withRequired(false).withShortName("mp")
- .withArgument(new
ArgumentBuilder().withName("prototypeClass").withDefault(RandomAccessSparseVector.class.getName())
- .withMinimum(1).withMaximum(1).create())
+ addOption(new
DefaultOptionBuilder().withLongName(DirichletDriver.ALPHA_OPTION).withRequired(false).withShortName("m")
+ .withArgument(new
ArgumentBuilder().withName(DirichletDriver.ALPHA_OPTION).withDefault("1.0").withMinimum(1).withMaximum(1)
+ .create()).withDescription("The alpha0 value for the
DirichletDistribution. Defaults to 1.0").create());
+ addOption(new
DefaultOptionBuilder().withLongName(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION).withRequired(false)
+ .withShortName("md").withArgument(new
ArgumentBuilder().withName(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION)
+
.withDefault(NormalModelDistribution.class.getName()).withMinimum(1).withMaximum(1).create())
+ .withDescription("The ModelDistribution class name. " + "Defaults to
NormalModelDistribution").create());
+ addOption(new
DefaultOptionBuilder().withLongName(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION).withRequired(false)
+ .withShortName("mp").withArgument(new
ArgumentBuilder().withName("prototypeClass")
+
.withDefault(RandomAccessSparseVector.class.getName()).withMinimum(1).withMaximum(1).create())
.withDescription("The ModelDistribution prototype Vector class name.
Defaults to RandomAccessSparseVector").create());
addOption(DefaultOptionCreator.distanceMeasureOption().withRequired(false).create());
addOption(DefaultOptionCreator.emitMostLikelyOption().create());
@@ -100,19 +101,18 @@ public final class Job extends Dirichlet
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.overwriteOutput(output);
}
- String modelFactory = getOption(MODEL_DISTRIBUTION_CLASS_OPTION);
- String modelPrototype = getOption(MODEL_PROTOTYPE_CLASS_OPTION);
+ String modelFactory =
getOption(DirichletDriver.MODEL_DISTRIBUTION_CLASS_OPTION);
+ String modelPrototype =
getOption(DirichletDriver.MODEL_PROTOTYPE_CLASS_OPTION);
String distanceMeasure =
getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
int numModels =
Integer.parseInt(getOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION));
int maxIterations =
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
boolean emitMostLikely =
Boolean.parseBoolean(getOption(DefaultOptionCreator.EMIT_MOST_LIKELY_OPTION));
double threshold =
Double.parseDouble(getOption(DefaultOptionCreator.THRESHOLD_OPTION));
- double alpha0 = Double.parseDouble(getOption(ALPHA_OPTION));
- int prototypeSize = DirichletDriver.readPrototypeSize(input);
+ double alpha0 =
Double.parseDouble(getOption(DirichletDriver.ALPHA_OPTION));
AbstractVectorModelDistribution modelDistribution =
DirichletDriver.createModelDistribution(modelFactory,
modelPrototype,
distanceMeasure,
-
prototypeSize);
+
60);
run(input, output, modelDistribution, numModels, maxIterations, alpha0,
emitMostLikely, threshold);
return 0;
@@ -134,14 +134,14 @@ public final class Job extends Dirichlet
* @param alpha0
* the alpha0 value for the DirichletDistribution
*/
- private void run(Path input,
- Path output,
- ModelDistribution<VectorWritable> modelDistribution,
- int numModels,
- int maxIterations,
- double alpha0,
- boolean emitMostLikely,
- double threshold) throws IOException,
ClassNotFoundException, InstantiationException, IllegalAccessException,
+ public void run(Path input,
+ Path output,
+ ModelDistribution<VectorWritable> modelDistribution,
+ int numModels,
+ int maxIterations,
+ double alpha0,
+ boolean emitMostLikely,
+ double threshold) throws IOException,
ClassNotFoundException, InstantiationException, IllegalAccessException,
NoSuchMethodException, InvocationTargetException, SecurityException,
InterruptedException {
Path directoryContainingConvertedInput = new Path(output,
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
InputDriver.runJob(input, directoryContainingConvertedInput,
"org.apache.mahout.math.RandomAccessSparseVector");
@@ -182,11 +182,11 @@ public final class Job extends Dirichlet
double alpha0) throws NoSuchMethodException,
InvocationTargetException {
Collection<List<DirichletCluster>> clusters = new
ArrayList<List<DirichletCluster>>();
Configuration conf = new Configuration();
- conf.set(MODEL_DISTRIBUTION_KEY, modelDistribution.asJsonString());
- conf.set(NUM_CLUSTERS_KEY, Integer.toString(numModels));
- conf.set(ALPHA_0_KEY, Double.toString(alpha0));
+ conf.set(DirichletDriver.MODEL_DISTRIBUTION_KEY,
modelDistribution.asJsonString());
+ conf.set(DirichletDriver.NUM_CLUSTERS_KEY, Integer.toString(numModels));
+ conf.set(DirichletDriver.ALPHA_0_KEY, Double.toString(alpha0));
for (int i = 0; i < numIterations; i++) {
- conf.set(STATE_IN_KEY, output + "/clusters-" + i);
+ conf.set(DirichletDriver.STATE_IN_KEY, output + "/clusters-" + i);
clusters.add(DirichletMapper.getDirichletState(conf).getClusters());
}
printClusters(clusters, 0);
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/fuzzykmeans/Job.java
Fri Sep 24 18:33:55 2010
@@ -23,14 +23,15 @@ import java.util.Map;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.fuzzykmeans.FuzzyKMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
@@ -40,8 +41,9 @@ import org.apache.mahout.utils.clusterin
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public final class Job extends FuzzyKMeansDriver {
+public final class Job extends AbstractJob {
+ private static final String M_OPTION = FuzzyKMeansDriver.M_OPTION;
private static final Logger log = LoggerFactory.getLogger(Job.class);
private Job() {
@@ -55,30 +57,21 @@ public final class Job extends FuzzyKMea
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 80,
55, 10, 1, (float) 2, 0.5);
+ new Job().run(new Configuration(), new Path("testdata"), output, new
EuclideanDistanceMeasure(), 80, 55, 10, (float) 2, 0.5);
}
}
@Override
public int run(String[] args) throws Exception {
-
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.distanceMeasureOption().create());
- addOption(DefaultOptionCreator.clustersInOption()
- .withDescription("The input centroids, as Vectors. Must be a
SequenceFile of Writable, Cluster/Canopy. "
- + "If k is also specified, then a random set of vectors will be
selected" + " and written out to this path first")
- .create());
- addOption(DefaultOptionCreator.numClustersOption()
- .withDescription("The k in k-Means. If specified, then a random
selection of k Vectors will be chosen"
- + " as the Centroid and written to the clusters input
path.").create());
addOption(DefaultOptionCreator.convergenceOption().create());
addOption(DefaultOptionCreator.maxIterationsOption().create());
addOption(DefaultOptionCreator.overwriteOption().create());
- addOption(DefaultOptionCreator.numReducersOption().create());
- addOption(DefaultOptionCreator.clusteringOption().create());
addOption(DefaultOptionCreator.t1Option().create());
addOption(DefaultOptionCreator.t2Option().create());
+ addOption(M_OPTION, M_OPTION, "coefficient normalization factor, must be
greater than 1", true);
Map<String, String> argMap = parseArguments(args);
if (argMap == null) {
@@ -86,45 +79,58 @@ public final class Job extends FuzzyKMea
}
Path input = getInputPath();
- Path clusters = new
Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
Path output = getOutputPath();
String measureClass =
getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
if (measureClass == null) {
measureClass = SquaredEuclideanDistanceMeasure.class.getName();
}
double convergenceDelta =
Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
- int numReduceTasks =
Integer.parseInt(getOption(DefaultOptionCreator.MAX_REDUCERS_OPTION));
int maxIterations =
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
float fuzziness = Float.parseFloat(getOption(M_OPTION));
- addOption(new
DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true).withArgument(new
ArgumentBuilder()
- .withName(M_OPTION).withMinimum(1).withMaximum(1).create())
- .withDescription("coefficient normalization factor, must be greater
than 1").withShortName(M_OPTION).create());
+ addOption(new
DefaultOptionBuilder().withLongName(M_OPTION).withRequired(true)
+ .withArgument(new
ArgumentBuilder().withName(M_OPTION).withMinimum(1).withMaximum(1).create())
+ .withDescription("coefficient normalization factor, must be greater
than 1").withShortName(M_OPTION)
+ .create());
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.overwriteOutput(output);
}
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
DistanceMeasure measure =
ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
-
- if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
- clusters = RandomSeedGenerator.buildRandom(input, clusters,
Integer.parseInt(argMap
- .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
- }
- //boolean runClustering =
hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- run(input, output, measure, t1, t2, maxIterations, numReduceTasks,
fuzziness, convergenceDelta);
+ run(getConf(), input, output, measure, t1, t2, maxIterations, fuzziness,
convergenceDelta);
return 0;
}
/**
+ * Return the path to the final iteration's clusters
+ *
+ * @param conf
+ * @param output
+ * @param maxIterations
+ * @return
+ * @throws IOException
+ */
+ private Path finalClusterPath(Configuration conf, Path output, int
maxIterations) throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+ for (int i = maxIterations; i >= 0; i--) {
+ Path clusters = new Path(output, "clusters-" + i);
+ if (fs.exists(clusters)) {
+ return clusters;
+ }
+ }
+ return null;
+ }
+
+ /**
* Run the kmeans clustering job on an input dataset using the given
distance measure, t1, t2 and iteration
* parameters. All output data will be written to the output directory,
which will be initially deleted if
* it exists. The clustered points will reside in the path
<output>/clustered-points. By default, the job
* expects the a file containing synthetic_control.data as obtained from
*
http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
resides in a directory named
* "testdata", and writes output to a directory named "output".
- *
+ * @param conf TODO
* @param input
* the String denoting the input directory path
* @param output
@@ -135,24 +141,21 @@ public final class Job extends FuzzyKMea
* the canopy T2 threshold
* @param maxIterations
* the int maximum number of iterations
- * @param numReducerTasks
- * the int number of reducer tasks
* @param fuzziness
* the float "m" fuzziness coefficient
* @param convergenceDelta
* the double convergence criteria for iterations
*/
- private static void run(Path input,
- Path output,
- DistanceMeasure measure,
- double t1,
- double t2,
- int maxIterations,
- int numReducerTasks,
- float fuzziness,
- double convergenceDelta) throws IOException,
InstantiationException, IllegalAccessException,
+ public void run(Configuration conf,
+ Path input,
+ Path output,
+ DistanceMeasure measure,
+ double t1,
+ double t2,
+ int maxIterations,
+ float fuzziness,
+ double convergenceDelta) throws IOException,
InstantiationException, IllegalAccessException,
InterruptedException, ClassNotFoundException {
-
Path directoryContainingConvertedInput = new Path(output,
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
log.info("Preparing Input");
InputDriver.runJob(input, directoryContainingConvertedInput,
"org.apache.mahout.math.RandomAccessSparseVector");
@@ -160,18 +163,19 @@ public final class Job extends FuzzyKMea
CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
output, measure, t1, t2, false, false);
log.info("Running FuzzyKMeans");
FuzzyKMeansDriver.run(directoryContainingConvertedInput,
- new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
- output,
- measure,
- convergenceDelta,
- maxIterations,
- fuzziness,
- true,
- true,
- 0.0,
- false);
+ new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
+ output,
+ measure,
+ convergenceDelta,
+ maxIterations,
+ fuzziness,
+ true,
+ true,
+ 0.0,
+ false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-3"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, maxIterations), new Path(output,
+
"clusteredPoints"));
clusterDumper.printClusters(null);
}
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
Fri Sep 24 18:33:55 2010
@@ -21,14 +21,15 @@ import java.io.IOException;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.kmeans.KMeansDriver;
-import org.apache.mahout.clustering.kmeans.RandomSeedGenerator;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
@@ -38,7 +39,7 @@ import org.apache.mahout.utils.clusterin
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public final class Job extends KMeansDriver {
+public final class Job extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(Job.class);
@@ -53,27 +54,20 @@ public final class Job extends KMeansDri
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- new Job().run(new Path("testdata"), output, new
EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
+ new Job().run(new Configuration(), new Path("testdata"), output, new
EuclideanDistanceMeasure(), 80, 55, 0.5, 10);
}
}
@Override
public int run(String[] args) throws Exception {
-
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.distanceMeasureOption().create());
- addOption(DefaultOptionCreator.clustersInOption()
- .withDescription("The input centroids, as Vectors. Must be a
SequenceFile of Writable, Cluster/Canopy. "
- + "If k is also specified, then a random set of vectors will be
selected" + " and written out to this path first")
- .create());
- addOption(DefaultOptionCreator.numClustersOption()
- .withDescription("The k in k-Means. If specified, then a random
selection of k Vectors will be chosen"
- + " as the Centroid and written to the clusters input
path.").create());
+ addOption(DefaultOptionCreator.t1Option().create());
+ addOption(DefaultOptionCreator.t2Option().create());
addOption(DefaultOptionCreator.convergenceOption().create());
addOption(DefaultOptionCreator.maxIterationsOption().create());
addOption(DefaultOptionCreator.overwriteOption().create());
- addOption(DefaultOptionCreator.clusteringOption().create());
Map<String, String> argMap = parseArguments(args);
if (argMap == null) {
@@ -81,7 +75,6 @@ public final class Job extends KMeansDri
}
Path input = getInputPath();
- Path clusters = new
Path(getOption(DefaultOptionCreator.CLUSTERS_IN_OPTION));
Path output = getOutputPath();
String measureClass =
getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
if (measureClass == null) {
@@ -95,19 +88,9 @@ public final class Job extends KMeansDri
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
Class<?> cl = ccl.loadClass(measureClass);
DistanceMeasure measure = (DistanceMeasure) cl.newInstance();
- if (hasOption(DefaultOptionCreator.NUM_CLUSTERS_OPTION)) {
- clusters = RandomSeedGenerator.buildRandom(input, clusters,
Integer.parseInt(argMap
- .get(DefaultOptionCreator.NUM_CLUSTERS_OPTION)), measure);
- }
- boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
- run(input,
- clusters,
- output,
- measure,
- convergenceDelta,
- maxIterations,
- runClustering,
- false);
+ double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
+ double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
+ run(getConf(), input, output, measure, t1, t2, convergenceDelta,
maxIterations);
return 0;
}
@@ -118,7 +101,7 @@ public final class Job extends KMeansDri
* expects the a file containing synthetic_control.data as obtained from
*
http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
resides in a directory named
* "testdata", and writes output to a directory named "output".
- *
+ * @param conf the Configuration to use
* @param input
* the String denoting the input directory path
* @param output
@@ -133,38 +116,59 @@ public final class Job extends KMeansDri
* the double convergence criteria for iterations
* @param maxIterations
* the int maximum number of iterations
+ *
* @throws IllegalAccessException
* @throws InstantiationException
* @throws ClassNotFoundException
* @throws InterruptedException
*/
- private void run(Path input,
- Path output,
- DistanceMeasure measure,
- double t1,
- double t2,
- double convergenceDelta,
- int maxIterations) throws IOException,
InstantiationException, IllegalAccessException, InterruptedException,
+ public void run(Configuration conf,
+ Path input,
+ Path output,
+ DistanceMeasure measure,
+ double t1,
+ double t2,
+ double convergenceDelta,
+ int maxIterations) throws IOException,
InstantiationException, IllegalAccessException, InterruptedException,
ClassNotFoundException {
- HadoopUtil.overwriteOutput(output);
-
Path directoryContainingConvertedInput = new Path(output,
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
log.info("Preparing Input");
InputDriver.runJob(input, directoryContainingConvertedInput,
"org.apache.mahout.math.RandomAccessSparseVector");
log.info("Running Canopy to get initial clusters");
- CanopyDriver.run(new Configuration(), directoryContainingConvertedInput,
output, measure, t1, t2, false, false);
+ CanopyDriver.run(conf, directoryContainingConvertedInput, output, measure,
t1, t2, false, false);
log.info("Running KMeans");
- KMeansDriver.run(directoryContainingConvertedInput,
- new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
- output,
- measure,
- convergenceDelta,
- maxIterations,
- true,
- false);
+ KMeansDriver.run(conf,
+ directoryContainingConvertedInput,
+ new Path(output, Cluster.INITIAL_CLUSTERS_DIR),
+ output,
+ measure,
+ convergenceDelta,
+ maxIterations,
+ true,
+ false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-" + maxIterations), new Path(output,
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, maxIterations), new Path(output,
"clusteredPoints"));
clusterDumper.printClusters(null);
}
+
+ /**
+ * Return the path to the final iteration's clusters
+ *
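+ * @param conf the Configuration used to obtain the FileSystem
+ * @param output the Path of the output directory that holds the "clusters-N" subdirectories
+ * @param maxIterations the int maximum number of iterations; the search counts down from this value to 0
+ * @return the Path of the highest-numbered existing "clusters-N" directory, or null if none exists
+ * @throws IOException if obtaining the FileSystem or checking for a directory fails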
+ */
+ private Path finalClusterPath(Configuration conf, Path output, int
maxIterations) throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+ for (int i = maxIterations; i >= 0; i--) {
+ Path clusters = new Path(output, "clusters-" + i);
+ if (fs.exists(clusters)) {
+ return clusters;
+ }
+ }
+ return null;
+ }
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=1001015&r1=1001014&r2=1001015&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
Fri Sep 24 18:33:55 2010
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopyDriver;
import org.apache.mahout.clustering.syntheticcontrol.Constants;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.distance.DistanceMeasure;
@@ -35,7 +36,7 @@ import org.apache.mahout.utils.clusterin
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-public final class Job extends MeanShiftCanopyDriver {
+public final class Job extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(Job.class);
@@ -50,7 +51,7 @@ public final class Job extends MeanShift
log.info("Running with default arguments");
Path output = new Path("output");
HadoopUtil.overwriteOutput(output);
- run(new Path("testdata"), output, new EuclideanDistanceMeasure(), 47.6,
1, 0.5, 10);
+ new Job().run(new Configuration(), new Path("testdata"), output, new
EuclideanDistanceMeasure(), 47.6, 1, 0.5, 10);
}
}
@@ -61,8 +62,9 @@ public final class Job extends MeanShift
addOption(DefaultOptionCreator.convergenceOption().create());
addOption(DefaultOptionCreator.maxIterationsOption().create());
addOption(DefaultOptionCreator.overwriteOption().create());
- addOption(new
DefaultOptionBuilder().withLongName(INPUT_IS_CANOPIES_OPTION).withRequired(false).withShortName("ic")
- .withArgument(new
ArgumentBuilder().withName(INPUT_IS_CANOPIES_OPTION).withMinimum(1).withMaximum(1).create())
+ addOption(new
DefaultOptionBuilder().withLongName(MeanShiftCanopyDriver.INPUT_IS_CANOPIES_OPTION).withRequired(false)
+ .withShortName("ic").withArgument(new
ArgumentBuilder().withName(MeanShiftCanopyDriver.INPUT_IS_CANOPIES_OPTION)
+ .withMinimum(1).withMaximum(1).create())
.withDescription("If present, the input directory already contains
MeanShiftCanopies").create());
addOption(DefaultOptionCreator.distanceMeasureOption().create());
addOption(DefaultOptionCreator.t1Option().create());
@@ -82,14 +84,12 @@ public final class Job extends MeanShift
String measureClass =
getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
double t1 = Double.parseDouble(getOption(DefaultOptionCreator.T1_OPTION));
double t2 = Double.parseDouble(getOption(DefaultOptionCreator.T2_OPTION));
- boolean runClustering = hasOption(DefaultOptionCreator.CLUSTERING_OPTION);
double convergenceDelta =
Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
int maxIterations =
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- boolean inputIsCanopies = hasOption(INPUT_IS_CANOPIES_OPTION);
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
DistanceMeasure measure = (DistanceMeasure) ((Class<?>)
ccl.loadClass(measureClass)).newInstance();
- runJob(input, output, measure, t1, t2, convergenceDelta, maxIterations,
inputIsCanopies, runClustering, false);
+ run(getConf(), input, output, measure, t1, t2, convergenceDelta,
maxIterations);
return 0;
}
@@ -100,7 +100,7 @@ public final class Job extends MeanShift
* the job expects the a file containing synthetic_control.data as obtained
from
*
http://archive.ics.uci.edu/ml/datasets/Synthetic+Control+Chart+Time+Series
resides in a directory named
* "testdata", and writes output to a directory named "output".
- *
+ * @param conf the Configuration passed in by the caller
* @param input
* the String denoting the input directory path
* @param output
@@ -116,29 +116,30 @@ public final class Job extends MeanShift
* @param maxIterations
* the int maximum number of iterations
*/
- private static void run(Path input,
- Path output,
- DistanceMeasure measure,
- double t1,
- double t2,
- double convergenceDelta,
- int maxIterations)
- throws IOException, InterruptedException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
+ public void run(Configuration conf,
+ Path input,
+ Path output,
+ DistanceMeasure measure,
+ double t1,
+ double t2,
+ double convergenceDelta,
+ int maxIterations) throws IOException, InterruptedException,
ClassNotFoundException, InstantiationException,
+ IllegalAccessException {
Path directoryContainingConvertedInput = new Path(output,
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
InputDriver.runJob(input, directoryContainingConvertedInput);
- MeanShiftCanopyDriver.runJob(directoryContainingConvertedInput,
- output,
- measure,
- t1,
- t2,
- convergenceDelta,
- maxIterations,
- true,
- true,
- false);
+ new MeanShiftCanopyDriver().run(directoryContainingConvertedInput,
+ output,
+ measure,
+ t1,
+ t2,
+ convergenceDelta,
+ maxIterations,
+ true,
+ true,
+ false);
// run ClusterDumper
- ClusterDumper clusterDumper =
- new ClusterDumper(new Path(output, "clusters-" + maxIterations), new
Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-" + maxIterations), new Path(output,
+
"clusteredPoints"));
clusterDumper.printClusters(null);
}