Author: jeastman
Date: Fri Sep 24 19:18:46 2010
New Revision: 1001034
URL: http://svn.apache.org/viewvc?rev=1001034&view=rev
Log:
MAHOUT-414: Added the Configuration arguments to MeanShift and LDA that I
overlooked in the last commit.
Added code to TestClusterDumper to find the last clusters directory.
All tests run.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/lda/LDADriver.java
Fri Sep 24 19:18:46 2010
@@ -76,36 +76,6 @@ public final class LDADriver extends Abs
new LDADriver().run(args);
}
- /**
- * Run the job using supplied arguments
- *
- * @param input
- * the directory pathname for input points
- * @param output
- * the directory pathname for output points
- * @param numTopics
- * the number of topics
- * @param numWords
- * the number of words
- * @param topicSmoothing
- * pseudocounts for each topic, typically small < .5
- * @param maxIterations
- * the maximum number of iterations
- * @param numReducers
- * the number of Reducers desired
- * @throws IOException
- */
- public static void runJob(Path input,
- Path output,
- int numTopics,
- int numWords,
- double topicSmoothing,
- int maxIterations,
- int numReducers) throws IOException,
InterruptedException, ClassNotFoundException {
-
- new LDADriver().job(input, output, numTopics, numWords, topicSmoothing,
maxIterations, numReducers);
- }
-
static LDAState createState(Configuration job) throws IOException {
String statePath = job.get(STATE_IN_KEY);
int numTopics = Integer.parseInt(job.get(NUM_TOPICS_KEY));
@@ -165,7 +135,6 @@ public final class LDADriver extends Abs
"The total number of words in the corpus (can be approximate,
needs to exceed the actual value)");
addOption(TOPIC_SMOOTHING_OPTION, "a", "Topic smoothing parameter. Default
is 50/numTopics.", "-1.0");
addOption(DefaultOptionCreator.maxIterationsOption().withRequired(false).create());
- addOption(DefaultOptionCreator.numReducersOption().create());
if (parseArguments(args) == null) {
return -1;
@@ -177,7 +146,6 @@ public final class LDADriver extends Abs
HadoopUtil.overwriteOutput(output);
}
int maxIterations =
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
- int numReduceTasks =
Integer.parseInt(getOption(DefaultOptionCreator.MAX_REDUCERS_OPTION));
int numTopics = Integer.parseInt(getOption(NUM_TOPICS_OPTION));
int numWords = Integer.parseInt(getOption(NUM_WORDS_OPTION));
double topicSmoothing =
Double.parseDouble(getOption(TOPIC_SMOOTHING_OPTION));
@@ -185,25 +153,30 @@ public final class LDADriver extends Abs
topicSmoothing = 50.0 / numTopics;
}
- job(input, output, numTopics, numWords, topicSmoothing, maxIterations,
numReduceTasks);
+ run(getConf(), input, output, numTopics, numWords, topicSmoothing,
maxIterations);
return 0;
}
/**
+ * @param conf
* @param input
* @param output
* @param numTopics
* @param numWords
* @param topicSmoothing
* @param maxIterations
- * @param numReducers
* @throws IOException
* @throws InterruptedException
* @throws ClassNotFoundException
*/
- private void job(Path input, Path output, int numTopics, int numWords,
double topicSmoothing, int maxIterations, int numReducers)
- throws IOException, InterruptedException, ClassNotFoundException {
+ private void run(Configuration conf,
+ Path input,
+ Path output,
+ int numTopics,
+ int numWords,
+ double topicSmoothing,
+ int maxIterations) throws IOException,
InterruptedException, ClassNotFoundException {
Path stateIn = new Path(output, "state-0");
writeInitialState(stateIn, numTopics, numWords);
double oldLL = Double.NEGATIVE_INFINITY;
@@ -213,7 +186,7 @@ public final class LDADriver extends Abs
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
Path stateOut = new Path(output, "state-" + iteration);
- double ll = runIteration(input, stateIn, stateOut, numTopics, numWords,
topicSmoothing, numReducers);
+ double ll = runIteration(conf, input, stateIn, stateOut, numTopics,
numWords, topicSmoothing);
double relChange = (oldLL - ll) / oldLL;
// now point the input to the old output directory
@@ -280,7 +253,7 @@ public final class LDADriver extends Abs
/**
* Run the job using supplied arguments
- *
+ * @param conf TODO
* @param input
* the directory pathname for input points
* @param stateIn
@@ -289,17 +262,14 @@ public final class LDADriver extends Abs
* the directory pathname for output state
* @param numTopics
* the number of clusters
- * @param numReducers
- * the number of Reducers desired
*/
- private double runIteration(Path input,
+ private double runIteration(Configuration conf,
+ Path input,
Path stateIn,
Path stateOut,
int numTopics,
int numWords,
- double topicSmoothing,
- int numReducers) throws IOException,
InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
+ double topicSmoothing) throws IOException,
InterruptedException, ClassNotFoundException {
conf.set(STATE_IN_KEY, stateIn.toString());
conf.set(NUM_TOPICS_KEY, Integer.toString(numTopics));
conf.set(NUM_WORDS_KEY, Integer.toString(numWords));
@@ -315,7 +285,6 @@ public final class LDADriver extends Abs
job.setMapperClass(LDAMapper.class);
job.setReducerClass(LDAReducer.class);
job.setCombinerClass(LDAReducer.class);
- job.setNumReduceTasks(numReducers);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setJarByClass(LDADriver.class);
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
Fri Sep 24 19:18:46 2010
@@ -35,6 +35,7 @@ import org.apache.hadoop.mapreduce.lib.i
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
+import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.clustering.AbstractCluster;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
@@ -60,7 +61,7 @@ public class MeanShiftCanopyDriver exten
private static final String CONTROL_CONVERGED = "control/converged";
public static void main(String[] args) throws Exception {
- new MeanShiftCanopyDriver().run(args);
+ ToolRunner.run(new Configuration(), new MeanShiftCanopyDriver(), args);
}
@Override
@@ -95,27 +96,17 @@ public class MeanShiftCanopyDriver exten
double convergenceDelta =
Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
int maxIterations =
Integer.parseInt(getOption(DefaultOptionCreator.MAX_ITERATIONS_OPTION));
boolean inputIsCanopies = hasOption(INPUT_IS_CANOPIES_OPTION);
- boolean runSequential =
-
getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
+ boolean runSequential =
getOption(DefaultOptionCreator.METHOD_OPTION).equalsIgnoreCase(DefaultOptionCreator.SEQUENTIAL_METHOD);
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
DistanceMeasure measure =
ccl.loadClass(measureClass).asSubclass(DistanceMeasure.class).newInstance();
- run(input,
- output,
- measure,
- t1,
- t2,
- convergenceDelta,
- maxIterations,
- inputIsCanopies,
- runClustering,
- runSequential);
+ run(getConf(), input, output, measure, t1, t2, convergenceDelta,
maxIterations, inputIsCanopies, runClustering, runSequential);
return 0;
}
/**
* Run an iteration
- *
+ * @param conf TODO
* @param input
* the input pathname String
* @param output
@@ -131,16 +122,15 @@ public class MeanShiftCanopyDriver exten
* @param convergenceDelta
* the double convergence criteria
*/
- private static void runIteration(Path input,
+ private static void runIteration(Configuration conf,
+ Path input,
Path output,
Path control,
String measureClassName,
double t1,
double t2,
- double convergenceDelta)
- throws IOException, InterruptedException, ClassNotFoundException {
+ double convergenceDelta) throws
IOException, InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
conf.set(MeanShiftCanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
conf.set(MeanShiftCanopyConfigKeys.CLUSTER_CONVERGENCE_KEY,
String.valueOf(convergenceDelta));
conf.set(MeanShiftCanopyConfigKeys.T1_KEY, String.valueOf(t1));
@@ -167,7 +157,7 @@ public class MeanShiftCanopyDriver exten
/**
* Run the job where the input format can be either Vectors or Canopies.
* If requested, cluster the input data using the computed Canopies
- *
+ * @param conf the Configuration to use
* @param input
* the input pathname String
* @param output
@@ -188,7 +178,8 @@ public class MeanShiftCanopyDriver exten
* true if the input points are to be clustered once the iterations
complete
* @param runSequential if true run in sequential execution mode
*/
- public void run(Path input,
+ public void run(Configuration conf,
+ Path input,
Path output,
DistanceMeasure measure,
double t1,
@@ -197,31 +188,35 @@ public class MeanShiftCanopyDriver exten
int maxIterations,
boolean inputIsCanopies,
boolean runClustering,
- boolean runSequential)
- throws IOException, InterruptedException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
+ boolean runSequential) throws IOException,
InterruptedException, ClassNotFoundException, InstantiationException,
+ IllegalAccessException {
Path clustersIn = new Path(output, Cluster.INITIAL_CLUSTERS_DIR);
if (inputIsCanopies) {
clustersIn = input;
} else {
- createCanopyFromVectors(input, clustersIn, measure, runSequential);
+ createCanopyFromVectors(conf, input, clustersIn, measure, runSequential);
}
- Path clustersOut =
- buildClusters(clustersIn, output, measure, t1, t2, convergenceDelta,
maxIterations, runSequential);
+ Path clustersOut = buildClusters(conf, clustersIn, output, measure, t1,
t2, convergenceDelta, maxIterations, runSequential);
if (runClustering) {
- clusterData(inputIsCanopies ? input : new Path(output,
Cluster.INITIAL_CLUSTERS_DIR),
+ clusterData(conf,
+ inputIsCanopies ? input : new Path(output,
Cluster.INITIAL_CLUSTERS_DIR),
clustersOut,
new Path(output, Cluster.CLUSTERED_POINTS_DIR),
runSequential);
}
}
- public static void createCanopyFromVectors(Path input, Path output,
DistanceMeasure measure, boolean runSequential)
- throws IOException, InterruptedException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
+ public static void createCanopyFromVectors(Configuration conf,
+ Path input,
+ Path output,
+ DistanceMeasure measure,
+ boolean runSequential) throws
IOException, InterruptedException,
+ ClassNotFoundException, InstantiationException, IllegalAccessException {
if (runSequential) {
createCanopyFromVectorsSeq(input, output, measure);
} else {
- createCanopyFromVectorsMR(input, output, measure);
+ createCanopyFromVectorsMR(conf, input, output, measure);
}
}
@@ -230,8 +225,8 @@ public class MeanShiftCanopyDriver exten
* @param output the Path to the initial clusters directory
* @param measure the DistanceMeasure
*/
- private static void createCanopyFromVectorsSeq(Path input, Path output,
DistanceMeasure measure)
- throws IOException, InstantiationException, IllegalAccessException {
+ private static void createCanopyFromVectorsSeq(Path input, Path output,
DistanceMeasure measure) throws IOException,
+ InstantiationException, IllegalAccessException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
FileStatus[] status = fs.listStatus(input, new OutputLogFilter());
@@ -258,9 +253,8 @@ public class MeanShiftCanopyDriver exten
}
}
- private static void createCanopyFromVectorsMR(Path input, Path output,
DistanceMeasure measure)
- throws IOException, InterruptedException, ClassNotFoundException {
- Configuration conf = new Configuration();
+ private static void createCanopyFromVectorsMR(Configuration conf, Path
input, Path output, DistanceMeasure measure)
+ throws IOException, InterruptedException, ClassNotFoundException {
conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY,
measure.getClass().getName());
Job job = new Job(conf);
job.setJarByClass(MeanShiftCanopyDriver.class);
@@ -279,7 +273,7 @@ public class MeanShiftCanopyDriver exten
/**
* Iterate over the input clusters to produce the next cluster directories
for each iteration
- *
+ * @param conf TODO
* @param clustersIn
* the input directory Path
* @param output
@@ -296,19 +290,20 @@ public class MeanShiftCanopyDriver exten
* an int number of iterations
* @param runSequential if true run in sequential execution mode
*/
- public Path buildClusters(Path clustersIn,
- Path output,
- DistanceMeasure measure,
- double t1,
- double t2,
- double convergenceDelta,
- int maxIterations,
- boolean runSequential)
- throws IOException, InterruptedException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
+ public Path buildClusters(Configuration conf,
+ Path clustersIn,
+ Path output,
+ DistanceMeasure measure,
+ double t1,
+ double t2,
+ double convergenceDelta,
+ int maxIterations,
+ boolean runSequential) throws IOException,
InterruptedException, ClassNotFoundException,
+ InstantiationException, IllegalAccessException {
if (runSequential) {
return buildClustersSeq(clustersIn, output, measure, t1, t2,
convergenceDelta, maxIterations);
} else {
- return buildClustersMR(clustersIn, output, measure, t1, t2,
convergenceDelta, maxIterations);
+ return buildClustersMR(conf, clustersIn, output, measure, t1, t2,
convergenceDelta, maxIterations);
}
}
@@ -318,8 +313,7 @@ public class MeanShiftCanopyDriver exten
double t1,
double t2,
double convergenceDelta,
- int maxIterations)
- throws IOException, InstantiationException, IllegalAccessException {
+ int maxIterations) throws IOException,
InstantiationException, IllegalAccessException {
MeanShiftCanopyClusterer clusterer = new MeanShiftCanopyClusterer(measure,
t1, t2, convergenceDelta);
List<MeanShiftCanopy> clusters = new ArrayList<MeanShiftCanopy>();
Configuration conf = new Configuration();
@@ -338,7 +332,7 @@ public class MeanShiftCanopyDriver exten
reader.close();
}
}
- boolean[] converged = {false};
+ boolean[] converged = { false };
int iteration = 1;
while (!converged[0] && iteration <= maxIterations) {
log.info("Iteration: {}", iteration);
@@ -351,12 +345,9 @@ public class MeanShiftCanopyDriver exten
MeanShiftCanopy.class);
try {
for (MeanShiftCanopy cluster : clusters) {
- log.info("Writing Cluster:{} center:{} numPoints:{} radius:{} to:
{}",
- new Object[] { cluster.getId(),
-
AbstractCluster.formatVector(cluster.getCenter(), null),
- cluster.getNumPoints(),
-
AbstractCluster.formatVector(cluster.getRadius(), null),
- clustersOut.getName() });
+ log.info("Writing Cluster:{} center:{} numPoints:{} radius:{} to:
{}", new Object[] { cluster.getId(),
+ AbstractCluster.formatVector(cluster.getCenter(), null),
cluster.getNumPoints(),
+ AbstractCluster.formatVector(cluster.getRadius(), null),
clustersOut.getName() });
writer.append(new Text(cluster.getIdentifier()), cluster);
}
} finally {
@@ -368,14 +359,14 @@ public class MeanShiftCanopyDriver exten
return clustersIn;
}
- private static Path buildClustersMR(Path clustersIn,
+ private static Path buildClustersMR(Configuration conf,
+ Path clustersIn,
Path output,
DistanceMeasure measure,
double t1,
double t2,
double convergenceDelta,
- int maxIterations)
- throws IOException, InterruptedException, ClassNotFoundException {
+ int maxIterations) throws IOException,
InterruptedException, ClassNotFoundException {
// iterate until the clusters converge
boolean converged = false;
int iteration = 1;
@@ -384,7 +375,7 @@ public class MeanShiftCanopyDriver exten
// point the output to a new directory per iteration
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
Path controlOut = new Path(output, CONTROL_CONVERGED);
- runIteration(clustersIn, clustersOut, controlOut,
measure.getClass().getName(), t1, t2, convergenceDelta);
+ runIteration(conf, clustersIn, clustersOut, controlOut,
measure.getClass().getName(), t1, t2, convergenceDelta);
converged = FileSystem.get(new Configuration()).exists(controlOut);
// now point the input to the old output directory
clustersIn = clustersOut;
@@ -395,7 +386,7 @@ public class MeanShiftCanopyDriver exten
/**
* Run the job using supplied arguments
- *
+ * @param conf TODO
* @param input
* the directory pathname for input points
* @param clustersIn
@@ -404,11 +395,8 @@ public class MeanShiftCanopyDriver exten
* the directory pathname for output clustered points
* @param runSequential if true run in sequential execution mode
*/
- public static void clusterData(Path input,
- Path clustersIn,
- Path output,
- boolean runSequential)
- throws IOException, InterruptedException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
+ public static void clusterData(Configuration conf, Path input, Path
clustersIn, Path output, boolean runSequential)
+ throws IOException, InterruptedException, ClassNotFoundException,
InstantiationException, IllegalAccessException {
if (runSequential) {
clusterDataSeq(input, clustersIn, output);
} else {
@@ -416,8 +404,8 @@ public class MeanShiftCanopyDriver exten
}
}
- private static void clusterDataSeq(Path input, Path clustersIn, Path output)
- throws IOException, InstantiationException, IllegalAccessException {
+ private static void clusterDataSeq(Path input, Path clustersIn, Path output)
throws IOException, InstantiationException,
+ IllegalAccessException {
Collection<MeanShiftCanopy> clusters = new ArrayList<MeanShiftCanopy>();
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(clustersIn.toUri(), conf);
@@ -452,8 +440,7 @@ public class MeanShiftCanopyDriver exten
MeanShiftCanopy canopy =
reader.getValueClass().asSubclass(MeanShiftCanopy.class).newInstance();
while (reader.next(key, canopy)) {
MeanShiftCanopy closest =
MeanShiftCanopyClusterer.findCoveringCanopy(canopy, clusters);
- writer.append(new IntWritable(closest.getId()),
- new WeightedVectorWritable(1, canopy.getCenter()));
+ writer.append(new IntWritable(closest.getId()), new
WeightedVectorWritable(1, canopy.getCenter()));
canopy =
reader.getValueClass().asSubclass(MeanShiftCanopy.class).newInstance();
}
} finally {
@@ -463,8 +450,8 @@ public class MeanShiftCanopyDriver exten
}
}
- private static void clusterDataMR(Path input, Path clustersIn, Path output)
- throws IOException, InterruptedException, ClassNotFoundException {
+ private static void clusterDataMR(Path input, Path clustersIn, Path output)
throws IOException, InterruptedException,
+ ClassNotFoundException {
Configuration conf = new Configuration();
conf.set(STATE_IN_KEY, clustersIn.toString());
Job job = new Job(conf);
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayMeanShift.java
Fri Sep 24 19:18:46 2010
@@ -24,6 +24,7 @@ import java.awt.geom.AffineTransform;
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.meanshift.MeanShiftCanopy;
@@ -108,7 +109,7 @@ final class DisplayMeanShift extends Dis
writeSampleData(samples);
boolean b = true;
if (b) {
- new MeanShiftCanopyDriver().run(samples, output, measure, t1, t2, 0.005,
20, false, true, true);
+ new MeanShiftCanopyDriver().run(new Configuration(), samples, output,
measure, t1, t2, 0.005, 20, false, true, true);
loadClusters(output);
} else {
List<Vector> points = new ArrayList<Vector>();
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/Job.java
Fri Sep 24 19:18:46 2010
@@ -127,7 +127,8 @@ public final class Job extends AbstractJ
IllegalAccessException {
Path directoryContainingConvertedInput = new Path(output,
Constants.DIRECTORY_CONTAINING_CONVERTED_INPUT);
InputDriver.runJob(input, directoryContainingConvertedInput);
- new MeanShiftCanopyDriver().run(directoryContainingConvertedInput,
+ new MeanShiftCanopyDriver().run(conf,
+ directoryContainingConvertedInput,
output,
measure,
t1,
@@ -135,8 +136,7 @@ public final class Job extends AbstractJ
convergenceDelta,
maxIterations,
true,
- true,
- false);
+ true, false);
// run ClusterDumper
ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-" + maxIterations), new Path(output,
"clusteredPoints"));
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
Fri Sep 24 19:18:46 2010
@@ -150,6 +150,26 @@ public final class TestClusterDumper ext
}
}
+ /**
+ * Return the path to the final iteration's clusters
+ *
+ * @param conf
+ * @param output
+ * @param maxIterations
+ * @return
+ * @throws IOException
+ */
+ private Path finalClusterPath(Configuration conf, Path output, int
maxIterations) throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+ for (int i = maxIterations; i >= 0; i--) {
+ Path clusters = new Path(output, "clusters-" + i);
+ if (fs.exists(clusters)) {
+ return clusters;
+ }
+ }
+ return null;
+ }
+
@Test
public void testCanopy() throws Exception { // now run the Job
DistanceMeasure measure = new EuclideanDistanceMeasure();
@@ -166,18 +186,12 @@ public final class TestClusterDumper ext
DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
Path output = getTestTempDirPath("output");
- CanopyDriver.run(new Configuration(), getTestTempDirPath("testdata"),
output, measure, 8, 4, false, false);
+ Configuration conf = new Configuration();
+ CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
4, false, false);
// now run the KMeans job
- KMeansDriver.run(getTestTempDirPath("testdata"),
- new Path(output, "clusters-0"),
- output,
- measure,
- 0.001,
- 10,
- true,
- false);
+ KMeansDriver.run(conf, getTestTempDirPath("testdata"), new Path(output,
"clusters-0"), output, measure, 0.001, 10, true, false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-2"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -186,21 +200,23 @@ public final class TestClusterDumper ext
DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
Path output = getTestTempDirPath("output");
- CanopyDriver.run(new Configuration(), getTestTempDirPath("testdata"),
output, measure, 8, 4, false, false);
+ Configuration conf = new Configuration();
+ CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, measure, 8,
4, false, false);
// now run the Fuzzy KMeans job
- FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
- new Path(output, "clusters-0"),
- output,
- measure,
- 0.001,
- 10,
- ((float) 1.1),
- true,
- true,
- 0,
- false);
+ FuzzyKMeansDriver.run(conf,
+ getTestTempDirPath("testdata"),
+ new Path(output, "clusters-0"),
+ output,
+ measure,
+ 0.001,
+ 10,
+ ((float) 1.1),
+ true,
+ true,
+ 0,
+ false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-3"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -208,18 +224,10 @@ public final class TestClusterDumper ext
public void testMeanShift() throws Exception {
DistanceMeasure measure = new CosineDistanceMeasure();
Path output = getTestTempDirPath("output");
- new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
- output,
- measure,
- 0.5,
- 0.01,
- 0.05,
- 10,
- false,
- true,
- false);
+ Configuration conf = new Configuration();
+ new MeanShiftCanopyDriver().run(conf, getTestTempDirPath("testdata"),
output, measure, 0.5, 0.01, 0.05, 10, false, true, false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-1"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -228,9 +236,10 @@ public final class TestClusterDumper ext
Path output = getTestTempDirPath("output");
NamedVector prototype = (NamedVector) sampleData.get(0).get();
AbstractVectorModelDistribution modelDistribution = new
SampledNormalDistribution(new VectorWritable(prototype));
- DirichletDriver.run(getTestTempDirPath("testdata"), output,
modelDistribution, 15, 10, 1.0, true, true, 0, false);
+ Configuration conf = new Configuration();
+ DirichletDriver.run(conf, getTestTempDirPath("testdata"), output,
modelDistribution, 15, 10, 1.0, true, true, 0, false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-10"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -239,9 +248,10 @@ public final class TestClusterDumper ext
Path output = getTestTempDirPath("output");
NamedVector prototype = (NamedVector) sampleData.get(0).get();
AbstractVectorModelDistribution modelDistribution = new
GaussianClusterDistribution(new VectorWritable(prototype));
- DirichletDriver.run(getTestTempDirPath("testdata"), output,
modelDistribution, 15, 10, 1.0, true, true, 0, true);
+ Configuration conf = new Configuration();
+ DirichletDriver.run(conf, getTestTempDirPath("testdata"), output,
modelDistribution, 15, 10, 1.0, true, true, 0, true);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-10"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -250,9 +260,10 @@ public final class TestClusterDumper ext
Path output = getTestTempDirPath("output");
NamedVector prototype = (NamedVector) sampleData.get(0).get();
AbstractVectorModelDistribution modelDistribution = new
DistanceMeasureClusterDistribution(new VectorWritable(prototype));
- DirichletDriver.run(getTestTempDirPath("testdata"), output,
modelDistribution, 15, 10, 1.0, true, true, 0, true);
+ Configuration conf = new Configuration();
+ DirichletDriver.run(conf, getTestTempDirPath("testdata"), output,
modelDistribution, 15, 10, 1.0, true, true, 0, true);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-10"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -315,16 +326,9 @@ public final class TestClusterDumper ext
// now run the Canopy job to prime kMeans canopies
CanopyDriver.run(conf, svdData, output, measure, 8, 4, false, false);
// now run the KMeans job
- KMeansDriver.run(svdData,
- new Path(output, "clusters-0"),
- output,
- measure,
- 0.001,
- 10,
- true,
- false);
+ KMeansDriver.run(svdData, new Path(output, "clusters-0"), output, measure,
0.001, 10, true, false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-2"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -355,16 +359,9 @@ public final class TestClusterDumper ext
// now run the Canopy job to prime kMeans canopies
CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false,
false);
// now run the KMeans job
- KMeansDriver.run(sData.getRowPath(),
- new Path(output, "clusters-0"),
- output,
- measure,
- 0.001,
- 10,
- true,
- false);
+ KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
output, measure, 0.001, 10, true, false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-2"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
@@ -397,16 +394,9 @@ public final class TestClusterDumper ext
// now run the Canopy job to prime kMeans canopies
CanopyDriver.run(conf, sData.getRowPath(), output, measure, 8, 4, false,
false);
// now run the KMeans job
- KMeansDriver.run(sData.getRowPath(),
- new Path(output, "clusters-0"),
- output,
- measure,
- 0.001,
- 10,
- true,
- false);
+ KMeansDriver.run(sData.getRowPath(), new Path(output, "clusters-0"),
output, measure, 0.001, 10, true, false);
// run ClusterDumper
- ClusterDumper clusterDumper = new ClusterDumper(new Path(output,
"clusters-2"), new Path(output, "clusteredPoints"));
+ ClusterDumper clusterDumper = new ClusterDumper(finalClusterPath(conf,
output, 10), new Path(output, "clusteredPoints"));
clusterDumper.printClusters(termDictionary);
}
}
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
Fri Sep 24 19:18:46 2010
@@ -280,7 +280,9 @@ public final class TestClusterEvaluator
@Test
public void testMeanShift() throws Exception {
DistanceMeasure measure = new EuclideanDistanceMeasure();
- new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
+ Configuration conf = new Configuration();
+ new MeanShiftCanopyDriver().run(conf,
+ getTestTempDirPath("testdata"),
getTestTempDirPath("output"),
measure,
2.1,
@@ -288,11 +290,9 @@ public final class TestClusterEvaluator
0.001,
10,
false,
- true,
- false);
+ true, false);
int numIterations = 2;
Path output = getTestTempDirPath("output");
- Configuration conf = new Configuration();
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf,
clustersIn,
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1001034&r1=1001033&r2=1001034&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Fri Sep 24 19:18:46 2010
@@ -297,7 +297,9 @@ public final class TestCDbwEvaluator ext
@Test
public void testMeanShift() throws Exception {
DistanceMeasure measure = new EuclideanDistanceMeasure();
- new MeanShiftCanopyDriver().run(getTestTempDirPath("testdata"),
+ Configuration conf = new Configuration();
+ new MeanShiftCanopyDriver().run(conf,
+ getTestTempDirPath("testdata"),
getTestTempDirPath("output"),
measure,
2.1,
@@ -305,11 +307,9 @@ public final class TestCDbwEvaluator ext
0.001,
10,
false,
- true,
- false);
+ true, false);
int numIterations = 2;
Path output = getTestTempDirPath("output");
- Configuration conf = new Configuration();
Path clustersIn = new Path(output, "clusters-2");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
"clusteredPoints"), output, measure, numIterations);
CDbwEvaluator evaluator = new CDbwEvaluator(conf, clustersIn);