Author: jeastman
Date: Fri May 29 23:07:50 2009
New Revision: 780137
URL: http://svn.apache.org/viewvc?rev=780137&view=rev
Log:
- KMeansDriver: renamed parameter numCentroids to numReduceTasks to match how it
is actually used
- cleaned up commented-out code and improved comments in TestKmeansClustering
and kmeans/Job
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=780137&r1=780136&r2=780137&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Fri May 29 23:07:50 2009
@@ -57,10 +57,11 @@
* @param measureClass the classname of the DistanceMeasure
* @param convergenceDelta the convergence delta value
* @param maxIterations the maximum number of iterations
+ * @param numReduceTasks the number of reducers
*/
public static void runJob(String input, String clustersIn, String output,
String measureClass, double convergenceDelta, int maxIterations,
- int numCentroids) {
+ int numReduceTasks) {
// iterate until the clusters converge
boolean converged = false;
int iteration = 0;
@@ -71,7 +72,7 @@
// point the output to a new directory per iteration
String clustersOut = output + "/clusters-" + iteration;
converged = runIteration(input, clustersIn, clustersOut, measureClass,
- delta, numCentroids);
+ delta, numReduceTasks);
// now point the input to the old output directory
clustersIn = output + "/clusters-" + iteration;
iteration++;
@@ -89,6 +90,7 @@
* @param clustersOut the directory pathname for output clusters
* @param measureClass the classname of the DistanceMeasure
* @param convergenceDelta the convergence delta value
+ * @param numReduceTasks the number of reducer tasks
* @return true if the iteration successfully runs
*/
private static boolean runIteration(String input, String clustersIn,
@@ -107,16 +109,10 @@
conf.setMapperClass(KMeansMapper.class);
conf.setCombinerClass(KMeansCombiner.class);
conf.setReducerClass(KMeansReducer.class);
- // conf.setNumMapTasks(numMapTasks);
conf.setNumReduceTasks(numReduceTasks);
conf.set(Cluster.CLUSTER_PATH_KEY, clustersIn);
conf.set(Cluster.DISTANCE_MEASURE_KEY, measureClass);
conf.set(Cluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
-
-// conf.set("mapred.child.java.opts", "-Xmx1536m");
- // uncomment it to run locally
-// conf.set("mapred.job.tracker", "local");
-
client.setConf(conf);
try {
JobClient.runJob(conf);
@@ -156,9 +152,6 @@
conf.set(Cluster.CLUSTER_CONVERGENCE_KEY, convergenceDelta);
client.setConf(conf);
- // uncomment it to run locally
- // conf.set("mapred.job.tracker", "local");
-// conf.set("mapred.child.java.opts", "-Xmx1536m");
try {
JobClient.runJob(conf);
} catch (IOException e) {
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java?rev=780137&r1=780136&r2=780137&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansUtil.java Fri May 29 23:07:50 2009
@@ -70,12 +70,10 @@
// iterate thru the result path list
for (Path path : result) {
SequenceFile.Reader reader = null;
-// RecordReader<Text, Text> recordReader = null;
try {
reader =new SequenceFile.Reader(fs, path, job);
Text key = new Text();
Text value = new Text();
- int counter = 1;
while (reader.next(key, value)) {
// get the cluster info
Cluster cluster = Cluster.decodeCluster(value.toString());
Modified:
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java?rev=780137&r1=780136&r2=780137&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/kmeans/TestKmeansClustering.java Fri May 29 23:07:50 2009
@@ -410,7 +410,6 @@
// now compare the expected clusters with actual
File outDir = new File("output/points");
assertTrue("output dir exists?", outDir.exists());
- String[] outFiles = outDir.list();
// assertEquals("output dir files?", 4, outFiles.length);
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream("output/points/part-00000"), Charset
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=780137&r1=780136&r2=780137&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Fri May 29 23:07:50 2009
@@ -17,6 +17,8 @@
package org.apache.mahout.clustering.syntheticcontrol.kmeans;
+import java.io.IOException;
+
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
@@ -25,10 +27,6 @@
import org.apache.mahout.clustering.kmeans.KMeansDriver;
import org.apache.mahout.clustering.syntheticcontrol.canopy.InputDriver;
-import java.io.IOException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
public class Job {
private Job() {
}
@@ -69,7 +67,8 @@
* @param maxIterations the int maximum number of iterations
*/
private static void runJob(String input, String output, String measureClass,
- double t1, double t2, double convergenceDelta, int maxIterations) throws IOException {
+ double t1, double t2, double convergenceDelta, int maxIterations)
+ throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(Job.class);
@@ -79,10 +78,9 @@
if (dfs.exists(outPath))
dfs.delete(outPath, true);
InputDriver.runJob(input, output + "/data");
- CanopyClusteringJob
- .runJob(output + "/data", output, measureClass, t1, t2);
+ CanopyClusteringJob.runJob(output + "/data", output, measureClass, t1, t2);
KMeansDriver.runJob(output + "/data", output + "/canopies", output,
- measureClass, convergenceDelta, maxIterations,1);
-// OutputDriver.runJob(output + "/points", output + "/clustered-points");
+ measureClass, convergenceDelta, maxIterations, 1);
+ // OutputDriver.runJob(output + "/points", output + "/clustered-points");
}
}