Author: srowen
Date: Fri Sep 30 19:20:47 2011
New Revision: 1177786
URL: http://svn.apache.org/viewvc?rev=1177786&view=rev
Log:
MAHOUT-778 label final output as "clusters-N-final"
Added:
mahout/trunk/examples/src/test/java/org/apache/mahout/clustering/
mahout/trunk/examples/src/test/java/org/apache/mahout/clustering/display/
mahout/trunk/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Cluster.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Cluster.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Cluster.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Cluster.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/Cluster.java Fri Sep 30 19:20:47 2011
@@ -35,6 +35,9 @@ public interface Cluster extends Model<V
// default directory for output of clusters per iteration
String CLUSTERS_DIR = "clusters-";
+ // default suffix for output of clusters for final iteration
+ String FINAL_ITERATION_SUFFIX = "-final";
+
/**
* Get the id of the Cluster
*
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/dirichlet/DirichletDriver.java Fri Sep 30 19:20:47 2011
@@ -374,7 +374,8 @@ public class DirichletDriver extends Abs
int maxIterations,
double alpha0,
Path clustersIn) throws IOException {
- for (int iteration = 1; iteration <= maxIterations; iteration++) {
+ int iteration = 1;
+ while (iteration <= maxIterations) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
@@ -383,7 +384,7 @@ public class DirichletDriver extends Abs
description,
alpha0,
numClusters);
-
+
List<DirichletCluster> oldModels = state.getClusters();
for (DirichletCluster oldModel : oldModels) {
oldModel.getModel().configure(conf);
@@ -405,8 +406,11 @@ public class DirichletDriver extends Abs
// now point the input to the old output directory
clustersIn = clustersOut;
+ iteration++;
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX);
+ FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
private static Path buildClustersMR(Configuration conf,
@@ -418,15 +422,19 @@ public class DirichletDriver extends Abs
double alpha0,
Path clustersIn)
throws IOException, InterruptedException, ClassNotFoundException {
- for (int iteration = 1; iteration <= maxIterations; iteration++) {
+ int iteration = 1;
+ while (iteration <= maxIterations) {
log.info("Iteration {}", iteration);
// point the output to a new directory per iteration
Path clustersOut = new Path(output, Cluster.CLUSTERS_DIR + iteration);
runIteration(conf, input, clustersIn, clustersOut, description,
numClusters, alpha0);
// now point the input to the old output directory
clustersIn = clustersOut;
+ iteration++;
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX);
+ FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
/**
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/FuzzyKMeansDriver.java Fri Sep 30 19:20:47 2011
@@ -362,9 +362,9 @@ public class FuzzyKMeansDriver extends A
}
boolean converged = false;
int iteration = 1;
+ Configuration conf = new Configuration();
while (!converged && iteration <= maxIterations) {
log.info("Fuzzy k-Means Iteration: " + iteration);
- Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(input.toUri(), conf);
for (VectorWritable value
: new SequenceFileDirValueIterable<VectorWritable>(input,
@@ -398,7 +398,9 @@ public class FuzzyKMeansDriver extends A
clustersIn = clustersOut;
iteration++;
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX);
+ FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
private static Path buildClustersMR(Configuration conf,
@@ -424,7 +426,9 @@ public class FuzzyKMeansDriver extends A
clustersIn = clustersOut;
iteration++;
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + "-final");
+ FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
/**
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/kmeans/KMeansDriver.java Fri Sep 30 19:20:47 2011
@@ -292,7 +292,9 @@ public class KMeansDriver extends Abstra
clustersIn = clustersOut;
iteration++;
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1) + org.apache.mahout.clustering.Cluster.FINAL_ITERATION_SUFFIX);
+ FileSystem.get(conf).rename(new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
private static Path buildClustersMR(Configuration conf,
@@ -314,7 +316,9 @@ public class KMeansDriver extends Abstra
clustersIn = clustersOut;
iteration++;
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1) + "-final");
+ FileSystem.get(conf).rename(new Path(output, AbstractCluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
/**
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java Fri Sep 30 19:20:47 2011
@@ -333,7 +333,9 @@ public class MeanShiftCanopyDriver exten
clustersIn = clustersOut;
iteration++;
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + "-final");
+ FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
/**
@@ -369,7 +371,9 @@ public class MeanShiftCanopyDriver exten
conf.set(MAPRED_REDUCE_TASKS, String.valueOf(numReducers));
}
}
- return clustersIn;
+ Path finalClustersIn = new Path(output, Cluster.CLUSTERS_DIR + (iteration-1) + Cluster.FINAL_ITERATION_SUFFIX);
+ FileSystem.get(conf).rename(new Path(output, Cluster.CLUSTERS_DIR + (iteration-1)), finalClustersIn);
+ return finalClustersIn;
}
/**
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/clustering/meanshift/TestMeanShift.java Fri Sep 30 19:20:47 2011
@@ -61,7 +61,7 @@ public final class TestMeanShift extends
/**
* Print the canopies to the transcript
- *
+ *
* @param canopies
* a List<Canopy>
*/
@@ -376,7 +376,7 @@ public final class TestMeanShift extends
optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
ToolRunner.run(conf, new MeanShiftCanopyDriver(), args);
- Path outPart = new Path(output, "clusters-4/part-r-00000");
+ Path outPart = new Path(output, "clusters-4-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
outPart = new Path(output, "clusters-0/part-m-00000");
@@ -430,7 +430,7 @@ public final class TestMeanShift extends
optKey(DefaultOptionCreator.METHOD_OPTION),
DefaultOptionCreator.SEQUENTIAL_METHOD };
ToolRunner.run(new Configuration(), new MeanShiftCanopyDriver(), args);
- Path outPart = new Path(output, "clusters-7/part-r-00000");
+ Path outPart = new Path(output, "clusters-7-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
}
@@ -470,7 +470,7 @@ public final class TestMeanShift extends
optKey(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION), "0.2",
optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
ToolRunner.run(conf, new MeanShiftCanopyDriver(), args);
- Path outPart = new Path(output, "clusters-3/part-r-00000");
+ Path outPart = new Path(output, "clusters-3-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart,
@@ -520,7 +520,7 @@ public final class TestMeanShift extends
optKey(DefaultOptionCreator.METHOD_OPTION),
DefaultOptionCreator.SEQUENTIAL_METHOD };
ToolRunner.run(new Configuration(), new MeanShiftCanopyDriver(), args);
- Path outPart = new Path(output, "clusters-7/part-r-00000");
+ Path outPart = new Path(output, "clusters-7-final/part-r-00000");
long count = HadoopUtil.countRecords(outPart, conf);
assertEquals("count", 3, count);
Iterator<?> iterator = new SequenceFileValueIterator<Writable>(outPart,
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/ClustersFilter.java Fri Sep 30 19:20:47 2011
@@ -19,10 +19,12 @@ package org.apache.mahout.clustering.dis
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
+import org.apache.mahout.clustering.Cluster;
public class ClustersFilter implements PathFilter {
@Override
public boolean accept(Path path) {
- return path.toString().contains("/clusters-");
+ String pathString = path.toString();
+ return pathString.contains("/clusters-") && pathString.endsWith(Cluster.FINAL_ITERATION_SUFFIX);
}
}
Added:
mahout/trunk/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java?rev=1177786&view=auto
==============================================================================
--- mahout/trunk/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java (added)
+++ mahout/trunk/examples/src/test/java/org/apache/mahout/clustering/display/ClustersFilterTest.java Fri Sep 30 19:20:47 2011
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.clustering.display;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.mahout.common.MahoutTestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.IOException;
+
+public class ClustersFilterTest extends MahoutTestCase {
+
+ private Configuration configuration;
+ private Path output;
+
+ @Override
+ @Before
+ public void setUp() throws Exception {
+ super.setUp();
+ configuration = new Configuration();
+ output = getTestTempDirPath();
+ }
+
+ @Test
+ public void testAccept_notFinal() throws Exception {
+ Path path0 = new Path(output, "clusters-0");
+ Path path1 = new Path(output, "clusters-1");
+
+ path0.getFileSystem(configuration).createNewFile(path0);
+ path1.getFileSystem(configuration).createNewFile(path1);
+
+ PathFilter clustersFilter = new ClustersFilter();
+
+ assertFalse(clustersFilter.accept(path0));
+ assertFalse(clustersFilter.accept(path1));
+ }
+
+ @Test
+ public void testAccept_finalPath() throws IOException {
+ Path path0 = new Path(output, "clusters-0");
+ Path path1 = new Path(output, "clusters-1");
+ Path path2 = new Path(output, "clusters-2");
+ Path path3Final = new Path(output, "clusters-3-final");
+
+ path0.getFileSystem(configuration).createNewFile(path0);
+ path1.getFileSystem(configuration).createNewFile(path1);
+ path2.getFileSystem(configuration).createNewFile(path2);
+ path3Final.getFileSystem(configuration).createNewFile(path3Final);
+
+ PathFilter clustersFilter = new ClustersFilter();
+
+ assertFalse(clustersFilter.accept(path0));
+ assertFalse(clustersFilter.accept(path1));
+ assertFalse(clustersFilter.accept(path2));
+ assertTrue(clustersFilter.accept(path3Final));
+ }
+}
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterDumper.java Fri Sep 30 19:20:47 2011
@@ -96,7 +96,7 @@ public final class TestClusterDumper ext
private List<VectorWritable> sampleData;
private String[] termDictionary;
-
+
@Override
@Before
public void setUp() throws Exception {
@@ -176,7 +176,7 @@ public final class TestClusterDumper ext
int maxIterations) throws IOException {
FileSystem fs = FileSystem.get(conf);
for (int i = maxIterations; i >= 0; i--) {
- Path clusters = new Path(output, "clusters-" + i);
+ Path clusters = new Path(output, "clusters-" + i + "-final");
if (fs.exists(clusters)) {
return clusters;
}
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1177786&r1=1177785&r2=1177786&view=diff
==============================================================================
--- mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java (original)
+++ mahout/trunk/integration/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java Fri Sep 30 19:20:47 2011
@@ -391,7 +391,7 @@ public final class TestClusterEvaluator
MeanShiftCanopyDriver.run(conf, testdata, output, measure, kernelProfile,
2.1, 1.0, 0.001, 10, false, true, true);
int numIterations = 10;
- Path clustersIn = new Path(output, "clusters-7");
+ Path clustersIn = new Path(output, "clusters-7-final");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
"clusteredPoints"), output, measure, numIterations, true);
ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
@@ -414,7 +414,7 @@ public final class TestClusterEvaluator
0, true);
int numIterations = 10;
Configuration conf = new Configuration();
- Path clustersIn = new Path(output, "clusters-5");
+ Path clustersIn = new Path(output, "clusters-5-final");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
"clusteredPoints"), output, new EuclideanDistanceMeasure(),
numIterations, true);