Author: jeastman
Date: Sat Sep 25 16:33:05 2010
New Revision: 1001258
URL: http://svn.apache.org/viewvc?rev=1001258&view=rev
Log:
MAHOUT-236:
- Modified the ClusterEvaluator test (TestClusterEvaluator) to use the same
  dataset as the clustering Display examples.
- Switched the evaluator test to run sequential versions of the clustering jobs
  to reduce execution time.
- Fixed a clusteredPoints path bug in sequential Mean Shift clustering.
- All tests run.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java?rev=1001258&r1=1001257&r2=1001258&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/meanshift/MeanShiftCanopyDriver.java
Sat Sep 25 16:33:05 2010
@@ -426,13 +426,12 @@ public class MeanShiftCanopyDriver exten
// iterate over all points, assigning each to the closest canopy and outputting that clustering
fs = FileSystem.get(input.toUri(), conf);
status = fs.listStatus(input, new OutputLogFilter());
- Path outPath = new Path(output,
CanopyDriver.DEFAULT_CLUSTERED_POINTS_DIRECTORY);
int part = 0;
for (FileStatus s : status) {
SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(),
conf);
SequenceFile.Writer writer = new SequenceFile.Writer(fs,
conf,
- new Path(outPath,
"part-m-" + part++),
+ new Path(output,
"part-m-" + part++),
IntWritable.class,
WeightedVectorWritable.class);
try {
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java?rev=1001258&r1=1001257&r2=1001258&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/evaluation/RepresentativePointsDriver.java
Sat Sep 25 16:33:05 2010
@@ -116,7 +116,7 @@ public final class RepresentativePointsD
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
IntWritable.class, VectorWritable.class);
while (reader.next(key, value)) {
Cluster cluster = (Cluster) value;
- log.debug("C-" + cluster.getId() + ": " +
AbstractCluster.formatVector(cluster.getCenter(), null));
+ log.info("C-" + cluster.getId() + ": " +
AbstractCluster.formatVector(cluster.getCenter(), null));
writer.append(new IntWritable(cluster.getId()), new
VectorWritable(cluster.getCenter()));
}
writer.close();
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java?rev=1001258&r1=1001257&r2=1001258&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/TestClusterEvaluator.java
Sat Sep 25 16:33:05 2010
@@ -33,6 +33,7 @@ import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.canopy.Canopy;
import org.apache.mahout.clustering.canopy.CanopyDriver;
import org.apache.mahout.clustering.dirichlet.DirichletDriver;
+import org.apache.mahout.clustering.dirichlet.UncommonDistributions;
import
org.apache.mahout.clustering.dirichlet.models.GaussianClusterDistribution;
import org.apache.mahout.clustering.evaluation.ClusterEvaluator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
@@ -47,25 +48,70 @@ import org.apache.mahout.math.DenseVecto
import org.apache.mahout.math.VectorWritable;
import org.junit.Before;
import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public final class TestClusterEvaluator extends MahoutTestCase {
private static final double[][] REFERENCE = { { 1, 1 }, { 2, 1 }, { 1, 2 },
{ 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 },
{ 5, 5 } };
+ private List<VectorWritable> referenceData = new ArrayList<VectorWritable>();
+
+ private List<VectorWritable> sampleData = new ArrayList<VectorWritable>();
+
private Map<Integer, List<VectorWritable>> representativePoints;
private List<Cluster> clusters;
+ private static final Logger log =
LoggerFactory.getLogger(TestClusterEvaluator.class);
+
+ private Configuration conf;
+
+ private FileSystem fs;
+
+ private Path testdata;
+
+ private Path output;
+
@Override
@Before
public void setUp() throws Exception {
super.setUp();
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
- // Create test data
- List<VectorWritable> sampleData =
TestKmeansClustering.getPointsWritable(REFERENCE);
- ClusteringTestUtils.writePointsToFile(sampleData,
getTestTempFilePath("testdata/file1"), fs, conf);
+ conf = new Configuration();
+ fs = FileSystem.get(conf);
+ testdata = getTestTempDirPath("testdata");
+ output = getTestTempDirPath("output");
+ // Create small reference data set
+ referenceData = TestKmeansClustering.getPointsWritable(REFERENCE);
+ // generate larger test data set for the clustering tests to chew on
+ generateSamples();
+ }
+
+ /**
+ * Generate random samples and add them to the sampleData
+ *
+ * @param num
+ * int number of samples to generate
+ * @param mx
+ * double x-value of the sample mean
+ * @param my
+ * double y-value of the sample mean
+ * @param sd
+ * double standard deviation of the samples
+ */
+ private void generateSamples(int num, double mx, double my, double sd) {
+ log.info("Generating {} samples m=[{}, {}] sd={}", new Object[] { num, mx,
my, sd });
+ for (int i = 0; i < num; i++) {
+ sampleData.add(new VectorWritable(new DenseVector(new double[] {
UncommonDistributions.rNorm(mx, sd),
+ UncommonDistributions.rNorm(my, sd) })));
+ }
+ }
+
+ private void generateSamples() {
+ generateSamples(500, 1, 1, 3);
+ generateSamples(300, 1, 0, 0.5);
+ generateSamples(300, 0, 2, 0.1);
}
private void checkRefPoints(int numIterations) throws IOException {
@@ -115,7 +161,8 @@ public final class TestClusterEvaluator
}
@Test
- public void testCluster0() {
+ public void testCluster0() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints,
clusters, measure);
@@ -124,7 +171,8 @@ public final class TestClusterEvaluator
}
@Test
- public void testCluster1() {
+ public void testCluster1() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.5, measure);
ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints,
clusters, measure);
@@ -133,7 +181,8 @@ public final class TestClusterEvaluator
}
@Test
- public void testCluster2() {
+ public void testCluster2() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.75, measure);
ClusterEvaluator evaluator = new ClusterEvaluator(representativePoints,
clusters, measure);
@@ -142,7 +191,8 @@ public final class TestClusterEvaluator
}
@Test
- public void testEmptyCluster() {
+ public void testEmptyCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
Canopy cluster = new Canopy(new DenseVector(new double[] { 10, 10 }), 19,
measure);
@@ -155,7 +205,8 @@ public final class TestClusterEvaluator
}
@Test
- public void testSingleValueCluster() {
+ public void testSingleValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19,
measure);
@@ -171,9 +222,11 @@ public final class TestClusterEvaluator
/**
* Representative points extraction will duplicate the cluster center if the cluster has no
* assigned points. These clusters should be ignored like empty clusters above
+ * @throws IOException
*/
@Test
- public void testAllSameValueCluster() {
+ public void testAllSameValueCluster() throws IOException {
+ ClusteringTestUtils.writePointsToFile(referenceData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.25, measure);
Canopy cluster = new Canopy(new DenseVector(new double[] { 0, 0 }), 19,
measure);
@@ -189,12 +242,12 @@ public final class TestClusterEvaluator
}
@Test
- public void testCanopy() throws Exception { // now run the Job
+ public void testCanopy() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
Configuration conf = new Configuration();
- CanopyDriver.run(conf, getTestTempDirPath("testdata"),
getTestTempDirPath("output"), measure, 3.1, 2.1, true, false);
- int numIterations = 2;
- Path output = getTestTempDirPath("output");
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, true, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-0");
RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
"clusteredPoints"), output, measure, numIterations);
ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
@@ -207,28 +260,16 @@ public final class TestClusterEvaluator
@Test
public void testKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
Configuration conf = new Configuration();
- CanopyDriver.run(conf,
- getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- measure,
- 3.1,
- 2.1,
- false,
- false);
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, true);
// now run the KMeans job
- Path output = getTestTempDirPath("output");
- KMeansDriver.run(getTestTempDirPath("testdata"), new Path(output,
"clusters-0"), output, measure, 0.001, 10, true, false);
- int numIterations = 2;
+ KMeansDriver.run(testdata, new Path(output, "clusters-0"), output,
measure, 0.001, 10, true, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-2");
- RepresentativePointsDriver.run(conf,
- clustersIn,
- new Path(output, "clusteredPoints"),
- output,
- measure,
- numIterations);
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
"clusteredPoints"), output, measure, numIterations);
ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
// now print out the Results
System.out.println("Intra-cluster density = " +
evaluator.intraClusterDensity());
@@ -238,38 +279,16 @@ public final class TestClusterEvaluator
@Test
public void testFuzzyKmeans() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
// now run the Canopy job to prime kMeans canopies
Configuration conf = new Configuration();
- CanopyDriver.run(conf,
- getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- measure,
- 3.1,
- 2.1,
- false,
- false);
+ CanopyDriver.run(conf, testdata, output, measure, 3.1, 1.1, false, true);
// now run the KMeans job
- Path output = getTestTempDirPath("output");
- FuzzyKMeansDriver.run(getTestTempDirPath("testdata"),
- new Path(output, "clusters-0"),
- output,
- measure,
- 0.001,
- 10,
- 2,
- true,
- true,
- 0,
- false);
- int numIterations = 2;
+ FuzzyKMeansDriver.run(testdata, new Path(output, "clusters-0"), output,
measure, 0.001, 10, 2, true, true, 0, true);
+ int numIterations = 10;
Path clustersIn = new Path(output, "clusters-4");
- RepresentativePointsDriver.run(conf,
- clustersIn,
- new Path(output, "clusteredPoints"),
- output,
- measure,
- numIterations);
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
"clusteredPoints"), output, measure, numIterations);
ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
// now print out the Results
System.out.println("Intra-cluster density = " +
evaluator.intraClusterDensity());
@@ -279,27 +298,13 @@ public final class TestClusterEvaluator
@Test
public void testMeanShift() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata,
"file1"), fs, conf);
DistanceMeasure measure = new EuclideanDistanceMeasure();
Configuration conf = new Configuration();
- new MeanShiftCanopyDriver().run(conf,
- getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- measure,
- 2.1,
- 1.0,
- 0.001,
- 10,
- false,
- true, false);
- int numIterations = 2;
- Path output = getTestTempDirPath("output");
- Path clustersIn = new Path(output, "clusters-2");
- RepresentativePointsDriver.run(conf,
- clustersIn,
- new Path(output, "clusteredPoints"),
- output,
- measure,
- numIterations);
+ new MeanShiftCanopyDriver().run(conf, testdata, output, measure, 2.1, 1.0,
0.001, 10, false, true, true);
+ int numIterations = 10;
+ Path clustersIn = new Path(output, "clusters-10");
+ RepresentativePointsDriver.run(conf, clustersIn, new Path(output,
"clusteredPoints"), output, measure, numIterations);
ClusterEvaluator evaluator = new ClusterEvaluator(conf, clustersIn);
// now print out the Results
System.out.println("Intra-cluster density = " +
evaluator.intraClusterDensity());
@@ -309,19 +314,10 @@ public final class TestClusterEvaluator
@Test
public void testDirichlet() throws Exception {
+ ClusteringTestUtils.writePointsToFile(sampleData, new Path(testdata,
"file1"), fs, conf);
ModelDistribution<VectorWritable> modelDistribution = new
GaussianClusterDistribution(new VectorWritable(new DenseVector(2)));
- DirichletDriver.run(getTestTempDirPath("testdata"),
- getTestTempDirPath("output"),
- modelDistribution,
- 15,
- 5,
- 1.0,
- true,
- true,
- 0,
- true);
- int numIterations = 2;
- Path output = getTestTempDirPath("output");
+ DirichletDriver.run(testdata, output, modelDistribution, 15, 5, 1.0, true,
true, 0, true);
+ int numIterations = 10;
Configuration conf = new Configuration();
Path clustersIn = new Path(output, "clusters-5");
RepresentativePointsDriver.run(conf,