Author: jeastman
Date: Wed Sep 29 20:27:46 2010
New Revision: 1002836
URL: http://svn.apache.org/viewvc?rev=1002836&view=rev
Log:
MAHOUT-513:
- replaced RunningSumsGaussianAccumulator with OnlineGaussianAccumulator in
CDbwEvaluator
- updated unit tests to new metric values
- added a check (if (!doubles.isEmpty())) so the synthetic control input mappers ignore empty lines
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
Wed Sep 29 20:27:46 2010
@@ -20,9 +20,8 @@ import org.apache.mahout.math.Vector;
import org.apache.mahout.math.function.SquareRootFunction;
/**
- * An online Gaussian statistics accumulator based upon Knuth (who cites Wellford) which is declared to be
+ * An online Gaussian statistics accumulator based upon Knuth (who cites Welford) which is declared to be
* numerically-stable. See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- * The cited algorithm has been modified to accumulate weighted Vectors
*/
public class OnlineGaussianAccumulator implements GaussianAccumulator {
private double n = 0;
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
Wed Sep 29 20:27:46 2010
@@ -48,21 +48,24 @@ public class InputMapper extends Mapper<
doubles.add(Double.valueOf(value));
}
}
- try {
- Vector result = (Vector) constructor.newInstance(doubles.size());
- int index = 0;
- for (Double d : doubles) {
- result.set(index++, d);
- }
- VectorWritable vectorWritable = new VectorWritable(result);
- context.write(new Text(String.valueOf(index)), vectorWritable);
+ // ignore empty lines in data file
+ if (!doubles.isEmpty()) {
+ try {
+ Vector result = (Vector) constructor.newInstance(doubles.size());
+ int index = 0;
+ for (Double d : doubles) {
+ result.set(index++, d);
+ }
+ VectorWritable vectorWritable = new VectorWritable(result);
+ context.write(new Text(String.valueOf(index)), vectorWritable);
- } catch (InstantiationException e) {
- throw new IllegalStateException(e);
- } catch (IllegalAccessException e) {
- throw new IllegalStateException(e);
- } catch (InvocationTargetException e) {
- throw new IllegalStateException(e);
+ } catch (InstantiationException e) {
+ throw new IllegalStateException(e);
+ } catch (IllegalAccessException e) {
+ throw new IllegalStateException(e);
+ } catch (InvocationTargetException e) {
+ throw new IllegalStateException(e);
+ }
}
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
Wed Sep 29 20:27:46 2010
@@ -46,12 +46,15 @@ public class InputMapper extends Mapper<
doubles.add(Double.valueOf(value));
}
}
- Vector point = new DenseVector(doubles.size());
- int index = 0;
- for (Double d : doubles) {
- point.set(index++, d);
+ // ignore empty lines in input data
+ if (!doubles.isEmpty()) {
+ Vector point = new DenseVector(doubles.size());
+ int index = 0;
+ for (Double d : doubles) {
+ point.set(index++, d);
+ }
+ MeanShiftCanopy canopy = new MeanShiftCanopy(point, nextCanopyId++, new EuclideanDistanceMeasure());
+ context.write(new Text(), canopy);
}
- MeanShiftCanopy canopy = new MeanShiftCanopy(point, nextCanopyId++, new EuclideanDistanceMeasure());
- context.write(new Text(), canopy);
}
}
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
Wed Sep 29 20:27:46 2010
@@ -32,7 +32,7 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.io.Writable;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.GaussianAccumulator;
-import org.apache.mahout.clustering.RunningSumsGaussianAccumulator;
+import org.apache.mahout.clustering.OnlineGaussianAccumulator;
import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
import org.apache.mahout.common.distance.DistanceMeasure;
@@ -134,7 +134,7 @@ public class CDbwEvaluator {
*/
private void computeStd(int cI) {
List<VectorWritable> repPts = representativePoints.get(cI);
- GaussianAccumulator accumulator = new RunningSumsGaussianAccumulator();
+ GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
for (VectorWritable vw : repPts) {
accumulator.observe(vw.get());
}
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
Wed Sep 29 20:27:46 2010
@@ -175,8 +175,8 @@ public final class TestCDbwEvaluator ext
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
- assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
- assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+ assertEquals("intra cluster density", 0.8, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
}
@Test
@@ -185,10 +185,10 @@ public final class TestCDbwEvaluator ext
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.5, measure);
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
- assertEquals("separation", 13.656854249492381, evaluator.separation(), EPSILON);
- assertEquals("intra cluster density", 0.44721359549995787, evaluator.intraClusterDensity(), EPSILON);
- assertEquals("CDbw", 6.107530892134367, evaluator.getCDbw(), EPSILON);
+ assertEquals("inter cluster density", 1.2, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("separation", 6.207661022496537, evaluator.separation(), EPSILON);
+ assertEquals("intra cluster density", 0.4, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 2.483064408998615, evaluator.getCDbw(), EPSILON);
}
@Test
@@ -197,10 +197,10 @@ public final class TestCDbwEvaluator ext
DistanceMeasure measure = new EuclideanDistanceMeasure();
initData(1, 0.75, measure);
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
- assertEquals("inter cluster density", 0.7634413615167959, evaluator.interClusterDensity(), EPSILON);
- assertEquals("separation", 3.8722167199667066, evaluator.separation(), EPSILON);
- assertEquals("intra cluster density", 0.29814239699997197, evaluator.intraClusterDensity(), EPSILON);
- assertEquals("CDbw", 1.1544719745942431, evaluator.getCDbw(), EPSILON);
+ assertEquals("inter cluster density", 0.682842712474619, evaluator.interClusterDensity(), EPSILON);
+ assertEquals("separation", 4.0576740025245694, evaluator.separation(), EPSILON);
+ assertEquals("intra cluster density", 0.26666666666666666, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 1.0820464006732184, evaluator.getCDbw(), EPSILON);
}
@Test
@@ -215,8 +215,8 @@ public final class TestCDbwEvaluator ext
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
- assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
- assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+ assertEquals("intra cluster density", 0.8, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
}
@Test
@@ -232,8 +232,8 @@ public final class TestCDbwEvaluator ext
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
- assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
- assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+ assertEquals("intra cluster density", 0.8, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
}
/**
@@ -256,8 +256,8 @@ public final class TestCDbwEvaluator ext
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
assertEquals("separation", 20.485281374238568, evaluator.separation(), EPSILON);
- assertEquals("intra cluster density", 0.8944271909999157, evaluator.intraClusterDensity(), EPSILON);
- assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+ assertEquals("intra cluster density", 0.8, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
}
/**
@@ -274,17 +274,17 @@ public final class TestCDbwEvaluator ext
clusters.add(cluster);
List<VectorWritable> points = new ArrayList<VectorWritable>();
Vector delta = new DenseVector(new double[] { 0, Double.MIN_NORMAL });
- points.add(new VectorWritable(delta));
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
- points.add(new VectorWritable(cluster.getCenter()));
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
+ points.add(new VectorWritable(delta.clone()));
representativePoints.put(cluster.getId(), points);
CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, clusters, measure);
assertEquals("inter cluster density", 0.0, evaluator.interClusterDensity(), EPSILON);
assertEquals("separation", 28.970562748477143, evaluator.separation(), EPSILON);
- assertEquals("intra cluster density", 2.0124611797498106, evaluator.intraClusterDensity(), EPSILON);
- assertEquals("CDbw", 58.30213288681623, evaluator.getCDbw(), EPSILON);
+ assertEquals("intra cluster density", 1.8, evaluator.intraClusterDensity(), EPSILON);
+ assertEquals("CDbw", 52.147012947258865, evaluator.getCDbw(), EPSILON);
}
@Test