Author: jeastman
Date: Wed Sep 29 20:27:46 2010
New Revision: 1002836

URL: http://svn.apache.org/viewvc?rev=1002836&view=rev
Log:
MAHOUT-513:
- replaced RunningSumsGaussianAccumulator with OnlineGaussianAccumulator in CDbwEvaluator
- updated unit tests to new metric values
- added a check to skip empty lines in the synthetic control input mappers

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
    mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
    mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/OnlineGaussianAccumulator.java
 Wed Sep 29 20:27:46 2010
@@ -20,9 +20,8 @@ import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.function.SquareRootFunction;
 
 /**
- * An online Gaussian statistics accumulator based upon Knuth (who cites Wellford) which is declared to be
+ * An online Gaussian statistics accumulator based upon Knuth (who cites Welford) which is declared to be
  * numerically-stable. See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- * The cited algorithm has been modified to accumulate weighted Vectors
  */
 public class OnlineGaussianAccumulator implements GaussianAccumulator {
   private double n = 0;

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
 Wed Sep 29 20:27:46 2010
@@ -48,21 +48,24 @@ public class InputMapper extends Mapper<
         doubles.add(Double.valueOf(value));
       }
     }
-    try {
-      Vector result = (Vector) constructor.newInstance(doubles.size());
-      int index = 0;
-      for (Double d : doubles) {
-        result.set(index++, d);
-      }
-      VectorWritable vectorWritable = new VectorWritable(result);
-      context.write(new Text(String.valueOf(index)), vectorWritable);
+    // ignore empty lines in data file
+    if (!doubles.isEmpty()) {
+      try {
+        Vector result = (Vector) constructor.newInstance(doubles.size());
+        int index = 0;
+        for (Double d : doubles) {
+          result.set(index++, d);
+        }
+        VectorWritable vectorWritable = new VectorWritable(result);
+        context.write(new Text(String.valueOf(index)), vectorWritable);
 
-    } catch (InstantiationException e) {
-      throw new IllegalStateException(e);
-    } catch (IllegalAccessException e) {
-      throw new IllegalStateException(e);
-    } catch (InvocationTargetException e) {
-      throw new IllegalStateException(e);
+      } catch (InstantiationException e) {
+        throw new IllegalStateException(e);
+      } catch (IllegalAccessException e) {
+        throw new IllegalStateException(e);
+      } catch (InvocationTargetException e) {
+        throw new IllegalStateException(e);
+      }
     }
   }
 

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/meanshift/InputMapper.java
 Wed Sep 29 20:27:46 2010
@@ -46,12 +46,15 @@ public class InputMapper extends Mapper<
         doubles.add(Double.valueOf(value));
       }
     }
-    Vector point = new DenseVector(doubles.size());
-    int index = 0;
-    for (Double d : doubles) {
-      point.set(index++, d);
+    // ignore empty lines in input data
+    if (!doubles.isEmpty()) {
+      Vector point = new DenseVector(doubles.size());
+      int index = 0;
+      for (Double d : doubles) {
+        point.set(index++, d);
+      }
+      MeanShiftCanopy canopy = new MeanShiftCanopy(point, nextCanopyId++, new 
EuclideanDistanceMeasure());
+      context.write(new Text(), canopy);
     }
-    MeanShiftCanopy canopy = new MeanShiftCanopy(point, nextCanopyId++, new 
EuclideanDistanceMeasure());
-    context.write(new Text(), canopy);
   }
 }

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/clustering/cdbw/CDbwEvaluator.java
 Wed Sep 29 20:27:46 2010
@@ -32,7 +32,7 @@ import org.apache.hadoop.io.SequenceFile
 import org.apache.hadoop.io.Writable;
 import org.apache.mahout.clustering.Cluster;
 import org.apache.mahout.clustering.GaussianAccumulator;
-import org.apache.mahout.clustering.RunningSumsGaussianAccumulator;
+import org.apache.mahout.clustering.OnlineGaussianAccumulator;
 import org.apache.mahout.clustering.evaluation.RepresentativePointsDriver;
 import org.apache.mahout.clustering.evaluation.RepresentativePointsMapper;
 import org.apache.mahout.common.distance.DistanceMeasure;
@@ -134,7 +134,7 @@ public class CDbwEvaluator {
    */
   private void computeStd(int cI) {
     List<VectorWritable> repPts = representativePoints.get(cI);
-    GaussianAccumulator accumulator = new RunningSumsGaussianAccumulator();
+    GaussianAccumulator accumulator = new OnlineGaussianAccumulator();
     for (VectorWritable vw : repPts) {
       accumulator.observe(vw.get());
     }

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java?rev=1002836&r1=1002835&r2=1002836&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/clustering/cdbw/TestCDbwEvaluator.java
 Wed Sep 29 20:27:46 2010
@@ -175,8 +175,8 @@ public final class TestCDbwEvaluator ext
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, 
clusters, measure);
     assertEquals("inter cluster density", 0.0, 
evaluator.interClusterDensity(), EPSILON);
     assertEquals("separation", 20.485281374238568, evaluator.separation(), 
EPSILON);
-    assertEquals("intra cluster density", 0.8944271909999157, 
evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+    assertEquals("intra cluster density", 0.8, 
evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -185,10 +185,10 @@ public final class TestCDbwEvaluator ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.5, measure);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, 
clusters, measure);
-    assertEquals("inter cluster density", 0.0, 
evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 13.656854249492381, evaluator.separation(), 
EPSILON);
-    assertEquals("intra cluster density", 0.44721359549995787, 
evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 6.107530892134367, evaluator.getCDbw(), EPSILON);
+    assertEquals("inter cluster density", 1.2, 
evaluator.interClusterDensity(), EPSILON);
+    assertEquals("separation", 6.207661022496537, evaluator.separation(), 
EPSILON);
+    assertEquals("intra cluster density", 0.4, 
evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 2.483064408998615, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -197,10 +197,10 @@ public final class TestCDbwEvaluator ext
     DistanceMeasure measure = new EuclideanDistanceMeasure();
     initData(1, 0.75, measure);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, 
clusters, measure);
-    assertEquals("inter cluster density", 0.7634413615167959, 
evaluator.interClusterDensity(), EPSILON);
-    assertEquals("separation", 3.8722167199667066, evaluator.separation(), 
EPSILON);
-    assertEquals("intra cluster density", 0.29814239699997197, 
evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 1.1544719745942431, evaluator.getCDbw(), EPSILON);
+    assertEquals("inter cluster density", 0.682842712474619, 
evaluator.interClusterDensity(), EPSILON);
+    assertEquals("separation", 4.0576740025245694, evaluator.separation(), 
EPSILON);
+    assertEquals("intra cluster density", 0.26666666666666666, 
evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 1.0820464006732184, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -215,8 +215,8 @@ public final class TestCDbwEvaluator ext
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, 
clusters, measure);
     assertEquals("inter cluster density", 0.0, 
evaluator.interClusterDensity(), EPSILON);
     assertEquals("separation", 20.485281374238568, evaluator.separation(), 
EPSILON);
-    assertEquals("intra cluster density", 0.8944271909999157, 
evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+    assertEquals("intra cluster density", 0.8, 
evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
   }
 
   @Test
@@ -232,8 +232,8 @@ public final class TestCDbwEvaluator ext
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, 
clusters, measure);
     assertEquals("inter cluster density", 0.0, 
evaluator.interClusterDensity(), EPSILON);
     assertEquals("separation", 20.485281374238568, evaluator.separation(), 
EPSILON);
-    assertEquals("intra cluster density", 0.8944271909999157, 
evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+    assertEquals("intra cluster density", 0.8, 
evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
   }
 
   /**
@@ -256,8 +256,8 @@ public final class TestCDbwEvaluator ext
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, 
clusters, measure);
     assertEquals("inter cluster density", 0.0, 
evaluator.interClusterDensity(), EPSILON);
     assertEquals("separation", 20.485281374238568, evaluator.separation(), 
EPSILON);
-    assertEquals("intra cluster density", 0.8944271909999157, 
evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 18.322592676403097, evaluator.getCDbw(), EPSILON);
+    assertEquals("intra cluster density", 0.8, 
evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 16.388225099390855, evaluator.getCDbw(), EPSILON);
   }
 
   /**
@@ -274,17 +274,17 @@ public final class TestCDbwEvaluator ext
     clusters.add(cluster);
     List<VectorWritable> points = new ArrayList<VectorWritable>();
     Vector delta = new DenseVector(new double[] { 0, Double.MIN_NORMAL });
-    points.add(new VectorWritable(delta));
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
-    points.add(new VectorWritable(cluster.getCenter()));
+    points.add(new VectorWritable(delta.clone()));
+    points.add(new VectorWritable(delta.clone()));
+    points.add(new VectorWritable(delta.clone()));
+    points.add(new VectorWritable(delta.clone()));
+    points.add(new VectorWritable(delta.clone()));
     representativePoints.put(cluster.getId(), points);
     CDbwEvaluator evaluator = new CDbwEvaluator(representativePoints, 
clusters, measure);
     assertEquals("inter cluster density", 0.0, 
evaluator.interClusterDensity(), EPSILON);
     assertEquals("separation", 28.970562748477143, evaluator.separation(), 
EPSILON);
-    assertEquals("intra cluster density", 2.0124611797498106, 
evaluator.intraClusterDensity(), EPSILON);
-    assertEquals("CDbw", 58.30213288681623, evaluator.getCDbw(), EPSILON);
+    assertEquals("intra cluster density", 1.8, 
evaluator.intraClusterDensity(), EPSILON);
+    assertEquals("CDbw", 52.147012947258865, evaluator.getCDbw(), EPSILON);
   }
 
   @Test


Reply via email to