http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java
new file mode 100644
index 0000000..1a41f6b
--- /dev/null
+++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java
@@ -0,0 +1,509 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2011 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import com.yahoo.labs.samoa.evaluation.measures.CMM_GTAnalysis.CMMPoint;
+import com.yahoo.labs.samoa.moa.cluster.Cluster;
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.cluster.SphereCluster;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import java.util.ArrayList;
+
+/**
+ * [CMM.java]
+ * 
+ * CMM: Main class
+ * 
+ * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering 
on Evolving Data Streams", KDD, 2011
+ * 
+ * @author Timm jansen
+ * Data Management and Data Exploration Group, RWTH Aachen University
+*/
+
+public class CMM extends MeasureCollection{
+       
+       private static final long serialVersionUID = 1L;
+
+       /**
+     * found clustering
+     */
+       private Clustering clustering;
+
+       /**
+        * the ground truth analysis
+        */
+    private CMM_GTAnalysis gtAnalysis;
+
+    /**
+     * number of points within the horizon
+     */
+    private int numPoints;
+    
+    /**
+     * number of clusters in the found clustering
+     */
+    private int numFClusters;
+    
+    /**
+     * number of cluster in the adjusted groundtruth clustering that 
+     * was calculated through the groundtruth analysis 
+     */
+    private int numGT0Classes;
+
+    /**
+     * match found clusters to GT clusters
+     */
+    private int matchMap[]; 
+    
+    /**
+     * pointInclusionProbFC[p][C] contains the probability of point p 
+     * being included in cluster C 
+     */
+    private double[][] pointInclusionProbFC;
+    
+    /**
+     * threshold that defines when a point is being considered belonging to a 
cluster
+     */
+    private double pointInclusionProbThreshold = 0.5;
+    
+    /**
+     * parameterize the error weight of missed points (default 1)
+     */
+    private double lamdaMissed = 1;
+
+    
+    /**
+     * enable/disable debug mode
+     */
+    public boolean debug = false;
+
+    
+    /**
+     * enable/disable class merge (main feature of ground truth analysis)
+     */
+    public boolean enableClassMerge = true;
+    
+    /**
+     * enable/disable model error
+     * when enabled errors that are caused by the underling cluster model will 
not be counted
+     */
+    public boolean enableModelError = true;
+    
+
+    @Override
+    protected String[] getNames() {
+        String[] names = {"CMM","CMM Basic","CMM Missed","CMM Misplaced","CMM 
Noise",
+                            "CA Seperability", "CA Noise", "CA Modell"};
+        return names;
+    }
+
+    @Override
+    protected boolean[] getDefaultEnabled() {
+        boolean [] defaults = {false, false, false, false, false, false, 
false, false};
+        return defaults;
+    }
+
+    
+    @Override
+    public void evaluateClustering(Clustering clustering, Clustering 
trueClustering, ArrayList<DataPoint> points) throws Exception{
+        this.clustering = clustering;
+
+        numPoints = points.size();
+        numFClusters = clustering.size();
+
+        gtAnalysis = new CMM_GTAnalysis(trueClustering, points, 
enableClassMerge);
+
+        numGT0Classes = gtAnalysis.getNumberOfGT0Classes();
+
+        addValue("CA Seperability",gtAnalysis.getClassSeparability());
+        addValue("CA Noise",gtAnalysis.getNoiseSeparability());
+        addValue("CA Modell",gtAnalysis.getModelQuality());
+
+        /* init the matching and point distances */
+        calculateMatching();
+
+        /* calculate the actual error */
+        calculateError();
+    }
+
+    
+    /**
+     * calculates the CMM specific matching between found clusters and ground 
truth clusters 
+     */
+    private void calculateMatching(){
+
+       /**
+        * found cluster frequencies  
+        */
+        int[][] mapFC = new int[numFClusters][numGT0Classes];
+
+        /**
+        * ground truth cluster frequencies
+        */
+        int[][] mapGT = new int[numGT0Classes][numGT0Classes];
+        int [] sumsFC = new int[numFClusters];
+
+        //calculate fuzzy mapping from
+        pointInclusionProbFC = new double[numPoints][numFClusters];
+        for (int p = 0; p < numPoints; p++) {
+            CMMPoint cmdp = gtAnalysis.getPoint(p);
+            //found cluster frequencies
+            for (int fc = 0; fc < numFClusters; fc++) {
+                Cluster cl = clustering.get(fc);
+                pointInclusionProbFC[p][fc] = cl.getInclusionProbability(cmdp);
+                if (pointInclusionProbFC[p][fc] >= 
pointInclusionProbThreshold) {
+                    //make sure we don't count points twice that are contained 
in two merged clusters
+                    if(cmdp.isNoise()) continue;
+                    mapFC[fc][cmdp.workclass()]++;
+                    sumsFC[fc]++;
+                }
+            }
+
+            //ground truth cluster frequencies
+            if(!cmdp.isNoise()){
+                for(int hc = 0; hc < numGT0Classes;hc++){
+                    if(hc == cmdp.workclass()){
+                        mapGT[hc][hc]++;
+                    }
+                    else{
+                        
if(gtAnalysis.getGT0Cluster(hc).getInclusionProbability(cmdp) >= 1){
+                            mapGT[hc][cmdp.workclass()]++;
+                        }
+                    }
+                }
+            }
+        }
+
+        //assign each found cluster to a hidden cluster
+        matchMap = new int[numFClusters];
+        for (int fc = 0; fc < numFClusters; fc++) {
+            int matchIndex = -1;
+            //check if we only have one entry anyway
+            for (int hc0 = 0; hc0 < numGT0Classes; hc0++) {
+                if(mapFC[fc][hc0]!=0){
+                    if(matchIndex == -1)
+                        matchIndex = hc0;
+                    else{
+                        matchIndex = -1;
+                        break;
+                    }
+                }
+            }
+
+            //more then one entry, so look for most similar frequency profile
+            int minDiff = Integer.MAX_VALUE;
+            if(sumsFC[fc]!=0 && matchIndex == -1){
+                ArrayList<Integer> fitCandidates = new ArrayList<Integer>();
+                for (int hc0 = 0; hc0 < numGT0Classes; hc0++) {
+                    int errDiff = 0;
+                    for (int hc1 = 0; hc1 < numGT0Classes; hc1++) {
+                        //fc profile doesn't fit into current hc profile
+                        double freq_diff = mapFC[fc][hc1] - mapGT[hc0][hc1];
+                        if(freq_diff > 0){
+                            errDiff+= freq_diff;
+                        }
+                    }
+                    if(errDiff == 0){
+                        fitCandidates.add(hc0);
+                    }
+                    if(errDiff < minDiff){
+                        minDiff = errDiff;
+                        matchIndex = hc0;
+                    }
+                    if(debug){
+                        
//System.out.println("FC"+fc+"("+Arrays.toString(mapFC[fc])+") - 
HC0_"+hc0+"("+Arrays.toString(mapGT[hc0])+"):"+errDiff);
+                    }
+                }
+                //if we have a fitting profile overwrite the min error choice
+                //if we have multiple fit candidates, use majority vote of 
corresponding classes
+                if(fitCandidates.size()!=0){
+                    int bestGTfit = fitCandidates.get(0);
+                    for(int i = 1; i < fitCandidates.size(); i++){
+                        int GTfit = fitCandidates.get(i);
+                        if(mapFC[fc][GTfit] > mapFC[fc][bestGTfit])
+                            bestGTfit=fitCandidates.get(i);
+                    }
+                    matchIndex = bestGTfit;
+                }
+            }
+            
+            matchMap[fc] = matchIndex;
+            int realMatch = -1;
+            if(matchIndex==-1){
+                if(debug)
+                    System.out.println("No cluster match: needs to be 
implemented?");
+            }
+            else{
+                realMatch = gtAnalysis.getGT0Cluster(matchMap[fc]).getLabel();
+            }
+            clustering.get(fc).setMeasureValue("CMM Match", "C"+realMatch);
+            clustering.get(fc).setMeasureValue("CMM Workclass", 
"C"+matchMap[fc]);
+        }
+
+        //print matching table
+        if(debug){
+            for (int i = 0; i < numFClusters; i++) {
+                    System.out.print("C"+((int)clustering.get(i).getId()) + " 
N:"+((int)clustering.get(i).getWeight())+"  |  ");
+                    for (int j = 0; j < numGT0Classes; j++) {
+                          System.out.print(mapFC[i][j] + " ");
+                    }
+                    System.out.print(" = "+sumsFC[i] + " | ");
+                    String match = "-";
+                    if (matchMap[i]!=-1) {
+                        match = 
Integer.toString(gtAnalysis.getGT0Cluster(matchMap[i]).getLabel());
+                    }
+                    System.out.println(" --> " + match + 
"(work:"+matchMap[i]+")");
+             }
+        }
+    }
+    
+    
+    /**
+     * Calculate the actual error values
+     */
+    private void calculateError(){
+        int totalErrorCount = 0;
+        int totalRedundancy = 0;
+        int trueCoverage = 0;
+        int totalCoverage = 0;
+
+        int numNoise = 0;
+        double errorNoise = 0;
+        double errorNoiseMax = 0;
+
+        double errorMissed = 0;
+        double errorMissedMax = 0;
+
+        double errorMisplaced = 0;
+        double errorMisplacedMax = 0;
+
+        double totalError = 0.0;
+        double totalErrorMax = 0.0;
+
+        /** mainly iterate over all points and find the right error value for 
the point.
+         *  within the same run calculate various other stuff like coverage 
etc...
+         */
+        for (int p = 0; p < numPoints; p++) {
+            CMMPoint cmdp = gtAnalysis.getPoint(p);
+            double weight = cmdp.weight();
+            //noise counter
+            if(cmdp.isNoise()){
+                numNoise++;
+                //this is always 1
+                errorNoiseMax+=cmdp.connectivity*weight;
+            }
+            else{
+                errorMissedMax+=cmdp.connectivity*weight;
+                errorMisplacedMax+=cmdp.connectivity*weight;
+            }
+            //sum up maxError as the individual errors are the quality 
weighted between 0-1
+            totalErrorMax+=cmdp.connectivity*weight;
+
+
+            double err = 0;
+            int coverage = 0;
+
+            //check every FCluster
+            for (int c = 0; c < numFClusters; c++) {
+                //contained in cluster c?
+                if(pointInclusionProbFC[p][c] >= pointInclusionProbThreshold){
+                    coverage++;
+
+                    if(!cmdp.isNoise()){
+                        //PLACED CORRECTLY
+                        if(matchMap[c] == cmdp.workclass()){
+                        }
+                        //MISPLACED
+                        else{
+                            double errvalue = misplacedError(cmdp, c);
+                            if(errvalue > err)
+                                err = errvalue;
+                        }
+                    }
+                    else{
+                        //NOISE
+                        double errvalue = noiseError(cmdp, c);
+                        if(errvalue > err) err = errvalue;
+                    }
+                }
+            }
+            //not in any cluster
+            if(coverage == 0){
+                //MISSED
+                if(!cmdp.isNoise()){
+                    err = missedError(cmdp,true);
+                    errorMissed+= weight*err;
+                }
+                //NOISE
+                else{
+                }
+            }
+            else{
+                if(!cmdp.isNoise()){
+                    errorMisplaced+= err*weight;
+                }
+                else{
+                    errorNoise+= err*weight;
+                }
+            }
+
+            /* processing of other evaluation values */
+            totalError+= err*weight;
+            if(err!=0)totalErrorCount++;
+            if(coverage>0) totalCoverage++;  //points covered by clustering 
(incl. noise)
+            if(coverage>0 && !cmdp.isNoise()) trueCoverage++; //points covered 
by clustering, don't count noise
+            if(coverage>1) totalRedundancy++; //include noise
+
+            cmdp.p.setMeasureValue("CMM",err);
+            cmdp.p.setMeasureValue("Redundancy", coverage);
+        }
+
+        addValue("CMM", (totalErrorMax!=0)?1-totalError/totalErrorMax:1);
+        addValue("CMM Missed", 
(errorMissedMax!=0)?1-errorMissed/errorMissedMax:1);
+        addValue("CMM Misplaced", 
(errorMisplacedMax!=0)?1-errorMisplaced/errorMisplacedMax:1);
+        addValue("CMM Noise", (errorNoiseMax!=0)?1-errorNoise/errorNoiseMax:1);
+        addValue("CMM Basic", 1-((double)totalErrorCount/(double)numPoints));
+
+        if(debug){
+            System.out.println("-------------");
+        }
+    }
+
+
+    private double noiseError(CMMPoint cmdp, int assignedClusterID){
+        int gtAssignedID = matchMap[assignedClusterID];
+        double error;
+        
+        //Cluster wasn't matched, so just contains noise
+        //TODO: Noiscluster?
+        //also happens when we decrease the radius and there is only a noise 
point in the center
+        if(gtAssignedID==-1){
+            error = 1;
+            cmdp.p.setMeasureValue("CMM Type","noise - cluster");
+        }
+        else{
+            if(enableModelError && 
gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= 
pointInclusionProbThreshold){
+                //set to MIN_ERROR so we can still track the error
+               error = 0.00001;
+                cmdp.p.setMeasureValue("CMM Type","noise - byModel");
+            }
+            else{
+                error = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID);
+                cmdp.p.setMeasureValue("CMM Type","noise");
+            }
+        }
+
+        return error;
+    }
+
+    private double missedError(CMMPoint cmdp, boolean useHullDistance){
+        cmdp.p.setMeasureValue("CMM Type","missed");
+        if(!useHullDistance){
+            return cmdp.connectivity;
+        }
+        else{
+            //main idea: look at relative distance of missed point to cluster
+            double minHullDist = 1;
+            for (int fc = 0; fc < numFClusters; fc++){
+                //if fc is mappend onto the class of the point, check it for 
its hulldist
+                if(matchMap[fc]!=-1 && matchMap[fc] == cmdp.workclass()){
+                    if(clustering.get(fc) instanceof SphereCluster){
+                        SphereCluster sc = (SphereCluster)clustering.get(fc);
+                        double distanceFC = sc.getCenterDistance(cmdp);
+                        double radius = sc.getRadius();
+                        double hullDist = 
(distanceFC-radius)/(distanceFC+radius);
+                        if(hullDist < minHullDist)
+                            minHullDist = hullDist;
+                    }
+                    else{
+                        double min = 1;
+                        double max = 1;
+
+                        //TODO: distance for random shape
+                        //generate X points from the cluster with 
clustering.get(fc).sample(null)
+                        //and find Min and Max values
+
+                        double hullDist = min/max;
+                        if(hullDist < minHullDist)
+                            minHullDist = hullDist;
+                    }
+                }
+            }
+
+            //use distance as weight
+            if(minHullDist>1) minHullDist = 1;
+
+            double weight = (1-Math.exp(-lamdaMissed*minHullDist));
+            cmdp.p.setMeasureValue("HullDistWeight",weight);
+
+            return weight*cmdp.connectivity;
+        }
+    }
+
+
+    private double misplacedError(CMMPoint cmdp, int assignedClusterID){
+        double weight = 0;
+
+        int gtAssignedID = matchMap[assignedClusterID];
+        //TODO take care of noise cluster?
+        if(gtAssignedID ==-1){
+            System.out.println("Point "+cmdp.getTimestamp()+" from gtcluster 
"+cmdp.trueClass+" assigned to noise cluster "+assignedClusterID);
+            return 1;
+        }
+
+        if(gtAssignedID == cmdp.workclass())
+            return 0;
+        else{
+            //assigned and real GT0 cluster are not connected, but does the 
model have the
+            //chance of separating this point after all?
+            if(enableModelError && 
gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= 
pointInclusionProbThreshold){
+                weight = 0;
+                cmdp.p.setMeasureValue("CMM Type","missplaced - byModel");
+            }
+            else{
+                //point was mapped onto wrong cluster (assigned), so check how 
far away
+                //the nearest point is within the wrongly assigned cluster
+                weight = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID);
+            }
+        }
+        double err_value;
+        //set to MIN_ERROR so we can still track the error
+        if(weight == 0){
+            err_value= 0.00001;
+        }
+        else{
+            err_value = weight*cmdp.connectivity;
+            cmdp.p.setMeasureValue("CMM Type","missplaced");
+        }
+
+        return err_value;
+    }
+
+    public String getParameterString(){
+        String para = gtAnalysis.getParameterString();
+        para+="lambdaMissed="+lamdaMissed+";";
+        return para;
+    }
+
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java
 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java
new file mode 100644
index 0000000..53fb4dc
--- /dev/null
+++ 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java
@@ -0,0 +1,843 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2011 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import com.yahoo.labs.samoa.instances.Instance;
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.core.AutoExpandVector;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+
+/**
+ * [CMM_GTAnalysis.java]
+ * 
+ * CMM: Ground truth analysis
+ * 
+ * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering 
on Evolving Data Streams", KDD, 2011
+ * 
+ * @author Timm jansen
+ * Data Management and Data Exploration Group, RWTH Aachen University
+*/
+
+/*
+ * TODO:
+ * - try to avoid calcualting the radius multiple times
+ * - avoid the full distance map?
+ * - knn functionality in clusters
+ * - noise error
+ */
+public class CMM_GTAnalysis{
+       
+    /**
+     * the given ground truth clustering
+     */
+    private Clustering gtClustering;
+    
+    /**
+     * list of given points within the horizon  
+     */
+    private ArrayList<CMMPoint> cmmpoints;
+    
+    /**
+     * the newly calculate ground truth clustering
+     */
+    private ArrayList<GTCluster> gt0Clusters;
+
+    /**
+     * IDs of noise points
+     */
+    private ArrayList<Integer> noise;
+    
+    /**
+     * total number of points
+     */
+    private int numPoints;
+
+    /**
+     * number of clusters of the original ground truth 
+     */
+    private int numGTClusters;
+
+    /**
+     *    number of classes of the original ground truth, in case of a 
+     *    micro clustering ground truth this differs from numGTClusters
+     */
+    private int numGTClasses;
+
+    /**
+     * number of classes after we are done with the analysis 
+     */
+    private int numGT0Classes;
+
+    /**
+     * number of dimensions
+     */
+    private int numDims;
+
+    /**
+     * mapping between true cluster ID/class label of the original ground 
truth 
+     * and the internal cluster ID/working class label. 
+     * 
+     * different original cluster IDs might map to the same new cluster ID due 
to merging of two clusters 
+     */
+    private HashMap<Integer, Integer> mapTrueLabelToWorkLabel;
+
+    /**
+     * log of how clusters have been merged (for debugging)
+     */
+    private int[] mergeMap;
+
+    /**
+     * number of non-noise points that will create an error due to the 
underlying clustering model
+     * (e.g. point being covered by two clusters representing different 
classes)
+     */
+    private int noiseErrorByModel;
+
+    /**
+     * number of noise points that will create an error due to the underlying 
clustering model
+     * (e.g. noise point being covered by a cluster)
+     */
+    private int pointErrorByModel;    
+    
+    /**
+     * CMM debug mode
+     */
+    private boolean debug = false;
+
+    
+    /******* CMM parameter ***********/
+
+    /**
+     * defines how many nearest neighbors will be used
+     */
+    private int knnNeighbourhood = 2;
+
+    /**
+     * the threshold which defines when ground truth clusters will be merged.
+     * set to 1 to disable merging 
+     */
+    private double tauConnection = 0.5;
+    
+    /**
+     *  experimental (default: disabled)
+     *  separate k for points to cluster and cluster to cluster 
+     */
+    private double clusterConnectionMaxPoints = knnNeighbourhood;
+    
+    /** 
+     * experimental (default: disabled)
+     * use exponential connectivity function to model different behavior: 
+     * closer points will have a stronger connection compared to the linear 
function.
+     * Use ConnRefXValue and ConnX to better parameterize lambda, which 
controls 
+     * the decay of the connectivity
+     */
+    private boolean useExpConnectivity = false;
+    private double lambdaConnRefXValue = 0.01;
+    private double lambdaConnX = 4;
+    private double lamdaConn;
+    
+    
+    /******************************************/
+    
+    
+    /**
+     * Wrapper class for data points to store CMM relevant attributes
+     *
+     */
+    protected class CMMPoint extends DataPoint{
+        /**
+         * Reference to original point
+         */
+        protected DataPoint p = null;
+        
+        /**
+         * point ID
+         */
+        protected int pID = 0;
+        
+        
+        /**
+         * true class label
+         */
+        protected int trueClass = -1;
+
+        
+        /**
+         * the connectivity of the point to its cluster
+         */
+        protected double connectivity = 1.0;
+        
+        
+        /**
+         * knn distnace within own cluster
+         */
+        protected double knnInCluster = 0.0; 
+        
+        
+        /**
+         * knn indices (for debugging only)
+         */
+        protected ArrayList<Integer> knnIndices;
+
+        public CMMPoint(DataPoint point, int id) {
+            //make a copy, but keep reference
+            super(point,point.getTimestamp());
+            p = point;
+            pID = id;
+            trueClass = (int)point.classValue();
+        }
+
+        
+        /**
+         * Retruns the current working label of the cluster the point belongs 
to. 
+         * The label can change due to merging of clusters.  
+         * 
+         * @return the current working class label
+         */
+        protected int workclass(){
+            if(trueClass == -1 )
+                return -1;
+            else
+                return mapTrueLabelToWorkLabel.get(trueClass);
+        }
+    }
+
+    
+    
+    /**
+     * Main class to model the new clusters that will be the output of the 
cluster analysis
+     *
+     */
+    protected class GTCluster{
+       /** points that are per definition in the cluster */
+        private ArrayList<Integer> points = new ArrayList<Integer>();
+        
+        /** a new GT cluster consists of one or more "old" GT clusters. 
+         * Connected/overlapping clusters cannot be merged directly because of 
the 
+         * underlying cluster model. E.g. for merging two spherical clusters 
the new 
+         * cluster sphere can cover a lot more space then two separate smaller 
spheres. 
+         * To keep the original coverage we need to keep the orignal clusters 
and merge
+         * them on an abstract level. */
+        private ArrayList<Integer> clusterRepresentations = new 
ArrayList<Integer>();
+     
+        /** current work class (changes when merging) */
+        private int workclass;
+        
+        /** original work class */
+        private final int orgWorkClass;
+        
+        /** original class label*/
+        private final int label;
+        
+        /** clusters that have been merged into this cluster (debugging)*/
+        private ArrayList<Integer> mergedWorkLabels = null;
+        
+        /** average knn distance of all points in the cluster*/
+        private double knnMeanAvg = 0;
+        
+        /** average deviation of knn distance of all points*/
+        private double knnDevAvg = 0;
+        
+        /** connectivity of the cluster to all other clusters */
+        private ArrayList<Double> connections = new ArrayList<Double>();
+        
+
+        private GTCluster(int workclass, int label, int gtClusteringID) {
+           this.orgWorkClass = workclass;
+           this.workclass = workclass;
+           this.label = label;
+           this.clusterRepresentations.add(gtClusteringID);
+        }
+
+        
+        /**
+         * The original class label the cluster represents
+         * @return original class label
+         */
+        protected int getLabel(){
+            return label;
+        }
+
+        /**
+         * Calculate the probability of the point being covered through the 
cluster
+         * @param point to calculate the probability for
+         * @return probability of the point being covered through the cluster
+         */
+        protected double getInclusionProbability(CMMPoint point){
+            double prob = Double.MIN_VALUE;
+            //check all cluster representatives for coverage
+            for (int c = 0; c < clusterRepresentations.size(); c++) {
+               double tmp_prob = 
gtClustering.get(clusterRepresentations.get(c)).getInclusionProbability(point);
+               if(tmp_prob > prob) prob = tmp_prob;
+            }
+            return prob;
+        }
+
+        
+        /**
+         * calculate knn distances of points within own cluster 
+         * + average knn distance and average knn distance deviation of all 
points 
+         */
+        private void calculateKnn(){
+            for (int p0 : points) {
+                CMMPoint cmdp = cmmpoints.get(p0);
+                if(!cmdp.isNoise()){
+                    AutoExpandVector<Double> knnDist = new 
AutoExpandVector<Double>();
+                    AutoExpandVector<Integer> knnPointIndex = new 
AutoExpandVector<Integer>();
+                    
+                    //calculate nearest neighbours 
+                    getKnnInCluster(cmdp, knnNeighbourhood, points, 
knnDist,knnPointIndex);
+
+                    //TODO: What to do if we have less then k neighbours?
+                    double avgKnn = 0;
+                    for (int i = 0; i < knnDist.size(); i++) {
+                        avgKnn+= knnDist.get(i);
+                    }
+                    if(knnDist.size()!=0)
+                        avgKnn/=knnDist.size();
+                    cmdp.knnInCluster = avgKnn;
+                    cmdp.knnIndices = knnPointIndex;
+                    cmdp.p.setMeasureValue("knnAvg", cmdp.knnInCluster);
+
+                    knnMeanAvg+=avgKnn;
+                    knnDevAvg+=Math.pow(avgKnn,2);
+                }
+            }
+            knnMeanAvg=knnMeanAvg/(double)points.size();
+            knnDevAvg=knnDevAvg/(double)points.size();
+
+            double variance = knnDevAvg-Math.pow(knnMeanAvg,2.0);
+            // Due to numerical errors, small negative values can occur.
+            if (variance <= 0.0) variance = 1e-50;
+            knnDevAvg = Math.sqrt(variance);
+
+        }
+
+        
+        /**
+         * Calculate the connection of a cluster to this cluster
+         * @param otherCid cluster id of the other cluster
+         * @param initial flag for initial run
+         */
+        private void calculateClusterConnection(int otherCid, boolean initial){
+            double avgConnection = 0;
+            if(workclass==otherCid){
+                avgConnection = 1;
+            }
+            else{
+                AutoExpandVector<Double> kmax = new AutoExpandVector<Double>();
+                AutoExpandVector<Integer> kmaxIndexes = new 
AutoExpandVector<Integer>();
+
+                for(int p : points){
+                    CMMPoint cmdp = cmmpoints.get(p);
+                    double con_p_Cj = getConnectionValue(cmmpoints.get(p), 
otherCid);
+                    double connection = cmdp.connectivity * con_p_Cj;
+                    if(initial){
+                        cmdp.p.setMeasureValue("Connection to C"+otherCid, 
con_p_Cj);
+                    }
+
+                    //connection
+                    if(kmax.size() < clusterConnectionMaxPoints || connection 
> kmax.get(kmax.size()-1)){
+                        int index = 0;
+                        while(index < kmax.size() && connection < 
kmax.get(index)) {
+                            index++;
+                        }
+                        kmax.add(index, connection);
+                        kmaxIndexes.add(index, p);
+                        if(kmax.size() > clusterConnectionMaxPoints){
+                            kmax.remove(kmax.size()-1);
+                            kmaxIndexes.add(kmaxIndexes.size()-1);
+                        }
+                    }
+                }
+                //connection
+                for (int k = 0; k < kmax.size(); k++) {
+                    avgConnection+= kmax.get(k);
+                }
+                avgConnection/=kmax.size();
+            }
+
+            if(otherCid<connections.size()){
+                connections.set(otherCid, avgConnection);
+            }
+            else
+                if(connections.size() == otherCid){
+                    connections.add(avgConnection);
+                }
+                else
+                    System.out.println("Something is going really wrong with 
the connection listing!"+knnNeighbourhood+" "+tauConnection);
+        }
+
+        
+        /**
+         * Merge a cluster into this cluster
+         * @param mergeID the ID of the cluster to be merged
+         */
+        private void mergeCluster(int mergeID){
+            if(mergeID < gt0Clusters.size()){
+                //track merging (debugging)
+               for (int i = 0; i < numGTClasses; i++) {
+                    if(mergeMap[i]==mergeID)
+                        mergeMap[i]=workclass;
+                    if(mergeMap[i]>mergeID)
+                        mergeMap[i]--;
+                }
+                GTCluster gtcMerge  = gt0Clusters.get(mergeID);
+                if(debug)
+                    System.out.println("Merging C"+gtcMerge.workclass+" into 
C"+workclass+
+                            " with Con "+connections.get(mergeID)+" / 
"+gtcMerge.connections.get(workclass));
+
+
+                //update mapTrueLabelToWorkLabel
+                mapTrueLabelToWorkLabel.put(gtcMerge.label, workclass);
+                Iterator iterator = 
mapTrueLabelToWorkLabel.keySet().iterator();
+                while (iterator.hasNext()) {
+                    Integer key = (Integer)iterator.next();
+                    //update pointer of already merged cluster
+                    int value = mapTrueLabelToWorkLabel.get(key);
+                    if(value == mergeID)
+                        mapTrueLabelToWorkLabel.put(key, workclass);
+                    if(value > mergeID)
+                        mapTrueLabelToWorkLabel.put(key, value-1);
+                }
+
+                //merge points from B into A
+                points.addAll(gtcMerge.points);
+                clusterRepresentations.addAll(gtcMerge.clusterRepresentations);
+                if(mergedWorkLabels==null){
+                    mergedWorkLabels = new ArrayList<Integer>();
+                }
+                mergedWorkLabels.add(gtcMerge.orgWorkClass);
+                if(gtcMerge.mergedWorkLabels!=null)
+                    mergedWorkLabels.addAll(gtcMerge.mergedWorkLabels);
+
+                gt0Clusters.remove(mergeID);
+
+                //update workclass labels
+                for(int c=mergeID; c < gt0Clusters.size(); c++){
+                    gt0Clusters.get(c).workclass = c;
+                }
+
+                //update knn distances
+                calculateKnn();
+                for(int c=0; c < gt0Clusters.size(); c++){
+                    gt0Clusters.get(c).connections.remove(mergeID);
+                    
+                    //recalculate connection from other clusters to the new 
merged one
+                    
gt0Clusters.get(c).calculateClusterConnection(workclass,false);
+                    //and from new merged one to other clusters
+                    
gt0Clusters.get(workclass).calculateClusterConnection(c,false);
+                }
+            }
+            else{
+                System.out.println("Merge indices are not valid");
+            }
+        }
+    }
+
+    
+    /**
+     * @param trueClustering the ground truth clustering
+     * @param points data points
+     * @param enableClassMerge allow class merging (should be set to true on 
default)
+     */
+    public CMM_GTAnalysis(Clustering trueClustering, ArrayList<DataPoint> 
points, boolean enableClassMerge){
+        if(debug)
+            System.out.println("GT Analysis Debug Output");
+
+        noiseErrorByModel = 0;
+        pointErrorByModel = 0;
+        if(!enableClassMerge){
+               tauConnection = 1.0;
+        }
+
+        lamdaConn = -Math.log(lambdaConnRefXValue)/Math.log(2)/lambdaConnX;
+        
+        this.gtClustering = trueClustering;
+
+        numPoints = points.size();
+        numDims = points.get(0).numAttributes()-1;
+        numGTClusters = gtClustering.size();
+
+        //init mappings between work and true labels
+        mapTrueLabelToWorkLabel = new HashMap<Integer, Integer>();
+        
+        //set up base of new clustering
+        gt0Clusters = new ArrayList<GTCluster>();
+        int numWorkClasses = 0;
+        //create label to worklabel mapping as real labels can be just a set 
of unordered integers
+        for (int i = 0; i < numGTClusters; i++) {
+            int label = (int)gtClustering.get(i).getGroundTruth();
+            if(!mapTrueLabelToWorkLabel.containsKey(label)){
+                gt0Clusters.add(new GTCluster(numWorkClasses,label,i));
+                mapTrueLabelToWorkLabel.put(label,numWorkClasses);
+                numWorkClasses++;
+            }
+            else{
+                
gt0Clusters.get(mapTrueLabelToWorkLabel.get(label)).clusterRepresentations.add(i);
+            }
+        }
+        numGTClasses = numWorkClasses;
+
+        mergeMap = new int[numGTClasses];
+        for (int i = 0; i < numGTClasses; i++) {
+            mergeMap[i]=i;
+        }
+
+        //create cmd point wrapper instances
+        cmmpoints = new ArrayList<CMMPoint>();
+        for (int p = 0; p < points.size(); p++) {
+            CMMPoint cmdp = new CMMPoint(points.get(p), p);
+            cmmpoints.add(cmdp);
+        }
+
+
+        //split points up into their GTClusters and Noise (according to class 
labels)
+        noise = new ArrayList<Integer>();
+        for (int p = 0; p < numPoints; p++) {
+            if(cmmpoints.get(p).isNoise()){
+                noise.add(p);
+            }
+            else{
+                gt0Clusters.get(cmmpoints.get(p).workclass()).points.add(p);
+            }
+        }
+
+        //calculate initial knnMean and knnDev
+        for (GTCluster gtc : gt0Clusters) {
+            gtc.calculateKnn();
+        }
+
+        //calculate cluster connections
+        calculateGTClusterConnections();
+
+        //calculate point connections with own clusters
+        calculateGTPointQualities();
+
+        if(debug)
+            System.out.println("GT Analysis Debug End");
+
+   }
+
+    /**
+     * Calculate the connection of a point to a cluster
+     *  
+     * @param cmmp the point to calculate the connection for
+     * @param clusterID the corresponding cluster
+     * @return the connection value
+     */
+    //TODO: Cache the connection value for a point to the different clusters???
+    protected double getConnectionValue(CMMPoint cmmp, int clusterID){
+        AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>();
+        AutoExpandVector<Integer> knnPointIndex = new 
AutoExpandVector<Integer>();
+        
+        //calculate the knn distance of the point to the cluster
+        getKnnInCluster(cmmp, knnNeighbourhood, 
gt0Clusters.get(clusterID).points, knnDist, knnPointIndex);
+
+        //TODO: What to do if we have less then k neighbors?
+        double avgDist = 0;
+        for (int i = 0; i < knnDist.size(); i++) {
+            avgDist+= knnDist.get(i);
+        }
+        //what to do if we only have a single point???
+        if(knnDist.size()!=0)
+            avgDist/=knnDist.size();
+        else
+            return 0;
+
+        //get the upper knn distance of the cluster
+        double upperKnn = gt0Clusters.get(clusterID).knnMeanAvg + 
gt0Clusters.get(clusterID).knnDevAvg;
+        
+        /* calculate the connectivity based on knn distance of the point 
within the cluster
+           and the upper knn distance of the cluster*/ 
+        if(avgDist < upperKnn){
+            return 1;
+        }
+        else{
+            //value that should be reached at upperKnn distance
+            //Choose connection formula
+            double conn;
+            if(useExpConnectivity)
+                conn = Math.pow(2,-lamdaConn*(avgDist-upperKnn)/upperKnn);
+            else
+                conn = upperKnn/avgDist;
+
+            if(Double.isNaN(conn))
+                System.out.println("Connectivity NaN at 
"+cmmp.p.getTimestamp());
+
+            return conn;
+        }
+    }
+
+    
+    /**
+     * @param cmmp point to calculate knn distance for
+     * @param k number of nearest neighbors to look for
+     * @param pointIDs list of point IDs to check
+     * @param knnDist sorted list of smallest knn distances (can already be 
filled to make updates possible)  
+     * @param knnPointIndex list of corresponding knn indices
+     */
+    private void getKnnInCluster(CMMPoint cmmp, int k,
+                                 ArrayList<Integer> pointIDs,
+                                 AutoExpandVector<Double> knnDist,
+                                 AutoExpandVector<Integer> knnPointIndex) {
+
+        //iterate over every point in the choosen cluster, cal distance and 
insert into list
+        for (int p1 = 0; p1 < pointIDs.size(); p1++) {
+            int pid = pointIDs.get(p1);
+            if(cmmp.pID == pid) continue;
+            double dist = distance(cmmp,cmmpoints.get(pid));
+            if(knnDist.size() < k || dist < knnDist.get(knnDist.size()-1)){
+                int index = 0;
+                while(index < knnDist.size() && dist > knnDist.get(index)) {
+                    index++;
+                }
+                knnDist.add(index, dist);
+                knnPointIndex.add(index,pid);
+                if(knnDist.size() > k){
+                    knnDist.remove(knnDist.size()-1);
+                    knnPointIndex.remove(knnPointIndex.size()-1);
+                }
+            }
+        }
+    }
+
+
+    
+    /**
+     * calculate initial connectivities
+     */
+    private void calculateGTPointQualities(){
+        for (int p = 0; p < numPoints; p++) {
+            CMMPoint cmdp = cmmpoints.get(p);
+            if(!cmdp.isNoise()){
+                cmdp.connectivity = getConnectionValue(cmdp, cmdp.workclass());
+                cmdp.p.setMeasureValue("Connectivity", cmdp.connectivity);
+            }
+        }
+    }
+
+    
+    
+    /**
+     * Calculate connections between clusters and merge clusters accordingly 
as 
+     * long as connections exceed threshold 
+     */
+    private void calculateGTClusterConnections(){
+        for (int c0 = 0; c0 < gt0Clusters.size(); c0++) {
+            for (int c1 = 0; c1 < gt0Clusters.size(); c1++) {
+                    gt0Clusters.get(c0).calculateClusterConnection(c1, true);
+            }
+        }
+
+        boolean changedConnection = true;
+        while(changedConnection){
+            if(debug){
+                System.out.println("Cluster Connection");
+                for (int c = 0; c < gt0Clusters.size(); c++) {
+                    System.out.print("C"+gt0Clusters.get(c).label+" --> ");
+                    for (int c1 = 0; c1 < 
gt0Clusters.get(c).connections.size(); c1++) {
+                        System.out.print(" C"+gt0Clusters.get(c1).label+": 
"+gt0Clusters.get(c).connections.get(c1));
+                    }
+                    System.out.println("");
+                }
+                System.out.println("");
+            }
+
+            double max = 0;
+            int maxIndexI = -1;
+            int maxIndexJ = -1;
+
+            changedConnection = false;
+            for (int c0 = 0; c0 < gt0Clusters.size(); c0++) {
+                for (int c1 = c0+1; c1 < gt0Clusters.size(); c1++) {
+                    if(c0==c1) continue;
+                        double min 
=Math.min(gt0Clusters.get(c0).connections.get(c1), 
gt0Clusters.get(c1).connections.get(c0));
+                        if(min > max){
+                            max = min;
+                            maxIndexI = c0;
+                            maxIndexJ = c1;
+                        }
+                }
+            }
+            if(maxIndexI!=-1 && max > tauConnection){
+                gt0Clusters.get(maxIndexI).mergeCluster(maxIndexJ);
+                if(debug)
+                    System.out.println("Merging "+maxIndexI+" and 
"+maxIndexJ+" because of connection "+max);
+
+                changedConnection = true;
+            }
+        }
+        numGT0Classes = gt0Clusters.size();
+    }
+
+    
+    /** 
+     * Calculates how well the original clusters are separable. 
+     * Small values indicate bad separability, values close to 1 indicate good 
separability 
+     * @return index of seperability 
+     */
+    public double getClassSeparability(){
+//        int totalConn = numGTClasses*(numGTClasses-1)/2;
+//        int mergedConn = 0;
+//        for(GTCluster gt : gt0Clusters){
+//            int merged = gt.clusterRepresentations.size();
+//            if(merged > 1)
+//                mergedConn+=merged * (merged-1)/2;
+//        }
+//        if(totalConn == 0)
+//            return 0;
+//        else
+//            return 1-mergedConn/(double)totalConn;
+        return numGT0Classes/(double)numGTClasses;
+
+    }
+
+    
+    /**
+     * Calculates how well noise is separable from the given clusters
+     * Small values indicate bad separability, values close to 1 indicate good 
separability
+     * @return index of noise separability
+     */
+    public double getNoiseSeparability(){
+        if(noise.isEmpty()) 
+            return 1;
+
+        double connectivity = 0;
+        for(int p : noise){
+            CMMPoint npoint = cmmpoints.get(p);
+            double maxConnection = 0;
+
+            //TODO: some kind of pruning possible. what about weighting?
+            for (int c = 0; c < gt0Clusters.size(); c++) {
+                double connection = getConnectionValue(npoint, c);
+                if(connection > maxConnection)
+                    maxConnection = connection;
+            }
+            connectivity+=maxConnection;
+            npoint.p.setMeasureValue("MaxConnection", maxConnection);
+        }
+
+        return 1-(connectivity / noise.size());
+    }
+
+    
+    /** 
+     * Calculates the relative number of errors being caused by the underlying 
cluster model
+     *  @return quality of the model 
+     */
+    public double getModelQuality(){
+        for(int p = 0; p < numPoints; p++){
+            CMMPoint cmdp = cmmpoints.get(p);
+            for(int hc = 0; hc < numGTClusters;hc++){
+                if(gtClustering.get(hc).getGroundTruth() != cmdp.trueClass){
+                    if(gtClustering.get(hc).getInclusionProbability(cmdp) >= 
1){
+                        if(!cmdp.isNoise())
+                            pointErrorByModel++;
+                        else
+                            noiseErrorByModel++;
+                        break;
+                    }
+                }
+            }
+        }
+        if(debug)
+            System.out.println("Error by model: noise "+noiseErrorByModel+" 
point "+pointErrorByModel);
+
+        return 1-((pointErrorByModel + noiseErrorByModel)/(double) numPoints);
+    }
+
+    
+    /**
+     * Get CMM internal point
+     * @param index of the point
+     * @return cmm point
+     */
+    protected CMMPoint getPoint(int index){
+        return cmmpoints.get(index);
+    }
+
+    
+    /**
+     * Return cluster
+     * @param index of the cluster to return
+     * @return cluster
+     */
+    protected GTCluster getGT0Cluster(int index){
+        return gt0Clusters.get(index);
+    }
+
+    /**
+     * Number of classes/clusters of the new clustering
+     * @return number of new clusters
+     */
+    protected int getNumberOfGT0Classes() {
+        return numGT0Classes;
+    }
+    
+    /**
+     * Calculates Euclidian distance 
+     * @param inst1 point as double array
+     * @param inst2 point as double array
+     * @return euclidian distance
+     */
+    private double distance(Instance inst1, Instance inst2){
+          return distance(inst1, inst2.toDoubleArray());
+
+    }
+    
+    /**
+     * Calculates Euclidian distance 
+     * @param inst1 point as an instance
+     * @param inst2 point as double array
+     * @return euclidian distance
+     */
+    private double distance(Instance inst1, double[] inst2){
+        double distance = 0.0;
+        for (int i = 0; i < numDims; i++) {
+            double d = inst1.value(i) - inst2[i];
+            distance += d * d;
+        }
+        return Math.sqrt(distance);
+    }
+    
+    /**
+     * String with main CMM parameters
+     * @return main CMM parameter
+     */
+    public String getParameterString(){
+        String para = "";
+        para+="k="+knnNeighbourhood+";";
+        if(useExpConnectivity){
+               para+="lambdaConnX="+lambdaConnX+";";
+               para+="lambdaConn="+lamdaConn+";";
+               para+="lambdaConnRef="+lambdaConnRefXValue+";";
+        }
+        para+="m="+clusterConnectionMaxPoints+";";
+        para+="tauConn="+tauConnection+";";
+
+        return para;
+    }    
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java
 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java
new file mode 100644
index 0000000..0d311e4
--- /dev/null
+++ 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java
@@ -0,0 +1,174 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2010 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import java.util.ArrayList;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix;
+
+public class EntropyCollection extends MeasureCollection{
+
+    private static final Logger logger = 
LoggerFactory.getLogger(EntropyCollection.class);
+
+    @Override
+    protected String[] getNames() {
+        return new String[]{"GT cross entropy","FC cross 
entropy","Homogeneity","Completeness","V-Measure","VarInformation"};
+    }
+
+    @Override
+    protected boolean[] getDefaultEnabled() {
+        return new boolean[]{false, false, false, false, false, false};
+    }
+
+    @Override
+    public void evaluateClustering(Clustering fclustering, Clustering 
hClustering, ArrayList<DataPoint> points) throws Exception {
+
+        MembershipMatrix mm = new MembershipMatrix(fclustering, points);
+        int numClasses = mm.getNumClasses();
+        int numCluster = fclustering.size()+1;
+        int n = mm.getTotalEntries();
+
+
+        double FCentropy = 0;
+        if(numCluster > 1){
+            for (int fc = 0; fc < numCluster; fc++){
+                double weight = mm.getClusterSum(fc)/(double)n;
+                if(weight > 0)
+                    FCentropy+= weight * Math.log10(weight);
+            }
+            FCentropy/=(-1*Math.log10(numCluster));
+        }
+
+        logger.debug("FC entropy: {}", FCentropy);
+
+        double GTentropy = 0;
+        if(numClasses > 1){
+            for (int hc = 0; hc < numClasses; hc++){
+                double weight = mm.getClassSum(hc)/(double)n;
+                if(weight > 0)
+                    GTentropy+= weight * Math.log10(weight);
+            }
+            GTentropy/=(-1*Math.log10(numClasses));
+        }
+
+        logger.debug("GT entropy: {}", GTentropy);
+
+        //cluster based entropy
+        double FCcrossEntropy = 0;
+        
+        for (int fc = 0; fc < numCluster; fc++){
+            double e = 0;
+            int clusterWeight = mm.getClusterSum(fc);
+            if(clusterWeight>0){
+                for (int hc = 0; hc < numClasses; hc++) {
+                    double p = mm.getClusterClassWeight(fc, 
hc)/(double)clusterWeight;
+                    if(p!=0){
+                        e+=p * Math.log10(p);
+                    }
+                }
+                FCcrossEntropy+=((clusterWeight/(double)n) * e);
+            }
+        }
+        if(numCluster > 1){
+            FCcrossEntropy/=-1*Math.log10(numCluster);
+        }
+
+        addValue("FC cross entropy", 1-FCcrossEntropy);
+        logger.debug("FC cross entropy: {}", 1 - FCcrossEntropy);
+
+        //class based entropy
+        double GTcrossEntropy = 0;
+        for (int hc = 0; hc < numClasses; hc++){
+            double e = 0;
+            int classWeight = mm.getClassSum(hc);
+            if(classWeight>0){
+                for (int fc = 0; fc < numCluster; fc++) {
+                    double p = mm.getClusterClassWeight(fc, 
hc)/(double)classWeight;
+                    if(p!=0){
+                        e+=p * Math.log10(p);
+                    }
+                }
+            }
+            GTcrossEntropy+=((classWeight/(double)n) * e);
+        }
+        if(numClasses > 1)
+            GTcrossEntropy/=-1*Math.log10(numClasses);
+        addValue("GT cross entropy", 1-GTcrossEntropy);
+        logger.debug("GT cross entropy: {}", 1 - GTcrossEntropy);
+
+        double homogeneity;
+        if(FCentropy == 0)
+            homogeneity = 1;
+        else
+            homogeneity = 1 - FCcrossEntropy/FCentropy;
+
+        //TODO set err values for now, needs to be debugged
+        if(homogeneity > 1 || homogeneity < 0)
+            addValue("Homogeneity",-1);
+        else
+            addValue("Homogeneity",homogeneity);
+
+        double completeness;
+        if(GTentropy == 0)
+            completeness = 1;
+        else
+            completeness = 1 - GTcrossEntropy/GTentropy;
+        addValue("Completeness",completeness);
+
+        double beta = 1;
+        double vmeasure = (1+ beta)*homogeneity*completeness/(beta 
*homogeneity+completeness);
+
+        if(vmeasure > 1 || homogeneity < 0)
+            addValue("V-Measure",-1);
+        else
+            addValue("V-Measure",vmeasure);
+
+
+
+        double mutual = 0;
+        for (int i = 0; i < numCluster; i++){
+                for (int j = 0; j < numClasses; j++) {
+                   if(mm.getClusterClassWeight(i, j)==0) continue;
+                   double m = Math.log10(mm.getClusterClassWeight(i, 
j)/(double)mm.getClusterSum(i)/(double)mm.getClassSum(j)*(double)n);
+                   m*= mm.getClusterClassWeight(i, j)/(double)n;
+                    logger.debug("( {} / {}): ",m, m);
+                   mutual+=m;
+                }
+        }
+        if(numClasses > 1)
+            mutual/=Math.log10(numClasses);
+
+        double varInfo = 1;
+        if(FCentropy + GTentropy > 0)
+            varInfo = 2*mutual/(FCentropy + GTentropy);
+
+        logger.debug("mutual: {} / VI: {}", mutual, varInfo);
+        addValue("VarInformation", varInfo);
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java
new file mode 100644
index 0000000..6533f36
--- /dev/null
+++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java
@@ -0,0 +1,115 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2010 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import java.util.ArrayList;
+
+
+public class F1 extends MeasureCollection{
+
+    @Override
+    protected String[] getNames() {
+        return new String[]{"F1-P","F1-R","Purity"};
+    }
+
+    public void evaluateClustering(Clustering clustering, Clustering 
trueClustering, ArrayList<DataPoint> points) {
+
+        if (clustering.size()<0){
+            addValue(0,0);
+            addValue(1,0);
+            return;
+        }
+
+        MembershipMatrix mm = new MembershipMatrix(clustering, points);
+        //System.out.println(mm.toString());
+
+        int numClasses = mm.getNumClasses();
+        if(mm.hasNoiseClass())
+            numClasses--;
+
+
+
+        //F1 as defined in P3C, try using F1 optimization
+        double F1_P = 0.0;
+        double purity = 0;
+        int realClusters = 0;
+        for (int i = 0; i < clustering.size(); i++) {
+            int max_weight = 0;
+            int max_weight_index = -1;
+
+            //find max index
+            for (int j = 0; j < numClasses; j++) {
+                if(mm.getClusterClassWeight(i, j) > max_weight){
+                    max_weight = mm.getClusterClassWeight(i, j);
+                    max_weight_index = j;
+                }
+            }
+            if(max_weight_index!=-1){
+                realClusters++;
+                double precision = mm.getClusterClassWeight(i, 
max_weight_index)/(double)mm.getClusterSum(i);
+                double recall = mm.getClusterClassWeight(i, 
max_weight_index)/(double) mm.getClassSum(max_weight_index);
+                double f1 = 0;
+                if(precision > 0 || recall > 0){
+                    f1 = 2*precision*recall/(precision+recall);
+                }
+                F1_P += f1;
+                purity += precision;
+
+                //TODO should we move setMeasure stuff into the Cluster 
interface?
+                clustering.get(i).setMeasureValue("F1-P", Double.toString(f1));
+            }
+        }
+        if(realClusters > 0){
+            F1_P/=realClusters;
+            purity/=realClusters;
+        }
+        addValue("F1-P",F1_P);
+        addValue("Purity",purity);
+
+
+
+        //F1 as defined in .... mainly maximizes F1 for each class
+        double F1_R = 0.0;
+        for (int j = 0; j < numClasses; j++) {
+            double max_f1 = 0;
+            for (int i = 0; i < clustering.size(); i++) {
+                double precision = mm.getClusterClassWeight(i, 
j)/(double)mm.getClusterSum(i);
+                double recall = mm.getClusterClassWeight(i, 
j)/(double)mm.getClassSum(j);
+                double f1 = 0;
+                if(precision > 0 || recall > 0){
+                    f1 = 2*precision*recall/(precision+recall);
+                }
+                if(max_f1 < f1){
+                    max_f1 = f1;
+                }
+            }
+            F1_R+= max_f1;
+        }
+        F1_R/=numClasses;
+
+        addValue("F1-R",F1_R);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java
new file mode 100644
index 0000000..7f23c1b
--- /dev/null
+++ 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java
@@ -0,0 +1,191 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2010 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+
+import com.yahoo.labs.samoa.instances.Instance;
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.cluster.SphereCluster;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import java.util.ArrayList;
+
+
+public class General extends MeasureCollection{
+    private int numPoints;
+    private int numFClusters;
+    private int numDims;
+    private double pointInclusionProbThreshold = 0.8;
+    private Clustering clustering;
+    private ArrayList<DataPoint> points;
+
+
+    public General() {
+        super();
+    }
+
+
+    @Override
+    protected String[] getNames() {
+        //String[] names = 
{"GPrecision","GRecall","Redundancy","Overlap","numCluster","numClasses","Compactness"};
+        return new 
String[]{"GPrecision","GRecall","Redundancy","numCluster","numClasses"};
+    }
+
+//    @Override
+//    protected boolean[] getDefaultEnabled() {
+//        boolean [] defaults = {false, false, false, false, false ,false};
+//        return defaults;
+//    }
+
+    @Override
+    public void evaluateClustering(Clustering clustering, Clustering 
trueClustering, ArrayList<DataPoint> points) throws Exception{
+
+        this.points = points;
+        this.clustering = clustering;
+        numPoints = points.size();
+        numFClusters = clustering.size();
+        numDims = points.get(0).numAttributes()-1;
+
+
+        int totalRedundancy = 0;
+        int trueCoverage = 0;
+        int totalCoverage = 0;
+
+        int numNoise = 0;
+        for (int p = 0; p < numPoints; p++) {
+            int coverage = 0;
+            for (int c = 0; c < numFClusters; c++) {
+                //contained in cluster c?
+                if(clustering.get(c).getInclusionProbability(points.get(p)) >= 
pointInclusionProbThreshold){
+                    coverage++;
+                }
+            }
+
+            if(points.get(p).classValue()==-1){
+                numNoise++;
+            }
+            else{
+                if(coverage>0) trueCoverage++;
+            }
+
+            if(coverage>0) totalCoverage++;  //points covered by clustering 
(incl. noise)
+            if(coverage>1) totalRedundancy++; //include noise
+        }
+
+        addValue("numCluster", clustering.size());
+        addValue("numClasses", trueClustering.size());
+        addValue("Redundancy", ((double)totalRedundancy/(double)numPoints));
+        addValue("GPrecision", 
(totalCoverage==0?0:((double)trueCoverage/(double)(totalCoverage))));
+        addValue("GRecall", 
((double)trueCoverage/(double)(numPoints-numNoise)));
+//        if(isEnabled(3)){
+//            addValue("Compactness", computeCompactness());
+//        }
+//        if(isEnabled(3)){
+//            addValue("Overlap", computeOverlap());
+//        }
+    }
+
+    private double computeOverlap(){
+        for (int c = 0; c < numFClusters; c++) {
+            if(!(clustering.get(c) instanceof SphereCluster)){
+                System.out.println("Overlap only supports Sphere Cluster. 
Found: "+clustering.get(c).getClass());
+                return Double.NaN;
+            }
+        }
+
+        boolean[] overlap = new boolean[numFClusters];
+
+        for (int c0 = 0; c0 < numFClusters; c0++) {
+            if(overlap[c0]) continue;
+            SphereCluster s0 = (SphereCluster)clustering.get(c0);
+            for (int c1 = c0; c1 < clustering.size(); c1++) {
+                if(c1 == c0) continue;
+                SphereCluster s1 = (SphereCluster)clustering.get(c1);
+                if(s0.overlapRadiusDegree(s1) > 0){
+                    overlap[c0] = overlap[c1] = true;
+                }
+            }
+        }
+
+        double totalOverlap = 0;
+        for (int c0 = 0; c0 < numFClusters; c0++) {
+            if(overlap[c0])
+                totalOverlap++;
+        }
+
+//        if(totalOverlap/(double)numFClusters > .8) RunVisualizer.pause();
+        if(numFClusters>0) totalOverlap/=(double)numFClusters;
+        return totalOverlap;
+    }
+
+
+    private double computeCompactness(){
+        if(numFClusters == 0) return 0;
+        for (int c = 0; c < numFClusters; c++) {
+            if(!(clustering.get(c) instanceof SphereCluster)){
+                System.out.println("Compactness only supports Sphere Cluster. 
Found: "+clustering.get(c).getClass());
+                return Double.NaN;
+            }
+        }
+
+        //TODO weight radius by number of dimensions
+        double totalCompactness = 0;
+        for (int c = 0; c < numFClusters; c++) {
+            ArrayList<Instance> containedPoints = new ArrayList<Instance>();
+            for (int p = 0; p < numPoints; p++) {
+                //p in c
+                if(clustering.get(c).getInclusionProbability(points.get(p)) >= 
pointInclusionProbThreshold){
+                    containedPoints.add(points.get(p));
+                }
+            }
+            double compactness = 0;
+            if(containedPoints.size()>1){
+                //cluster not empty
+                SphereCluster minEnclosingCluster = new 
SphereCluster(containedPoints, numDims);
+                double minRadius = minEnclosingCluster.getRadius();
+                double cfRadius = 
((SphereCluster)clustering.get(c)).getRadius();
+                if(Math.abs(minRadius-cfRadius) < 0.1e-10){
+                    compactness = 1;
+                }
+                else
+                    if(minRadius < cfRadius)
+                        compactness = minRadius/cfRadius;
+                    else{
+                        System.out.println("Optimal radius bigger then real 
one ("+(cfRadius-minRadius)+"), this is really wrong");
+                        compactness = 1;
+                    }
+            }
+            else{
+                double cfRadius = 
((SphereCluster)clustering.get(c)).getRadius();
+                if(cfRadius==0) compactness = 1;
+            }
+
+            //weight by weight of cluster???
+            totalCompactness+=compactness;
+            clustering.get(c).setMeasureValue("Compactness", 
Double.toString(compactness));
+        }
+        return (totalCompactness/numFClusters);
+    }
+
+
+}
+
+

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java
new file mode 100644
index 0000000..4f57788
--- /dev/null
+++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java
@@ -0,0 +1,96 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2010 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+import java.util.ArrayList;
+
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import com.yahoo.labs.samoa.instances.Instance;
+
+public class SSQ extends MeasureCollection {
+
+    public SSQ() {
+        super();
+    }
+
+    @Override
+    public String[] getNames() {
+        return new String[]{"SSQ"};
+    }
+
+    @Override
+    protected boolean[] getDefaultEnabled() {
+        return new boolean[]{false};
+    }
+
+    // TODO Work on this later
+    //@Override
+    public void evaluateClusteringSamoa(Clustering clustering,
+            Clustering trueClsutering, ArrayList<Instance> points) {
+        double sum = 0.0;
+        for (Instance point : points) {
+            // don't include noise
+            if (point.classValue() == -1) {
+                continue;
+            }
+
+            double minDistance = Double.MAX_VALUE;
+            for (int c = 0; c < clustering.size(); c++) {
+                double distance = 0.0;
+                double[] center = clustering.get(c).getCenter();
+                for (int i = 0; i < center.length; i++) {
+                    double d = point.value(i) - center[i];
+                    distance += d * d;
+                }
+                minDistance = Math.min(distance, minDistance);
+            }
+
+            sum += minDistance;
+        }
+
+        addValue(0, sum);
+    }
+
+    @Override
+     public void evaluateClustering(Clustering clustering, Clustering 
trueClsutering, ArrayList<DataPoint> points) {
+        double sum = 0.0;
+        for (int p = 0; p < points.size(); p++) {
+            //don't include noise
+            if(points.get(p).classValue()==-1) continue;
+
+            double minDistance = Double.MAX_VALUE;
+            for (int c = 0; c < clustering.size(); c++) {
+                double distance = 0.0;
+                double[] center = clustering.get(c).getCenter();
+                for (int i = 0; i < center.length; i++) {
+                    double d = points.get(p).value(i) - center[i];
+                    distance += d * d;
+                }
+                minDistance = Math.min(distance, minDistance);
+            }
+            
+            sum+=minDistance;
+        }
+        
+        addValue(0,sum);
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java
 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java
new file mode 100644
index 0000000..25534a6
--- /dev/null
+++ 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java
@@ -0,0 +1,118 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2013 Yahoo! Inc.
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+import com.yahoo.labs.samoa.instances.DenseInstance;
+import com.yahoo.labs.samoa.instances.Instance;
+import com.yahoo.labs.samoa.moa.cluster.Cluster;
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.cluster.SphereCluster;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import java.util.ArrayList;
+import java.util.List;
+
+public class Separation extends MeasureCollection {
+
+    public Separation() {
+        super();
+    }
+
+    @Override
+    protected String[] getNames() {
+        return new String[]{"BSS", "BSS-GT", "BSS-Ratio"};
+    }
+
+    //@Override
+    public void evaluateClusteringSamoa(Clustering clustering,
+            Clustering trueClustering, ArrayList<Instance> points)
+            throws Exception {
+
+        double BSS_GT = 1.0;
+        double BSS;
+        int dimension = points.get(0).numAttributes() - 1;
+        SphereCluster sc = new SphereCluster(points, dimension);
+
+        // DO INTERNAL EVALUATION
+        //clustering.getClustering().get(0).getCenter();
+
+        BSS = getBSS(clustering, sc.getCenter());
+
+        if (trueClustering != null) {
+            List<Instance> listInstances = new ArrayList<>();
+            for (Cluster c : trueClustering.getClustering()) {
+                DenseInstance inst = new DenseInstance(c.getWeight(), 
c.getCenter());
+                listInstances.add(inst);
+            }
+            SphereCluster gt = new SphereCluster(listInstances, dimension);
+            BSS_GT = getBSS(trueClustering, gt.getCenter());
+        }
+
+        addValue("BSS", BSS);
+        addValue("BSS-GT", BSS_GT);
+        addValue("BSS-Ratio", BSS / BSS_GT);
+
+    }
+
+    private double getBSS(Clustering clustering, double[] mean) {
+        double bss = 0.0;
+        for (int i = 0; i < clustering.size(); i++) {
+            double weight = clustering.get(i).getWeight();
+            double sum = 0.0;
+            for (int j = 0; j < mean.length; j++) {
+                sum += Math.pow((mean[j] - clustering.get(i).getCenter()[j]), 
2);
+            }
+            bss += weight * sum;
+        }
+
+        return bss;
+    }
+
+    @Override
+    protected void evaluateClustering(Clustering clustering,
+            Clustering trueClustering, ArrayList<DataPoint> points)
+            throws Exception {
+        double BSS_GT = 1.0;
+        double BSS;
+        int dimension = points.get(0).numAttributes() - 1;
+        SphereCluster sc = new SphereCluster(points, dimension);
+
+        // DO INTERNAL EVALUATION
+        //clustering.getClustering().get(0).getCenter();
+
+        BSS = getBSS(clustering, sc.getCenter());
+
+        if (trueClustering != null) {
+            String s = "";
+            List<Instance> listInstances = new ArrayList<>();
+            for (Cluster c : trueClustering.getClustering()) {
+                DenseInstance inst = new DenseInstance(c.getWeight(), 
c.getCenter());
+                listInstances.add(inst);
+                s += " " + c.getWeight();
+            }
+            SphereCluster gt = new SphereCluster(listInstances, dimension);
+            BSS_GT = getBSS(trueClustering, gt.getCenter());
+        }
+
+        addValue("BSS", BSS);
+        addValue("BSS-GT", BSS_GT);
+        addValue("BSS-Ratio", BSS / BSS_GT);
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java
 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java
new file mode 100644
index 0000000..6dee336
--- /dev/null
+++ 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java
@@ -0,0 +1,137 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2010 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+
+import com.yahoo.labs.samoa.moa.cluster.Cluster;
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import java.util.ArrayList;
+
+
+public class SilhouetteCoefficient extends MeasureCollection{
+    private double pointInclusionProbThreshold = 0.8;
+
+    public SilhouetteCoefficient() {
+        super();
+    }
+
+    @Override
+    protected boolean[] getDefaultEnabled() {
+        return new boolean[]{false};
+    }
+
+    @Override
+    public String[] getNames() {
+        return new String[]{"SilhCoeff"};
+    }
+
+    public void evaluateClustering(Clustering clustering, Clustering 
trueClustering, ArrayList<DataPoint> points) {
+        int numFCluster = clustering.size();
+
+        double [][] pointInclusionProbFC = new 
double[points.size()][numFCluster];
+        for (int p = 0; p < points.size(); p++) {
+            DataPoint point = points.get(p);
+            for (int fc = 0; fc < numFCluster; fc++) {
+                Cluster cl = clustering.get(fc);
+                pointInclusionProbFC[p][fc] = 
cl.getInclusionProbability(point);
+            }
+        }
+
+        double silhCoeff = 0.0;
+        int totalCount = 0;
+        for (int p = 0; p < points.size(); p++) {
+            DataPoint point = points.get(p);
+            ArrayList<Integer> ownClusters = new ArrayList<>();
+            for (int fc = 0; fc < numFCluster; fc++) {
+                if(pointInclusionProbFC[p][fc] > pointInclusionProbThreshold){
+                    ownClusters.add(fc);
+                }
+            }
+
+            if(ownClusters.size() > 0){
+                double[] distanceByClusters = new double[numFCluster];
+                int[] countsByClusters = new int[numFCluster];
+                    //calculate averageDistance of p to all cluster
+                for (int p1 = 0; p1 < points.size(); p1++) {
+                    DataPoint point1 = points.get(p1);
+                    if(p1!= p && point1.classValue() != -1){
+                        for (int fc = 0; fc < numFCluster; fc++) {
+                            if(pointInclusionProbFC[p1][fc] > 
pointInclusionProbThreshold){
+                                double distance = distance(point, point1);
+                                distanceByClusters[fc]+=distance;
+                                countsByClusters[fc]++;
+                            }
+                        }
+                    }
+                }
+
+                //find closest OWN cluster as clusters might overlap
+                double minAvgDistanceOwn = Double.MAX_VALUE;
+                int minOwnIndex = -1;
+                for (int fc : ownClusters) {
+                        double normDist = 
distanceByClusters[fc]/(double)countsByClusters[fc];
+                        if(normDist < minAvgDistanceOwn){// && 
pointInclusionProbFC[p][fc] > pointInclusionProbThreshold){
+                            minAvgDistanceOwn = normDist;
+                            minOwnIndex = fc;
+                        }
+                }
+
+
+                //find closest other (or other own) cluster
+                double minAvgDistanceOther = Double.MAX_VALUE;
+                for (int fc = 0; fc < numFCluster; fc++) {
+                    if(fc != minOwnIndex){
+                        double normDist = 
distanceByClusters[fc]/(double)countsByClusters[fc];
+                        if(normDist < minAvgDistanceOther){
+                            minAvgDistanceOther = normDist;
+                        }
+                    }
+                }
+
+                double silhP = 
(minAvgDistanceOther-minAvgDistanceOwn)/Math.max(minAvgDistanceOther, 
minAvgDistanceOwn);
+                point.setMeasureValue("SC - own", minAvgDistanceOwn);
+                point.setMeasureValue("SC - other", minAvgDistanceOther);
+                point.setMeasureValue("SC", silhP);
+
+                silhCoeff+=silhP;
+                totalCount++;
+                //System.out.println(point.getTimestamp()+" Silh "+silhP+" / 
"+avgDistanceOwn+" "+minAvgDistanceOther+" (C"+minIndex+")");
+            }
+        }
+        if(totalCount>0)
+            silhCoeff/=(double)totalCount;
+        //normalize from -1, 1 to 0,1
+        silhCoeff = (silhCoeff+1)/2.0;
+        addValue(0,silhCoeff);
+    }
+
+    private double distance(DataPoint inst1, DataPoint inst2){
+        double distance = 0.0;
+        int numDims = inst1.numAttributes();
+        for (int i = 0; i < numDims; i++) {
+            double d = inst1.value(i) - inst2.value(i);
+            distance += d * d;
+        }
+        return Math.sqrt(distance);
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java
----------------------------------------------------------------------
diff --git 
a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java
 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java
new file mode 100644
index 0000000..6fc7adc
--- /dev/null
+++ 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java
@@ -0,0 +1,186 @@
+package com.yahoo.labs.samoa.evaluation.measures;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2010 RWTH Aachen University, Germany
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import com.yahoo.labs.samoa.moa.cluster.Clustering;
+import com.yahoo.labs.samoa.moa.core.DataPoint;
+import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection;
+import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix;
+
+public class StatisticalCollection extends MeasureCollection{
+    private boolean debug = false;
+
+    @Override
+    protected String[] getNames() {
+        //String[] names = {"van Dongen","Rand statistic", "C Index"};
+        return new String[]{"van Dongen","Rand statistic"};
+    }
+
+    @Override
+    protected boolean[] getDefaultEnabled() {
+        return new boolean[]{false, false};
+    }
+
+    @Override
+    public void evaluateClustering(Clustering clustering, Clustering 
trueClustering, ArrayList<DataPoint> points) throws Exception {
+
+
+       MembershipMatrix mm = new MembershipMatrix(clustering, points);
+        int numClasses = mm.getNumClasses();
+        int numCluster = clustering.size()+1;
+        int n = mm.getTotalEntries();
+
+        double dongenMaxFC = 0;
+        double dongenMaxSumFC = 0;
+        for (int i = 0; i < numCluster; i++){
+                double max = 0;
+                for (int j = 0; j < numClasses; j++) {
+                    if(mm.getClusterClassWeight(i, j)>max) max = 
mm.getClusterClassWeight(i, j);
+                }
+                dongenMaxFC+=max;
+                if(mm.getClusterSum(i)>dongenMaxSumFC) dongenMaxSumFC = 
mm.getClusterSum(i);
+        }
+
+        double dongenMaxHC = 0;
+        double dongenMaxSumHC = 0;
+        for (int j = 0; j < numClasses; j++) {
+                double max = 0;
+                for (int i = 0; i < numCluster; i++){
+                    if(mm.getClusterClassWeight(i, j)>max) max = 
mm.getClusterClassWeight(i, j);
+                }
+                dongenMaxHC+=max;
+                if(mm.getClassSum(j)>dongenMaxSumHC) dongenMaxSumHC = 
mm.getClassSum(j);
+        }
+
+        double dongen = (dongenMaxFC + dongenMaxHC)/(2*n);
+        //normalized dongen
+        //double dongen = 1-(2*n - dongenMaxFC - dongenMaxHC)/(2*n - 
dongenMaxSumFC - dongenMaxSumHC);
+        if(debug)
+            System.out.println("Dongen HC:"+dongenMaxHC+" FC:"+dongenMaxFC+" 
Total:"+dongen+" n "+n);
+
+        addValue("van Dongen", dongen);
+
+
+        //Rand index
+        //http://www.cais.ntu.edu.sg/~qihe/menu4.html
+        double m1 = 0;
+        for (int j = 0; j < numClasses; j++) {
+            double v = mm.getClassSum(j);
+            m1+= v*(v-1)/2.0;
+        }
+        double m2 = 0;
+        for (int i = 0; i < numCluster; i++){
+            double v = mm.getClusterSum(i);
+            m2+= v*(v-1)/2.0;
+        }
+
+        double m = 0;
+        for (int i = 0; i < numCluster; i++){
+            for (int j = 0; j < numClasses; j++) {
+                    double v = mm.getClusterClassWeight(i, j);
+                    m+= v*(v-1)/2.0;
+                }
+        }
+        double M = n*(n-1)/2.0;
+        double rand = (M - m1 - m2 +2*m)/M;
+        //normalized rand
+        //double rand = (m - m1*m2/M)/(m1/2.0 + m2/2.0 - m1*m2/M);
+
+        addValue("Rand statistic", rand);
+
+
+        //addValue("C Index",cindex(clustering, points));
+    }
+
+
+
+    public double cindex(Clustering clustering,  ArrayList<DataPoint> points){
+        int numClusters = clustering.size();
+        double withinClustersDistance = 0;
+        int numDistancesWithin = 0;
+        double numDistances = 0;
+
+        //double[] withinClusters = new double[numClusters];
+        double[] minWithinClusters = new double[numClusters];
+        double[] maxWithinClusters = new double[numClusters];
+        ArrayList<Integer>[] pointsInClusters = new ArrayList[numClusters];
+        for (int c = 0; c < numClusters; c++) {
+            pointsInClusters[c] = new ArrayList<>();
+            minWithinClusters[c] = Double.MAX_VALUE;
+            maxWithinClusters[c] = Double.MIN_VALUE;
+        }
+
+        for (int p = 0; p < points.size(); p++) {
+            for (int c = 0; c < clustering.size(); c++) {
+                if(clustering.get(c).getInclusionProbability(points.get(p)) > 
0.8){
+                    pointsInClusters[c].add(p);
+                    numDistances++;
+                }
+            }
+        }
+
+        //calc within cluster distances + min and max values
+        for (int c = 0; c < numClusters; c++) {
+            int numDistancesInC = 0;
+            ArrayList<Integer> pointsInC = pointsInClusters[c];
+            for (int p = 0; p < pointsInC.size(); p++) {
+                DataPoint point = points.get(pointsInC.get(p));
+                for (int p1 = p+1; p1 < pointsInC.size(); p1++) {
+                    numDistancesWithin++;
+                    numDistancesInC++;
+                    DataPoint point1 = points.get(pointsInC.get(p1));
+                    double dist = point.getDistance(point1);
+                    withinClustersDistance+=dist;
+                    if(minWithinClusters[c] > dist) minWithinClusters[c] = 
dist;
+                    if(maxWithinClusters[c] < dist) maxWithinClusters[c] = 
dist;
+                }
+            }
+        }
+
+        double minWithin = Double.MAX_VALUE;
+        double maxWithin = Double.MIN_VALUE;
+        for (int c = 0; c < numClusters; c++) {
+            if(minWithinClusters[c] < minWithin)
+               minWithin = minWithinClusters[c];
+            if(maxWithinClusters[c] > maxWithin)
+               maxWithin = maxWithinClusters[c];
+        }
+
+        double cindex = 0;
+        if(numDistancesWithin != 0){
+            double meanWithinClustersDistance = 
withinClustersDistance/numDistancesWithin;
+            cindex = (meanWithinClustersDistance - 
minWithin)/(maxWithin-minWithin);
+        }
+
+
+        if(debug){
+            System.out.println("Min:"+Arrays.toString(minWithinClusters));
+            System.out.println("Max:"+Arrays.toString(maxWithinClusters));
+            System.out.println("totalWithin:"+numDistancesWithin);
+        }
+        return cindex;
+    }
+
+
+}

Reply via email to