http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java new file mode 100644 index 0000000..1a41f6b --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM.java @@ -0,0 +1,509 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2011 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.evaluation.measures.CMM_GTAnalysis.CMMPoint; +import com.yahoo.labs.samoa.moa.cluster.Cluster; +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.cluster.SphereCluster; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import java.util.ArrayList; + +/** + * [CMM.java] + * + * CMM: Main class + * + * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 + * + * @author Timm jansen + * Data Management and Data Exploration Group, RWTH Aachen University +*/ + +public class CMM extends MeasureCollection{ + + private static final long serialVersionUID = 1L; + + /** + * found clustering + */ + private Clustering clustering; + + /** + * the ground truth analysis + */ + private CMM_GTAnalysis gtAnalysis; + + /** + * number of points within the horizon + */ + private int numPoints; + + /** + * number of clusters in the found clustering + */ + private int numFClusters; + + /** + * number of cluster in the adjusted groundtruth clustering that + * was calculated through the groundtruth analysis + */ + private int numGT0Classes; + + /** + * match found clusters to GT clusters + */ + private int matchMap[]; + + /** + * pointInclusionProbFC[p][C] contains the probability of point p + * being included in cluster C + */ + private double[][] pointInclusionProbFC; + + /** + * threshold that defines when a point is being considered belonging to a cluster + */ + private double pointInclusionProbThreshold = 0.5; + + /** + * parameterize the error weight of missed points (default 1) + */ + private double lamdaMissed = 1; + + + /** + * enable/disable debug mode + */ + public boolean debug = false; + + + /** + * enable/disable class merge (main feature of ground truth analysis) + */ + public boolean enableClassMerge = true; + + /** + * enable/disable model error + * when enabled errors that are caused by the underling cluster model will not be counted + */ + public boolean enableModelError = true; + + + @Override + protected String[] getNames() { + String[] names = {"CMM","CMM Basic","CMM Missed","CMM Misplaced","CMM Noise", + "CA Seperability", "CA Noise", "CA Modell"}; + return names; + } + + @Override + protected boolean[] getDefaultEnabled() { + boolean [] defaults = {false, false, false, false, false, false, false, false}; + return defaults; + } + + + @Override + public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception{ + this.clustering = clustering; + + numPoints = points.size(); + numFClusters = clustering.size(); + + gtAnalysis = new CMM_GTAnalysis(trueClustering, points, enableClassMerge); + + numGT0Classes = gtAnalysis.getNumberOfGT0Classes(); + + addValue("CA Seperability",gtAnalysis.getClassSeparability()); + addValue("CA Noise",gtAnalysis.getNoiseSeparability()); + addValue("CA Modell",gtAnalysis.getModelQuality()); + + /* init the matching and point distances */ + calculateMatching(); + + /* calculate the actual error */ + calculateError(); + } + + + /** + * calculates the CMM specific matching between found clusters and ground truth clusters + */ + private void calculateMatching(){ + + /** + * found cluster frequencies + */ + int[][] mapFC = new int[numFClusters][numGT0Classes]; + + /** + * ground truth cluster frequencies + */ + int[][] mapGT = new int[numGT0Classes][numGT0Classes]; + int [] sumsFC = new int[numFClusters]; + + //calculate fuzzy mapping from + pointInclusionProbFC = new double[numPoints][numFClusters]; + for (int p = 0; p < numPoints; p++) { + CMMPoint cmdp = gtAnalysis.getPoint(p); + //found cluster frequencies + for (int fc = 0; fc < numFClusters; fc++) { + Cluster cl = clustering.get(fc); + pointInclusionProbFC[p][fc] = cl.getInclusionProbability(cmdp); + if (pointInclusionProbFC[p][fc] >= pointInclusionProbThreshold) { + //make sure we don't count points twice that are contained in two merged clusters + if(cmdp.isNoise()) continue; + mapFC[fc][cmdp.workclass()]++; + sumsFC[fc]++; + } + } + + //ground truth cluster frequencies + if(!cmdp.isNoise()){ + for(int hc = 0; hc < numGT0Classes;hc++){ + if(hc == cmdp.workclass()){ + mapGT[hc][hc]++; + } + else{ + if(gtAnalysis.getGT0Cluster(hc).getInclusionProbability(cmdp) >= 1){ + mapGT[hc][cmdp.workclass()]++; + } + } + } + } + } + + //assign each found cluster to a hidden cluster + matchMap = new int[numFClusters]; + for (int fc = 0; fc < numFClusters; fc++) { + int matchIndex = -1; + //check if we only have one entry anyway + for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { + if(mapFC[fc][hc0]!=0){ + if(matchIndex == -1) + matchIndex = hc0; + else{ + matchIndex = -1; + break; + } + } + } + + //more then one entry, so look for most similar frequency profile + int minDiff = Integer.MAX_VALUE; + if(sumsFC[fc]!=0 && matchIndex == -1){ + ArrayList<Integer> fitCandidates = new ArrayList<Integer>(); + for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { + int errDiff = 0; + for (int hc1 = 0; hc1 < numGT0Classes; hc1++) { + //fc profile doesn't fit into current hc profile + double freq_diff = mapFC[fc][hc1] - mapGT[hc0][hc1]; + if(freq_diff > 0){ + errDiff+= freq_diff; + } + } + if(errDiff == 0){ + fitCandidates.add(hc0); + } + if(errDiff < minDiff){ + minDiff = errDiff; + matchIndex = hc0; + } + if(debug){ + //System.out.println("FC"+fc+"("+Arrays.toString(mapFC[fc])+") - HC0_"+hc0+"("+Arrays.toString(mapGT[hc0])+"):"+errDiff); + } + } + //if we have a fitting profile overwrite the min error choice + //if we have multiple fit candidates, use majority vote of corresponding classes + if(fitCandidates.size()!=0){ + int bestGTfit = fitCandidates.get(0); + for(int i = 1; i < fitCandidates.size(); i++){ + int GTfit = fitCandidates.get(i); + if(mapFC[fc][GTfit] > mapFC[fc][bestGTfit]) + bestGTfit=fitCandidates.get(i); + } + matchIndex = bestGTfit; + } + } + + matchMap[fc] = matchIndex; + int realMatch = -1; + if(matchIndex==-1){ + if(debug) + System.out.println("No cluster match: needs to be implemented?"); + } + else{ + realMatch = gtAnalysis.getGT0Cluster(matchMap[fc]).getLabel(); + } + clustering.get(fc).setMeasureValue("CMM Match", "C"+realMatch); + clustering.get(fc).setMeasureValue("CMM Workclass", "C"+matchMap[fc]); + } + + //print matching table + if(debug){ + for (int i = 0; i < numFClusters; i++) { + System.out.print("C"+((int)clustering.get(i).getId()) + " N:"+((int)clustering.get(i).getWeight())+" | "); + for (int j = 0; j < numGT0Classes; j++) { + System.out.print(mapFC[i][j] + " "); + } + System.out.print(" = "+sumsFC[i] + " | "); + String match = "-"; + if (matchMap[i]!=-1) { + match = Integer.toString(gtAnalysis.getGT0Cluster(matchMap[i]).getLabel()); + } + System.out.println(" --> " + match + "(work:"+matchMap[i]+")"); + } + } + } + + + /** + * Calculate the actual error values + */ + private void calculateError(){ + int totalErrorCount = 0; + int totalRedundancy = 0; + int trueCoverage = 0; + int totalCoverage = 0; + + int numNoise = 0; + double errorNoise = 0; + double errorNoiseMax = 0; + + double errorMissed = 0; + double errorMissedMax = 0; + + double errorMisplaced = 0; + double errorMisplacedMax = 0; + + double totalError = 0.0; + double totalErrorMax = 0.0; + + /** mainly iterate over all points and find the right error value for the point. + * within the same run calculate various other stuff like coverage etc... + */ + for (int p = 0; p < numPoints; p++) { + CMMPoint cmdp = gtAnalysis.getPoint(p); + double weight = cmdp.weight(); + //noise counter + if(cmdp.isNoise()){ + numNoise++; + //this is always 1 + errorNoiseMax+=cmdp.connectivity*weight; + } + else{ + errorMissedMax+=cmdp.connectivity*weight; + errorMisplacedMax+=cmdp.connectivity*weight; + } + //sum up maxError as the individual errors are the quality weighted between 0-1 + totalErrorMax+=cmdp.connectivity*weight; + + + double err = 0; + int coverage = 0; + + //check every FCluster + for (int c = 0; c < numFClusters; c++) { + //contained in cluster c? + if(pointInclusionProbFC[p][c] >= pointInclusionProbThreshold){ + coverage++; + + if(!cmdp.isNoise()){ + //PLACED CORRECTLY + if(matchMap[c] == cmdp.workclass()){ + } + //MISPLACED + else{ + double errvalue = misplacedError(cmdp, c); + if(errvalue > err) + err = errvalue; + } + } + else{ + //NOISE + double errvalue = noiseError(cmdp, c); + if(errvalue > err) err = errvalue; + } + } + } + //not in any cluster + if(coverage == 0){ + //MISSED + if(!cmdp.isNoise()){ + err = missedError(cmdp,true); + errorMissed+= weight*err; + } + //NOISE + else{ + } + } + else{ + if(!cmdp.isNoise()){ + errorMisplaced+= err*weight; + } + else{ + errorNoise+= err*weight; + } + } + + /* processing of other evaluation values */ + totalError+= err*weight; + if(err!=0)totalErrorCount++; + if(coverage>0) totalCoverage++; //points covered by clustering (incl. noise) + if(coverage>0 && !cmdp.isNoise()) trueCoverage++; //points covered by clustering, don't count noise + if(coverage>1) totalRedundancy++; //include noise + + cmdp.p.setMeasureValue("CMM",err); + cmdp.p.setMeasureValue("Redundancy", coverage); + } + + addValue("CMM", (totalErrorMax!=0)?1-totalError/totalErrorMax:1); + addValue("CMM Missed", (errorMissedMax!=0)?1-errorMissed/errorMissedMax:1); + addValue("CMM Misplaced", (errorMisplacedMax!=0)?1-errorMisplaced/errorMisplacedMax:1); + addValue("CMM Noise", (errorNoiseMax!=0)?1-errorNoise/errorNoiseMax:1); + addValue("CMM Basic", 1-((double)totalErrorCount/(double)numPoints)); + + if(debug){ + System.out.println("-------------"); + } + } + + + private double noiseError(CMMPoint cmdp, int assignedClusterID){ + int gtAssignedID = matchMap[assignedClusterID]; + double error; + + //Cluster wasn't matched, so just contains noise + //TODO: Noiscluster? + //also happens when we decrease the radius and there is only a noise point in the center + if(gtAssignedID==-1){ + error = 1; + cmdp.p.setMeasureValue("CMM Type","noise - cluster"); + } + else{ + if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){ + //set to MIN_ERROR so we can still track the error + error = 0.00001; + cmdp.p.setMeasureValue("CMM Type","noise - byModel"); + } + else{ + error = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); + cmdp.p.setMeasureValue("CMM Type","noise"); + } + } + + return error; + } + + private double missedError(CMMPoint cmdp, boolean useHullDistance){ + cmdp.p.setMeasureValue("CMM Type","missed"); + if(!useHullDistance){ + return cmdp.connectivity; + } + else{ + //main idea: look at relative distance of missed point to cluster + double minHullDist = 1; + for (int fc = 0; fc < numFClusters; fc++){ + //if fc is mappend onto the class of the point, check it for its hulldist + if(matchMap[fc]!=-1 && matchMap[fc] == cmdp.workclass()){ + if(clustering.get(fc) instanceof SphereCluster){ + SphereCluster sc = (SphereCluster)clustering.get(fc); + double distanceFC = sc.getCenterDistance(cmdp); + double radius = sc.getRadius(); + double hullDist = (distanceFC-radius)/(distanceFC+radius); + if(hullDist < minHullDist) + minHullDist = hullDist; + } + else{ + double min = 1; + double max = 1; + + //TODO: distance for random shape + //generate X points from the cluster with clustering.get(fc).sample(null) + //and find Min and Max values + + double hullDist = min/max; + if(hullDist < minHullDist) + minHullDist = hullDist; + } + } + } + + //use distance as weight + if(minHullDist>1) minHullDist = 1; + + double weight = (1-Math.exp(-lamdaMissed*minHullDist)); + cmdp.p.setMeasureValue("HullDistWeight",weight); + + return weight*cmdp.connectivity; + } + } + + + private double misplacedError(CMMPoint cmdp, int assignedClusterID){ + double weight = 0; + + int gtAssignedID = matchMap[assignedClusterID]; + //TODO take care of noise cluster? + if(gtAssignedID ==-1){ + System.out.println("Point "+cmdp.getTimestamp()+" from gtcluster "+cmdp.trueClass+" assigned to noise cluster "+assignedClusterID); + return 1; + } + + if(gtAssignedID == cmdp.workclass()) + return 0; + else{ + //assigned and real GT0 cluster are not connected, but does the model have the + //chance of separating this point after all? + if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){ + weight = 0; + cmdp.p.setMeasureValue("CMM Type","missplaced - byModel"); + } + else{ + //point was mapped onto wrong cluster (assigned), so check how far away + //the nearest point is within the wrongly assigned cluster + weight = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); + } + } + double err_value; + //set to MIN_ERROR so we can still track the error + if(weight == 0){ + err_value= 0.00001; + } + else{ + err_value = weight*cmdp.connectivity; + cmdp.p.setMeasureValue("CMM Type","missplaced"); + } + + return err_value; + } + + public String getParameterString(){ + String para = gtAnalysis.getParameterString(); + para+="lambdaMissed="+lamdaMissed+";"; + return para; + } + +} + +
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java new file mode 100644 index 0000000..53fb4dc --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java @@ -0,0 +1,843 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2011 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.instances.Instance; +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.core.AutoExpandVector; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; + +/** + * [CMM_GTAnalysis.java] + * + * CMM: Ground truth analysis + * + * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 + * + * @author Timm jansen + * Data Management and Data Exploration Group, RWTH Aachen University +*/ + +/* + * TODO: + * - try to avoid calcualting the radius multiple times + * - avoid the full distance map? + * - knn functionality in clusters + * - noise error + */ +public class CMM_GTAnalysis{ + + /** + * the given ground truth clustering + */ + private Clustering gtClustering; + + /** + * list of given points within the horizon + */ + private ArrayList<CMMPoint> cmmpoints; + + /** + * the newly calculate ground truth clustering + */ + private ArrayList<GTCluster> gt0Clusters; + + /** + * IDs of noise points + */ + private ArrayList<Integer> noise; + + /** + * total number of points + */ + private int numPoints; + + /** + * number of clusters of the original ground truth + */ + private int numGTClusters; + + /** + * number of classes of the original ground truth, in case of a + * micro clustering ground truth this differs from numGTClusters + */ + private int numGTClasses; + + /** + * number of classes after we are done with the analysis + */ + private int numGT0Classes; + + /** + * number of dimensions + */ + private int numDims; + + /** + * mapping between true cluster ID/class label of the original ground truth + * and the internal cluster ID/working class label. + * + * different original cluster IDs might map to the same new cluster ID due to merging of two clusters + */ + private HashMap<Integer, Integer> mapTrueLabelToWorkLabel; + + /** + * log of how clusters have been merged (for debugging) + */ + private int[] mergeMap; + + /** + * number of non-noise points that will create an error due to the underlying clustering model + * (e.g. point being covered by two clusters representing different classes) + */ + private int noiseErrorByModel; + + /** + * number of noise points that will create an error due to the underlying clustering model + * (e.g. noise point being covered by a cluster) + */ + private int pointErrorByModel; + + /** + * CMM debug mode + */ + private boolean debug = false; + + + /******* CMM parameter ***********/ + + /** + * defines how many nearest neighbors will be used + */ + private int knnNeighbourhood = 2; + + /** + * the threshold which defines when ground truth clusters will be merged. + * set to 1 to disable merging + */ + private double tauConnection = 0.5; + + /** + * experimental (default: disabled) + * separate k for points to cluster and cluster to cluster + */ + private double clusterConnectionMaxPoints = knnNeighbourhood; + + /** + * experimental (default: disabled) + * use exponential connectivity function to model different behavior: + * closer points will have a stronger connection compared to the linear function. + * Use ConnRefXValue and ConnX to better parameterize lambda, which controls + * the decay of the connectivity + */ + private boolean useExpConnectivity = false; + private double lambdaConnRefXValue = 0.01; + private double lambdaConnX = 4; + private double lamdaConn; + + + /******************************************/ + + + /** + * Wrapper class for data points to store CMM relevant attributes + * + */ + protected class CMMPoint extends DataPoint{ + /** + * Reference to original point + */ + protected DataPoint p = null; + + /** + * point ID + */ + protected int pID = 0; + + + /** + * true class label + */ + protected int trueClass = -1; + + + /** + * the connectivity of the point to its cluster + */ + protected double connectivity = 1.0; + + + /** + * knn distnace within own cluster + */ + protected double knnInCluster = 0.0; + + + /** + * knn indices (for debugging only) + */ + protected ArrayList<Integer> knnIndices; + + public CMMPoint(DataPoint point, int id) { + //make a copy, but keep reference + super(point,point.getTimestamp()); + p = point; + pID = id; + trueClass = (int)point.classValue(); + } + + + /** + * Retruns the current working label of the cluster the point belongs to. + * The label can change due to merging of clusters. + * + * @return the current working class label + */ + protected int workclass(){ + if(trueClass == -1 ) + return -1; + else + return mapTrueLabelToWorkLabel.get(trueClass); + } + } + + + + /** + * Main class to model the new clusters that will be the output of the cluster analysis + * + */ + protected class GTCluster{ + /** points that are per definition in the cluster */ + private ArrayList<Integer> points = new ArrayList<Integer>(); + + /** a new GT cluster consists of one or more "old" GT clusters. + * Connected/overlapping clusters cannot be merged directly because of the + * underlying cluster model. E.g. for merging two spherical clusters the new + * cluster sphere can cover a lot more space then two separate smaller spheres. + * To keep the original coverage we need to keep the orignal clusters and merge + * them on an abstract level. */ + private ArrayList<Integer> clusterRepresentations = new ArrayList<Integer>(); + + /** current work class (changes when merging) */ + private int workclass; + + /** original work class */ + private final int orgWorkClass; + + /** original class label*/ + private final int label; + + /** clusters that have been merged into this cluster (debugging)*/ + private ArrayList<Integer> mergedWorkLabels = null; + + /** average knn distance of all points in the cluster*/ + private double knnMeanAvg = 0; + + /** average deviation of knn distance of all points*/ + private double knnDevAvg = 0; + + /** connectivity of the cluster to all other clusters */ + private ArrayList<Double> connections = new ArrayList<Double>(); + + + private GTCluster(int workclass, int label, int gtClusteringID) { + this.orgWorkClass = workclass; + this.workclass = workclass; + this.label = label; + this.clusterRepresentations.add(gtClusteringID); + } + + + /** + * The original class label the cluster represents + * @return original class label + */ + protected int getLabel(){ + return label; + } + + /** + * Calculate the probability of the point being covered through the cluster + * @param point to calculate the probability for + * @return probability of the point being covered through the cluster + */ + protected double getInclusionProbability(CMMPoint point){ + double prob = Double.MIN_VALUE; + //check all cluster representatives for coverage + for (int c = 0; c < clusterRepresentations.size(); c++) { + double tmp_prob = gtClustering.get(clusterRepresentations.get(c)).getInclusionProbability(point); + if(tmp_prob > prob) prob = tmp_prob; + } + return prob; + } + + + /** + * calculate knn distances of points within own cluster + * + average knn distance and average knn distance deviation of all points + */ + private void calculateKnn(){ + for (int p0 : points) { + CMMPoint cmdp = cmmpoints.get(p0); + if(!cmdp.isNoise()){ + AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); + AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); + + //calculate nearest neighbours + getKnnInCluster(cmdp, knnNeighbourhood, points, knnDist,knnPointIndex); + + //TODO: What to do if we have less then k neighbours? + double avgKnn = 0; + for (int i = 0; i < knnDist.size(); i++) { + avgKnn+= knnDist.get(i); + } + if(knnDist.size()!=0) + avgKnn/=knnDist.size(); + cmdp.knnInCluster = avgKnn; + cmdp.knnIndices = knnPointIndex; + cmdp.p.setMeasureValue("knnAvg", cmdp.knnInCluster); + + knnMeanAvg+=avgKnn; + knnDevAvg+=Math.pow(avgKnn,2); + } + } + knnMeanAvg=knnMeanAvg/(double)points.size(); + knnDevAvg=knnDevAvg/(double)points.size(); + + double variance = knnDevAvg-Math.pow(knnMeanAvg,2.0); + // Due to numerical errors, small negative values can occur. + if (variance <= 0.0) variance = 1e-50; + knnDevAvg = Math.sqrt(variance); + + } + + + /** + * Calculate the connection of a cluster to this cluster + * @param otherCid cluster id of the other cluster + * @param initial flag for initial run + */ + private void calculateClusterConnection(int otherCid, boolean initial){ + double avgConnection = 0; + if(workclass==otherCid){ + avgConnection = 1; + } + else{ + AutoExpandVector<Double> kmax = new AutoExpandVector<Double>(); + AutoExpandVector<Integer> kmaxIndexes = new AutoExpandVector<Integer>(); + + for(int p : points){ + CMMPoint cmdp = cmmpoints.get(p); + double con_p_Cj = getConnectionValue(cmmpoints.get(p), otherCid); + double connection = cmdp.connectivity * con_p_Cj; + if(initial){ + cmdp.p.setMeasureValue("Connection to C"+otherCid, con_p_Cj); + } + + //connection + if(kmax.size() < clusterConnectionMaxPoints || connection > kmax.get(kmax.size()-1)){ + int index = 0; + while(index < kmax.size() && connection < kmax.get(index)) { + index++; + } + kmax.add(index, connection); + kmaxIndexes.add(index, p); + if(kmax.size() > clusterConnectionMaxPoints){ + kmax.remove(kmax.size()-1); + kmaxIndexes.add(kmaxIndexes.size()-1); + } + } + } + //connection + for (int k = 0; k < kmax.size(); k++) { + avgConnection+= kmax.get(k); + } + avgConnection/=kmax.size(); + } + + if(otherCid<connections.size()){ + connections.set(otherCid, avgConnection); + } + else + if(connections.size() == otherCid){ + connections.add(avgConnection); + } + else + System.out.println("Something is going really wrong with the connection listing!"+knnNeighbourhood+" "+tauConnection); + } + + + /** + * Merge a cluster into this cluster + * @param mergeID the ID of the cluster to be merged + */ + private void mergeCluster(int mergeID){ + if(mergeID < gt0Clusters.size()){ + //track merging (debugging) + for (int i = 0; i < numGTClasses; i++) { + if(mergeMap[i]==mergeID) + mergeMap[i]=workclass; + if(mergeMap[i]>mergeID) + mergeMap[i]--; + } + GTCluster gtcMerge = gt0Clusters.get(mergeID); + if(debug) + System.out.println("Merging C"+gtcMerge.workclass+" into C"+workclass+ + " with Con "+connections.get(mergeID)+" / "+gtcMerge.connections.get(workclass)); + + + //update mapTrueLabelToWorkLabel + mapTrueLabelToWorkLabel.put(gtcMerge.label, workclass); + Iterator iterator = mapTrueLabelToWorkLabel.keySet().iterator(); + while (iterator.hasNext()) { + Integer key = (Integer)iterator.next(); + //update pointer of already merged cluster + int value = mapTrueLabelToWorkLabel.get(key); + if(value == mergeID) + mapTrueLabelToWorkLabel.put(key, workclass); + if(value > mergeID) + mapTrueLabelToWorkLabel.put(key, value-1); + } + + //merge points from B into A + points.addAll(gtcMerge.points); + clusterRepresentations.addAll(gtcMerge.clusterRepresentations); + if(mergedWorkLabels==null){ + mergedWorkLabels = new ArrayList<Integer>(); + } + mergedWorkLabels.add(gtcMerge.orgWorkClass); + if(gtcMerge.mergedWorkLabels!=null) + mergedWorkLabels.addAll(gtcMerge.mergedWorkLabels); + + gt0Clusters.remove(mergeID); + + //update workclass labels + for(int c=mergeID; c < gt0Clusters.size(); c++){ + gt0Clusters.get(c).workclass = c; + } + + //update knn distances + calculateKnn(); + for(int c=0; c < gt0Clusters.size(); c++){ + gt0Clusters.get(c).connections.remove(mergeID); + + //recalculate connection from other clusters to the new merged one + gt0Clusters.get(c).calculateClusterConnection(workclass,false); + //and from new merged one to other clusters + gt0Clusters.get(workclass).calculateClusterConnection(c,false); + } + } + else{ + System.out.println("Merge indices are not valid"); + } + } + } + + + /** + * @param trueClustering the ground truth clustering + * @param points data points + * @param enableClassMerge allow class merging (should be set to true on default) + */ + public CMM_GTAnalysis(Clustering trueClustering, ArrayList<DataPoint> points, boolean enableClassMerge){ + if(debug) + System.out.println("GT Analysis Debug Output"); + + noiseErrorByModel = 0; + pointErrorByModel = 0; + if(!enableClassMerge){ + tauConnection = 1.0; + } + + lamdaConn = -Math.log(lambdaConnRefXValue)/Math.log(2)/lambdaConnX; + + this.gtClustering = trueClustering; + + numPoints = points.size(); + numDims = points.get(0).numAttributes()-1; + numGTClusters = gtClustering.size(); + + //init mappings between work and true labels + mapTrueLabelToWorkLabel = new HashMap<Integer, Integer>(); + + //set up base of new clustering + gt0Clusters = new ArrayList<GTCluster>(); + int numWorkClasses = 0; + //create label to worklabel mapping as real labels can be just a set of unordered integers + for (int i = 0; i < numGTClusters; i++) { + int label = (int)gtClustering.get(i).getGroundTruth(); + if(!mapTrueLabelToWorkLabel.containsKey(label)){ + gt0Clusters.add(new GTCluster(numWorkClasses,label,i)); + mapTrueLabelToWorkLabel.put(label,numWorkClasses); + numWorkClasses++; + } + else{ + gt0Clusters.get(mapTrueLabelToWorkLabel.get(label)).clusterRepresentations.add(i); + } + } + numGTClasses = numWorkClasses; + + mergeMap = new int[numGTClasses]; + for (int i = 0; i < numGTClasses; i++) { + mergeMap[i]=i; + } + + //create cmd point wrapper instances + cmmpoints = new ArrayList<CMMPoint>(); + for (int p = 0; p < points.size(); p++) { + CMMPoint cmdp = new CMMPoint(points.get(p), p); + cmmpoints.add(cmdp); + } + + + //split points up into their GTClusters and Noise (according to class labels) + noise = new ArrayList<Integer>(); + for (int p = 0; p < numPoints; p++) { + if(cmmpoints.get(p).isNoise()){ + noise.add(p); + } + else{ + gt0Clusters.get(cmmpoints.get(p).workclass()).points.add(p); + } + } + + //calculate initial knnMean and knnDev + for (GTCluster gtc : gt0Clusters) { + gtc.calculateKnn(); + } + + //calculate cluster connections + calculateGTClusterConnections(); + + //calculate point connections with own clusters + calculateGTPointQualities(); + + if(debug) + System.out.println("GT Analysis Debug End"); + + } + + /** + * Calculate the connection of a point to a cluster + * + * @param cmmp the point to calculate the connection for + * @param clusterID the corresponding cluster + * @return the connection value + */ + //TODO: Cache the connection value for a point to the different clusters??? + protected double getConnectionValue(CMMPoint cmmp, int clusterID){ + AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); + AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); + + //calculate the knn distance of the point to the cluster + getKnnInCluster(cmmp, knnNeighbourhood, gt0Clusters.get(clusterID).points, knnDist, knnPointIndex); + + //TODO: What to do if we have less then k neighbors? + double avgDist = 0; + for (int i = 0; i < knnDist.size(); i++) { + avgDist+= knnDist.get(i); + } + //what to do if we only have a single point??? + if(knnDist.size()!=0) + avgDist/=knnDist.size(); + else + return 0; + + //get the upper knn distance of the cluster + double upperKnn = gt0Clusters.get(clusterID).knnMeanAvg + gt0Clusters.get(clusterID).knnDevAvg; + + /* calculate the connectivity based on knn distance of the point within the cluster + and the upper knn distance of the cluster*/ + if(avgDist < upperKnn){ + return 1; + } + else{ + //value that should be reached at upperKnn distance + //Choose connection formula + double conn; + if(useExpConnectivity) + conn = Math.pow(2,-lamdaConn*(avgDist-upperKnn)/upperKnn); + else + conn = upperKnn/avgDist; + + if(Double.isNaN(conn)) + System.out.println("Connectivity NaN at "+cmmp.p.getTimestamp()); + + return conn; + } + } + + + /** + * @param cmmp point to calculate knn distance for + * @param k number of nearest neighbors to look for + * @param pointIDs list of point IDs to check + * @param knnDist sorted list of smallest knn distances (can already be filled to make updates possible) + * @param knnPointIndex list of corresponding knn indices + */ + private void getKnnInCluster(CMMPoint cmmp, int k, + ArrayList<Integer> pointIDs, + AutoExpandVector<Double> knnDist, + AutoExpandVector<Integer> knnPointIndex) { + + //iterate over every point in the choosen cluster, cal distance and insert into list + for (int p1 = 0; p1 < pointIDs.size(); p1++) { + int pid = pointIDs.get(p1); + if(cmmp.pID == pid) continue; + double dist = distance(cmmp,cmmpoints.get(pid)); + if(knnDist.size() < k || dist < knnDist.get(knnDist.size()-1)){ + int index = 0; + while(index < knnDist.size() && dist > knnDist.get(index)) { + index++; + } + knnDist.add(index, dist); + knnPointIndex.add(index,pid); + if(knnDist.size() > k){ + knnDist.remove(knnDist.size()-1); + knnPointIndex.remove(knnPointIndex.size()-1); + } + } + } + } + + + + /** + * calculate initial connectivities + */ + private void calculateGTPointQualities(){ + for (int p = 0; p < numPoints; p++) { + CMMPoint cmdp = cmmpoints.get(p); + if(!cmdp.isNoise()){ + cmdp.connectivity = getConnectionValue(cmdp, cmdp.workclass()); + cmdp.p.setMeasureValue("Connectivity", cmdp.connectivity); + } + } + } + + + + /** + * Calculate connections between clusters and merge clusters accordingly as + * long as connections exceed threshold + */ + private void calculateGTClusterConnections(){ + for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { + for (int c1 = 0; c1 < gt0Clusters.size(); c1++) { + gt0Clusters.get(c0).calculateClusterConnection(c1, true); + } + } + + boolean changedConnection = true; + while(changedConnection){ + if(debug){ + System.out.println("Cluster Connection"); + for (int c = 0; c < gt0Clusters.size(); c++) { + System.out.print("C"+gt0Clusters.get(c).label+" --> "); + for (int c1 = 0; c1 < gt0Clusters.get(c).connections.size(); c1++) { + System.out.print(" C"+gt0Clusters.get(c1).label+": "+gt0Clusters.get(c).connections.get(c1)); + } + System.out.println(""); + } + System.out.println(""); + } + + double max = 0; + int maxIndexI = -1; + int maxIndexJ = -1; + + changedConnection = false; + for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { + for (int c1 = c0+1; c1 < gt0Clusters.size(); c1++) { + if(c0==c1) continue; + double min =Math.min(gt0Clusters.get(c0).connections.get(c1), gt0Clusters.get(c1).connections.get(c0)); + if(min > max){ + max = min; + maxIndexI = c0; + maxIndexJ = c1; + } + } + } + if(maxIndexI!=-1 && max > tauConnection){ + gt0Clusters.get(maxIndexI).mergeCluster(maxIndexJ); + if(debug) + System.out.println("Merging "+maxIndexI+" and "+maxIndexJ+" because of connection "+max); + + changedConnection = true; + } + } + numGT0Classes = gt0Clusters.size(); + } + + + /** + * Calculates how well the original clusters are separable. + * Small values indicate bad separability, values close to 1 indicate good separability + * @return index of seperability + */ + public double getClassSeparability(){ +// int totalConn = numGTClasses*(numGTClasses-1)/2; +// int mergedConn = 0; +// for(GTCluster gt : gt0Clusters){ +// int merged = gt.clusterRepresentations.size(); +// if(merged > 1) +// mergedConn+=merged * (merged-1)/2; +// } +// if(totalConn == 0) +// return 0; +// else +// return 1-mergedConn/(double)totalConn; + return numGT0Classes/(double)numGTClasses; + + } + + + /** + * Calculates how well noise is separable from the given clusters + * Small values indicate bad separability, values close to 1 indicate good separability + * @return index of noise separability + */ + public double getNoiseSeparability(){ + if(noise.isEmpty()) + return 1; + + double connectivity = 0; + for(int p : noise){ + CMMPoint npoint = cmmpoints.get(p); + double maxConnection = 0; + + //TODO: some kind of pruning possible. what about weighting? + for (int c = 0; c < gt0Clusters.size(); c++) { + double connection = getConnectionValue(npoint, c); + if(connection > maxConnection) + maxConnection = connection; + } + connectivity+=maxConnection; + npoint.p.setMeasureValue("MaxConnection", maxConnection); + } + + return 1-(connectivity / noise.size()); + } + + + /** + * Calculates the relative number of errors being caused by the underlying cluster model + * @return quality of the model + */ + public double getModelQuality(){ + for(int p = 0; p < numPoints; p++){ + CMMPoint cmdp = cmmpoints.get(p); + for(int hc = 0; hc < numGTClusters;hc++){ + if(gtClustering.get(hc).getGroundTruth() != cmdp.trueClass){ + if(gtClustering.get(hc).getInclusionProbability(cmdp) >= 1){ + if(!cmdp.isNoise()) + pointErrorByModel++; + else + noiseErrorByModel++; + break; + } + } + } + } + if(debug) + System.out.println("Error by model: noise "+noiseErrorByModel+" point "+pointErrorByModel); + + return 1-((pointErrorByModel + noiseErrorByModel)/(double) numPoints); + } + + + /** + * Get CMM internal point + * @param index of the point + * @return cmm point + */ + protected CMMPoint getPoint(int index){ + return cmmpoints.get(index); + } + + + /** + * Return cluster + * @param index of the cluster to return + * @return cluster + */ + protected GTCluster getGT0Cluster(int index){ + return gt0Clusters.get(index); + } + + /** + * Number of classes/clusters of the new clustering + * @return number of new clusters + */ + protected int getNumberOfGT0Classes() { + return numGT0Classes; + } + + /** + * Calculates Euclidian distance + * @param inst1 point as double array + * @param inst2 point as double array + * @return euclidian distance + */ + private double distance(Instance inst1, Instance inst2){ + return distance(inst1, inst2.toDoubleArray()); + + } + + /** + * Calculates Euclidian distance + * @param inst1 point as an instance + * @param inst2 point as double array + * @return euclidian distance + */ + private double distance(Instance inst1, double[] inst2){ + double distance = 0.0; + for (int i = 0; i < numDims; i++) { + double d = inst1.value(i) - inst2[i]; + distance += d * d; + } + return Math.sqrt(distance); + } + + /** + * String with main CMM parameters + * @return main CMM parameter + */ + public String getParameterString(){ + String para = ""; + para+="k="+knnNeighbourhood+";"; + if(useExpConnectivity){ + para+="lambdaConnX="+lambdaConnX+";"; + para+="lambdaConn="+lamdaConn+";"; + para+="lambdaConnRef="+lambdaConnRefXValue+";"; + } + para+="m="+clusterConnectionMaxPoints+";"; + para+="tauConn="+tauConnection+";"; + + return para; + } +} + + http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java new file mode 100644 index 0000000..0d311e4 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java @@ -0,0 +1,174 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2010 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import java.util.ArrayList; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix; + +public class EntropyCollection extends MeasureCollection{ + + private static final Logger logger = LoggerFactory.getLogger(EntropyCollection.class); + + @Override + protected String[] getNames() { + return new String[]{"GT cross entropy","FC cross entropy","Homogeneity","Completeness","V-Measure","VarInformation"}; + } + + @Override + protected boolean[] getDefaultEnabled() { + return new boolean[]{false, false, false, false, false, false}; + } + + @Override + public void evaluateClustering(Clustering fclustering, Clustering hClustering, ArrayList<DataPoint> points) throws Exception { + + MembershipMatrix mm = new MembershipMatrix(fclustering, points); + int numClasses = mm.getNumClasses(); + int numCluster = fclustering.size()+1; + int n = mm.getTotalEntries(); + + + double FCentropy = 0; + if(numCluster > 1){ + for (int fc = 0; fc < numCluster; fc++){ + double weight = mm.getClusterSum(fc)/(double)n; + if(weight > 0) + FCentropy+= weight * Math.log10(weight); + } + FCentropy/=(-1*Math.log10(numCluster)); + } + + logger.debug("FC entropy: {}", FCentropy); + + double GTentropy = 0; + if(numClasses > 1){ + for (int hc = 0; hc < numClasses; hc++){ + double weight = mm.getClassSum(hc)/(double)n; + if(weight > 0) + GTentropy+= weight * Math.log10(weight); + } + GTentropy/=(-1*Math.log10(numClasses)); + } + + logger.debug("GT entropy: {}", GTentropy); + + //cluster based entropy + double FCcrossEntropy = 0; + + for (int fc = 0; fc < numCluster; fc++){ + double e = 0; + int clusterWeight = mm.getClusterSum(fc); + if(clusterWeight>0){ + for (int hc = 0; hc < numClasses; hc++) { + double p = mm.getClusterClassWeight(fc, hc)/(double)clusterWeight; + if(p!=0){ + e+=p * Math.log10(p); + } + } + FCcrossEntropy+=((clusterWeight/(double)n) * e); + } + } + if(numCluster > 1){ + FCcrossEntropy/=-1*Math.log10(numCluster); + } + + addValue("FC cross entropy", 1-FCcrossEntropy); + logger.debug("FC cross entropy: {}", 1 - FCcrossEntropy); + + //class based entropy + double GTcrossEntropy = 0; + for (int hc = 0; hc < numClasses; hc++){ + double e = 0; + int classWeight = mm.getClassSum(hc); + if(classWeight>0){ + for (int fc = 0; fc < numCluster; fc++) { + double p = mm.getClusterClassWeight(fc, hc)/(double)classWeight; + if(p!=0){ + e+=p * Math.log10(p); + } + } + } + GTcrossEntropy+=((classWeight/(double)n) * e); + } + if(numClasses > 1) + GTcrossEntropy/=-1*Math.log10(numClasses); + addValue("GT cross entropy", 1-GTcrossEntropy); + logger.debug("GT cross entropy: {}", 1 - GTcrossEntropy); + + double homogeneity; + if(FCentropy == 0) + homogeneity = 1; + else + homogeneity = 1 - FCcrossEntropy/FCentropy; + + //TODO set err values for now, needs to be debugged + if(homogeneity > 1 || homogeneity < 0) + addValue("Homogeneity",-1); + else + addValue("Homogeneity",homogeneity); + + double completeness; + if(GTentropy == 0) + completeness = 1; + else + completeness = 1 - GTcrossEntropy/GTentropy; + addValue("Completeness",completeness); + + double beta = 1; + double vmeasure = (1+ beta)*homogeneity*completeness/(beta *homogeneity+completeness); + + if(vmeasure > 1 || homogeneity < 0) + addValue("V-Measure",-1); + else + addValue("V-Measure",vmeasure); + + + + double mutual = 0; + for (int i = 0; i < numCluster; i++){ + for (int j = 0; j < numClasses; j++) { + if(mm.getClusterClassWeight(i, j)==0) continue; + double m = Math.log10(mm.getClusterClassWeight(i, j)/(double)mm.getClusterSum(i)/(double)mm.getClassSum(j)*(double)n); + m*= mm.getClusterClassWeight(i, j)/(double)n; + logger.debug("( {} / {}): ",m, m); + mutual+=m; + } + } + if(numClasses > 1) + mutual/=Math.log10(numClasses); + + double varInfo = 1; + if(FCentropy + GTentropy > 0) + varInfo = 2*mutual/(FCentropy + GTentropy); + + logger.debug("mutual: {} / VI: {}", mutual, varInfo); + addValue("VarInformation", varInfo); + + } + +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java new file mode 100644 index 0000000..6533f36 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java @@ -0,0 +1,115 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2010 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import java.util.ArrayList; + + +public class F1 extends MeasureCollection{ + + @Override + protected String[] getNames() { + return new String[]{"F1-P","F1-R","Purity"}; + } + + public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) { + + if (clustering.size()<0){ + addValue(0,0); + addValue(1,0); + return; + } + + MembershipMatrix mm = new MembershipMatrix(clustering, points); + //System.out.println(mm.toString()); + + int numClasses = mm.getNumClasses(); + if(mm.hasNoiseClass()) + numClasses--; + + + + //F1 as defined in P3C, try using F1 optimization + double F1_P = 0.0; + double purity = 0; + int realClusters = 0; + for (int i = 0; i < clustering.size(); i++) { + int max_weight = 0; + int max_weight_index = -1; + + //find max index + for (int j = 0; j < numClasses; j++) { + if(mm.getClusterClassWeight(i, j) > max_weight){ + max_weight = mm.getClusterClassWeight(i, j); + max_weight_index = j; + } + } + if(max_weight_index!=-1){ + realClusters++; + double precision = mm.getClusterClassWeight(i, max_weight_index)/(double)mm.getClusterSum(i); + double recall = mm.getClusterClassWeight(i, max_weight_index)/(double) mm.getClassSum(max_weight_index); + double f1 = 0; + if(precision > 0 || recall > 0){ + f1 = 2*precision*recall/(precision+recall); + } + F1_P += f1; + purity += precision; + + //TODO should we move setMeasure stuff into the Cluster interface? + clustering.get(i).setMeasureValue("F1-P", Double.toString(f1)); + } + } + if(realClusters > 0){ + F1_P/=realClusters; + purity/=realClusters; + } + addValue("F1-P",F1_P); + addValue("Purity",purity); + + + + //F1 as defined in .... mainly maximizes F1 for each class + double F1_R = 0.0; + for (int j = 0; j < numClasses; j++) { + double max_f1 = 0; + for (int i = 0; i < clustering.size(); i++) { + double precision = mm.getClusterClassWeight(i, j)/(double)mm.getClusterSum(i); + double recall = mm.getClusterClassWeight(i, j)/(double)mm.getClassSum(j); + double f1 = 0; + if(precision > 0 || recall > 0){ + f1 = 2*precision*recall/(precision+recall); + } + if(max_f1 < f1){ + max_f1 = f1; + } + } + F1_R+= max_f1; + } + F1_R/=numClasses; + + addValue("F1-R",F1_R); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java new file mode 100644 index 0000000..7f23c1b --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java @@ -0,0 +1,191 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2010 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + + +import com.yahoo.labs.samoa.instances.Instance; +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.cluster.SphereCluster; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import java.util.ArrayList; + + +public class General extends MeasureCollection{ + private int numPoints; + private int numFClusters; + private int numDims; + private double pointInclusionProbThreshold = 0.8; + private Clustering clustering; + private ArrayList<DataPoint> points; + + + public General() { + super(); + } + + + @Override + protected String[] getNames() { + //String[] names = {"GPrecision","GRecall","Redundancy","Overlap","numCluster","numClasses","Compactness"}; + return new String[]{"GPrecision","GRecall","Redundancy","numCluster","numClasses"}; + } + +// @Override +// protected boolean[] getDefaultEnabled() { +// boolean [] defaults = {false, false, false, false, false ,false}; +// return defaults; +// } + + @Override + public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception{ + + this.points = points; + this.clustering = clustering; + numPoints = points.size(); + numFClusters = clustering.size(); + numDims = points.get(0).numAttributes()-1; + + + int totalRedundancy = 0; + int trueCoverage = 0; + int totalCoverage = 0; + + int numNoise = 0; + for (int p = 0; p < numPoints; p++) { + int coverage = 0; + for (int c = 0; c < numFClusters; c++) { + //contained in cluster c? + if(clustering.get(c).getInclusionProbability(points.get(p)) >= pointInclusionProbThreshold){ + coverage++; + } + } + + if(points.get(p).classValue()==-1){ + numNoise++; + } + else{ + if(coverage>0) trueCoverage++; + } + + if(coverage>0) totalCoverage++; //points covered by clustering (incl. noise) + if(coverage>1) totalRedundancy++; //include noise + } + + addValue("numCluster", clustering.size()); + addValue("numClasses", trueClustering.size()); + addValue("Redundancy", ((double)totalRedundancy/(double)numPoints)); + addValue("GPrecision", (totalCoverage==0?0:((double)trueCoverage/(double)(totalCoverage)))); + addValue("GRecall", ((double)trueCoverage/(double)(numPoints-numNoise))); +// if(isEnabled(3)){ +// addValue("Compactness", computeCompactness()); +// } +// if(isEnabled(3)){ +// addValue("Overlap", computeOverlap()); +// } + } + + private double computeOverlap(){ + for (int c = 0; c < numFClusters; c++) { + if(!(clustering.get(c) instanceof SphereCluster)){ + System.out.println("Overlap only supports Sphere Cluster. Found: "+clustering.get(c).getClass()); + return Double.NaN; + } + } + + boolean[] overlap = new boolean[numFClusters]; + + for (int c0 = 0; c0 < numFClusters; c0++) { + if(overlap[c0]) continue; + SphereCluster s0 = (SphereCluster)clustering.get(c0); + for (int c1 = c0; c1 < clustering.size(); c1++) { + if(c1 == c0) continue; + SphereCluster s1 = (SphereCluster)clustering.get(c1); + if(s0.overlapRadiusDegree(s1) > 0){ + overlap[c0] = overlap[c1] = true; + } + } + } + + double totalOverlap = 0; + for (int c0 = 0; c0 < numFClusters; c0++) { + if(overlap[c0]) + totalOverlap++; + } + +// if(totalOverlap/(double)numFClusters > .8) RunVisualizer.pause(); + if(numFClusters>0) totalOverlap/=(double)numFClusters; + return totalOverlap; + } + + + private double computeCompactness(){ + if(numFClusters == 0) return 0; + for (int c = 0; c < numFClusters; c++) { + if(!(clustering.get(c) instanceof SphereCluster)){ + System.out.println("Compactness only supports Sphere Cluster. Found: "+clustering.get(c).getClass()); + return Double.NaN; + } + } + + //TODO weight radius by number of dimensions + double totalCompactness = 0; + for (int c = 0; c < numFClusters; c++) { + ArrayList<Instance> containedPoints = new ArrayList<Instance>(); + for (int p = 0; p < numPoints; p++) { + //p in c + if(clustering.get(c).getInclusionProbability(points.get(p)) >= pointInclusionProbThreshold){ + containedPoints.add(points.get(p)); + } + } + double compactness = 0; + if(containedPoints.size()>1){ + //cluster not empty + SphereCluster minEnclosingCluster = new SphereCluster(containedPoints, numDims); + double minRadius = minEnclosingCluster.getRadius(); + double cfRadius = ((SphereCluster)clustering.get(c)).getRadius(); + if(Math.abs(minRadius-cfRadius) < 0.1e-10){ + compactness = 1; + } + else + if(minRadius < cfRadius) + compactness = minRadius/cfRadius; + else{ + System.out.println("Optimal radius bigger then real one ("+(cfRadius-minRadius)+"), this is really wrong"); + compactness = 1; + } + } + else{ + double cfRadius = ((SphereCluster)clustering.get(c)).getRadius(); + if(cfRadius==0) compactness = 1; + } + + //weight by weight of cluster??? + totalCompactness+=compactness; + clustering.get(c).setMeasureValue("Compactness", Double.toString(compactness)); + } + return (totalCompactness/numFClusters); + } + + +} + + http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java new file mode 100644 index 0000000..4f57788 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java @@ -0,0 +1,96 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2010 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ +import java.util.ArrayList; + +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import com.yahoo.labs.samoa.instances.Instance; + +public class SSQ extends MeasureCollection { + + public SSQ() { + super(); + } + + @Override + public String[] getNames() { + return new String[]{"SSQ"}; + } + + @Override + protected boolean[] getDefaultEnabled() { + return new boolean[]{false}; + } + + // TODO Work on this later + //@Override + public void evaluateClusteringSamoa(Clustering clustering, + Clustering trueClsutering, ArrayList<Instance> points) { + double sum = 0.0; + for (Instance point : points) { + // don't include noise + if (point.classValue() == -1) { + continue; + } + + double minDistance = Double.MAX_VALUE; + for (int c = 0; c < clustering.size(); c++) { + double distance = 0.0; + double[] center = clustering.get(c).getCenter(); + for (int i = 0; i < center.length; i++) { + double d = point.value(i) - center[i]; + distance += d * d; + } + minDistance = Math.min(distance, minDistance); + } + + sum += minDistance; + } + + addValue(0, sum); + } + + @Override + public void evaluateClustering(Clustering clustering, Clustering trueClsutering, ArrayList<DataPoint> points) { + double sum = 0.0; + for (int p = 0; p < points.size(); p++) { + //don't include noise + if(points.get(p).classValue()==-1) continue; + + double minDistance = Double.MAX_VALUE; + for (int c = 0; c < clustering.size(); c++) { + double distance = 0.0; + double[] center = clustering.get(c).getCenter(); + for (int i = 0; i < center.length; i++) { + double d = points.get(p).value(i) - center[i]; + distance += d * d; + } + minDistance = Math.min(distance, minDistance); + } + + sum+=minDistance; + } + + addValue(0,sum); + } +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java new file mode 100644 index 0000000..25534a6 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/Separation.java @@ -0,0 +1,118 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2013 Yahoo! Inc. + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ +import com.yahoo.labs.samoa.instances.DenseInstance; +import com.yahoo.labs.samoa.instances.Instance; +import com.yahoo.labs.samoa.moa.cluster.Cluster; +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.cluster.SphereCluster; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import java.util.ArrayList; +import java.util.List; + +public class Separation extends MeasureCollection { + + public Separation() { + super(); + } + + @Override + protected String[] getNames() { + return new String[]{"BSS", "BSS-GT", "BSS-Ratio"}; + } + + //@Override + public void evaluateClusteringSamoa(Clustering clustering, + Clustering trueClustering, ArrayList<Instance> points) + throws Exception { + + double BSS_GT = 1.0; + double BSS; + int dimension = points.get(0).numAttributes() - 1; + SphereCluster sc = new SphereCluster(points, dimension); + + // DO INTERNAL EVALUATION + //clustering.getClustering().get(0).getCenter(); + + BSS = getBSS(clustering, sc.getCenter()); + + if (trueClustering != null) { + List<Instance> listInstances = new ArrayList<>(); + for (Cluster c : trueClustering.getClustering()) { + DenseInstance inst = new DenseInstance(c.getWeight(), c.getCenter()); + listInstances.add(inst); + } + SphereCluster gt = new SphereCluster(listInstances, dimension); + BSS_GT = getBSS(trueClustering, gt.getCenter()); + } + + addValue("BSS", BSS); + addValue("BSS-GT", BSS_GT); + addValue("BSS-Ratio", BSS / BSS_GT); + + } + + private double getBSS(Clustering clustering, double[] mean) { + double bss = 0.0; + for (int i = 0; i < clustering.size(); i++) { + double weight = clustering.get(i).getWeight(); + double sum = 0.0; + for (int j = 0; j < mean.length; j++) { + sum += Math.pow((mean[j] - clustering.get(i).getCenter()[j]), 2); + } + bss += weight * sum; + } + + return bss; + } + + @Override + protected void evaluateClustering(Clustering clustering, + Clustering trueClustering, ArrayList<DataPoint> points) + throws Exception { + double BSS_GT = 1.0; + double BSS; + int dimension = points.get(0).numAttributes() - 1; + SphereCluster sc = new SphereCluster(points, dimension); + + // DO INTERNAL EVALUATION + //clustering.getClustering().get(0).getCenter(); + + BSS = getBSS(clustering, sc.getCenter()); + + if (trueClustering != null) { + String s = ""; + List<Instance> listInstances = new ArrayList<>(); + for (Cluster c : trueClustering.getClustering()) { + DenseInstance inst = new DenseInstance(c.getWeight(), c.getCenter()); + listInstances.add(inst); + s += " " + c.getWeight(); + } + SphereCluster gt = new SphereCluster(listInstances, dimension); + BSS_GT = getBSS(trueClustering, gt.getCenter()); + } + + addValue("BSS", BSS); + addValue("BSS-GT", BSS_GT); + addValue("BSS-Ratio", BSS / BSS_GT); + } +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java new file mode 100644 index 0000000..6dee336 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SilhouetteCoefficient.java @@ -0,0 +1,137 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2010 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + + +import com.yahoo.labs.samoa.moa.cluster.Cluster; +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import java.util.ArrayList; + + +public class SilhouetteCoefficient extends MeasureCollection{ + private double pointInclusionProbThreshold = 0.8; + + public SilhouetteCoefficient() { + super(); + } + + @Override + protected boolean[] getDefaultEnabled() { + return new boolean[]{false}; + } + + @Override + public String[] getNames() { + return new String[]{"SilhCoeff"}; + } + + public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) { + int numFCluster = clustering.size(); + + double [][] pointInclusionProbFC = new double[points.size()][numFCluster]; + for (int p = 0; p < points.size(); p++) { + DataPoint point = points.get(p); + for (int fc = 0; fc < numFCluster; fc++) { + Cluster cl = clustering.get(fc); + pointInclusionProbFC[p][fc] = cl.getInclusionProbability(point); + } + } + + double silhCoeff = 0.0; + int totalCount = 0; + for (int p = 0; p < points.size(); p++) { + DataPoint point = points.get(p); + ArrayList<Integer> ownClusters = new ArrayList<>(); + for (int fc = 0; fc < numFCluster; fc++) { + if(pointInclusionProbFC[p][fc] > pointInclusionProbThreshold){ + ownClusters.add(fc); + } + } + + if(ownClusters.size() > 0){ + double[] distanceByClusters = new double[numFCluster]; + int[] countsByClusters = new int[numFCluster]; + //calculate averageDistance of p to all cluster + for (int p1 = 0; p1 < points.size(); p1++) { + DataPoint point1 = points.get(p1); + if(p1!= p && point1.classValue() != -1){ + for (int fc = 0; fc < numFCluster; fc++) { + if(pointInclusionProbFC[p1][fc] > pointInclusionProbThreshold){ + double distance = distance(point, point1); + distanceByClusters[fc]+=distance; + countsByClusters[fc]++; + } + } + } + } + + //find closest OWN cluster as clusters might overlap + double minAvgDistanceOwn = Double.MAX_VALUE; + int minOwnIndex = -1; + for (int fc : ownClusters) { + double normDist = distanceByClusters[fc]/(double)countsByClusters[fc]; + if(normDist < minAvgDistanceOwn){// && pointInclusionProbFC[p][fc] > pointInclusionProbThreshold){ + minAvgDistanceOwn = normDist; + minOwnIndex = fc; + } + } + + + //find closest other (or other own) cluster + double minAvgDistanceOther = Double.MAX_VALUE; + for (int fc = 0; fc < numFCluster; fc++) { + if(fc != minOwnIndex){ + double normDist = distanceByClusters[fc]/(double)countsByClusters[fc]; + if(normDist < minAvgDistanceOther){ + minAvgDistanceOther = normDist; + } + } + } + + double silhP = (minAvgDistanceOther-minAvgDistanceOwn)/Math.max(minAvgDistanceOther, minAvgDistanceOwn); + point.setMeasureValue("SC - own", minAvgDistanceOwn); + point.setMeasureValue("SC - other", minAvgDistanceOther); + point.setMeasureValue("SC", silhP); + + silhCoeff+=silhP; + totalCount++; + //System.out.println(point.getTimestamp()+" Silh "+silhP+" / "+avgDistanceOwn+" "+minAvgDistanceOther+" (C"+minIndex+")"); + } + } + if(totalCount>0) + silhCoeff/=(double)totalCount; + //normalize from -1, 1 to 0,1 + silhCoeff = (silhCoeff+1)/2.0; + addValue(0,silhCoeff); + } + + private double distance(DataPoint inst1, DataPoint inst2){ + double distance = 0.0; + int numDims = inst1.numAttributes(); + for (int i = 0; i < numDims; i++) { + double d = inst1.value(i) - inst2.value(i); + distance += d * d; + } + return Math.sqrt(distance); + } +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java new file mode 100644 index 0000000..6fc7adc --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/StatisticalCollection.java @@ -0,0 +1,186 @@ +package com.yahoo.labs.samoa.evaluation.measures; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2010 RWTH Aachen University, Germany + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import java.util.ArrayList; +import java.util.Arrays; + +import com.yahoo.labs.samoa.moa.cluster.Clustering; +import com.yahoo.labs.samoa.moa.core.DataPoint; +import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; +import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix; + +public class StatisticalCollection extends MeasureCollection{ + private boolean debug = false; + + @Override + protected String[] getNames() { + //String[] names = {"van Dongen","Rand statistic", "C Index"}; + return new String[]{"van Dongen","Rand statistic"}; + } + + @Override + protected boolean[] getDefaultEnabled() { + return new boolean[]{false, false}; + } + + @Override + public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception { + + + MembershipMatrix mm = new MembershipMatrix(clustering, points); + int numClasses = mm.getNumClasses(); + int numCluster = clustering.size()+1; + int n = mm.getTotalEntries(); + + double dongenMaxFC = 0; + double dongenMaxSumFC = 0; + for (int i = 0; i < numCluster; i++){ + double max = 0; + for (int j = 0; j < numClasses; j++) { + if(mm.getClusterClassWeight(i, j)>max) max = mm.getClusterClassWeight(i, j); + } + dongenMaxFC+=max; + if(mm.getClusterSum(i)>dongenMaxSumFC) dongenMaxSumFC = mm.getClusterSum(i); + } + + double dongenMaxHC = 0; + double dongenMaxSumHC = 0; + for (int j = 0; j < numClasses; j++) { + double max = 0; + for (int i = 0; i < numCluster; i++){ + if(mm.getClusterClassWeight(i, j)>max) max = mm.getClusterClassWeight(i, j); + } + dongenMaxHC+=max; + if(mm.getClassSum(j)>dongenMaxSumHC) dongenMaxSumHC = mm.getClassSum(j); + } + + double dongen = (dongenMaxFC + dongenMaxHC)/(2*n); + //normalized dongen + //double dongen = 1-(2*n - dongenMaxFC - dongenMaxHC)/(2*n - dongenMaxSumFC - dongenMaxSumHC); + if(debug) + System.out.println("Dongen HC:"+dongenMaxHC+" FC:"+dongenMaxFC+" Total:"+dongen+" n "+n); + + addValue("van Dongen", dongen); + + + //Rand index + //http://www.cais.ntu.edu.sg/~qihe/menu4.html + double m1 = 0; + for (int j = 0; j < numClasses; j++) { + double v = mm.getClassSum(j); + m1+= v*(v-1)/2.0; + } + double m2 = 0; + for (int i = 0; i < numCluster; i++){ + double v = mm.getClusterSum(i); + m2+= v*(v-1)/2.0; + } + + double m = 0; + for (int i = 0; i < numCluster; i++){ + for (int j = 0; j < numClasses; j++) { + double v = mm.getClusterClassWeight(i, j); + m+= v*(v-1)/2.0; + } + } + double M = n*(n-1)/2.0; + double rand = (M - m1 - m2 +2*m)/M; + //normalized rand + //double rand = (m - m1*m2/M)/(m1/2.0 + m2/2.0 - m1*m2/M); + + addValue("Rand statistic", rand); + + + //addValue("C Index",cindex(clustering, points)); + } + + + + public double cindex(Clustering clustering, ArrayList<DataPoint> points){ + int numClusters = clustering.size(); + double withinClustersDistance = 0; + int numDistancesWithin = 0; + double numDistances = 0; + + //double[] withinClusters = new double[numClusters]; + double[] minWithinClusters = new double[numClusters]; + double[] maxWithinClusters = new double[numClusters]; + ArrayList<Integer>[] pointsInClusters = new ArrayList[numClusters]; + for (int c = 0; c < numClusters; c++) { + pointsInClusters[c] = new ArrayList<>(); + minWithinClusters[c] = Double.MAX_VALUE; + maxWithinClusters[c] = Double.MIN_VALUE; + } + + for (int p = 0; p < points.size(); p++) { + for (int c = 0; c < clustering.size(); c++) { + if(clustering.get(c).getInclusionProbability(points.get(p)) > 0.8){ + pointsInClusters[c].add(p); + numDistances++; + } + } + } + + //calc within cluster distances + min and max values + for (int c = 0; c < numClusters; c++) { + int numDistancesInC = 0; + ArrayList<Integer> pointsInC = pointsInClusters[c]; + for (int p = 0; p < pointsInC.size(); p++) { + DataPoint point = points.get(pointsInC.get(p)); + for (int p1 = p+1; p1 < pointsInC.size(); p1++) { + numDistancesWithin++; + numDistancesInC++; + DataPoint point1 = points.get(pointsInC.get(p1)); + double dist = point.getDistance(point1); + withinClustersDistance+=dist; + if(minWithinClusters[c] > dist) minWithinClusters[c] = dist; + if(maxWithinClusters[c] < dist) maxWithinClusters[c] = dist; + } + } + } + + double minWithin = Double.MAX_VALUE; + double maxWithin = Double.MIN_VALUE; + for (int c = 0; c < numClusters; c++) { + if(minWithinClusters[c] < minWithin) + minWithin = minWithinClusters[c]; + if(maxWithinClusters[c] > maxWithin) + maxWithin = maxWithinClusters[c]; + } + + double cindex = 0; + if(numDistancesWithin != 0){ + double meanWithinClustersDistance = withinClustersDistance/numDistancesWithin; + cindex = (meanWithinClustersDistance - minWithin)/(maxWithin-minWithin); + } + + + if(debug){ + System.out.println("Min:"+Arrays.toString(minWithinClusters)); + System.out.println("Max:"+Arrays.toString(maxWithinClusters)); + System.out.println("totalWithin:"+numDistancesWithin); + } + return cindex; + } + + +}
