http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java index 53fb4dc..c31fa74 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/CMM_GTAnalysis.java @@ -33,811 +33,828 @@ import java.util.Iterator; * * CMM: Ground truth analysis * - * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 + * Reference: Kremer et al., + * "An Effective Evaluation Measure for Clustering on Evolving Data Streams", + * KDD, 2011 * - * @author Timm jansen - * Data Management and Data Exploration Group, RWTH Aachen University -*/ + * @author Timm jansen Data Management and Data Exploration Group, RWTH Aachen + * University + */ /* - * TODO: - * - try to avoid calcualting the radius multiple times - * - avoid the full distance map? - * - knn functionality in clusters - * - noise error + * TODO: - try to avoid calcualting the radius multiple times - avoid the full + * distance map? - knn functionality in clusters - noise error */ -public class CMM_GTAnalysis{ - - /** - * the given ground truth clustering - */ - private Clustering gtClustering; - - /** - * list of given points within the horizon - */ - private ArrayList<CMMPoint> cmmpoints; - +public class CMM_GTAnalysis { + + /** + * the given ground truth clustering + */ + private Clustering gtClustering; + + /** + * list of given points within the horizon + */ + private ArrayList<CMMPoint> cmmpoints; + + /** + * the newly calculate ground truth clustering + */ + private ArrayList<GTCluster> gt0Clusters; + + /** + * IDs of noise points + */ + private ArrayList<Integer> noise; + + /** + * total number of points + */ + private int numPoints; + + /** + * number of clusters of the original ground truth + */ + private int numGTClusters; + + /** + * number of classes of the original ground truth, in case of a micro + * clustering ground truth this differs from numGTClusters + */ + private int numGTClasses; + + /** + * number of classes after we are done with the analysis + */ + private int numGT0Classes; + + /** + * number of dimensions + */ + private int numDims; + + /** + * mapping between true cluster ID/class label of the original ground truth + * and the internal cluster ID/working class label. + * + * different original cluster IDs might map to the same new cluster ID due to + * merging of two clusters + */ + private HashMap<Integer, Integer> mapTrueLabelToWorkLabel; + + /** + * log of how clusters have been merged (for debugging) + */ + private int[] mergeMap; + + /** + * number of non-noise points that will create an error due to the underlying + * clustering model (e.g. point being covered by two clusters representing + * different classes) + */ + private int noiseErrorByModel; + + /** + * number of noise points that will create an error due to the underlying + * clustering model (e.g. 
noise point being covered by a cluster) + */ + private int pointErrorByModel; + + /** + * CMM debug mode + */ + private boolean debug = false; + + /******* CMM parameter ***********/ + + /** + * defines how many nearest neighbors will be used + */ + private int knnNeighbourhood = 2; + + /** + * the threshold which defines when ground truth clusters will be merged. set + * to 1 to disable merging + */ + private double tauConnection = 0.5; + + /** + * experimental (default: disabled) separate k for points to cluster and + * cluster to cluster + */ + private double clusterConnectionMaxPoints = knnNeighbourhood; + + /** + * experimental (default: disabled) use exponential connectivity function to + * model different behavior: closer points will have a stronger connection + * compared to the linear function. Use ConnRefXValue and ConnX to better + * parameterize lambda, which controls the decay of the connectivity + */ + private boolean useExpConnectivity = false; + private double lambdaConnRefXValue = 0.01; + private double lambdaConnX = 4; + private double lamdaConn; + + /******************************************/ + + /** + * Wrapper class for data points to store CMM relevant attributes + * + */ + protected class CMMPoint extends DataPoint { /** - * the newly calculate ground truth clustering + * Reference to original point */ - private ArrayList<GTCluster> gt0Clusters; + protected DataPoint p = null; /** - * IDs of noise points + * point ID */ - private ArrayList<Integer> noise; - - /** - * total number of points - */ - private int numPoints; + protected int pID = 0; /** - * number of clusters of the original ground truth + * true class label */ - private int numGTClusters; + protected int trueClass = -1; /** - * number of classes of the original ground truth, in case of a - * micro clustering ground truth this differs from numGTClusters + * the connectivity of the point to its cluster */ - private int numGTClasses; + protected double connectivity = 1.0; /** - * number of classes after we are done with the analysis + * knn distnace within own cluster */ - private int numGT0Classes; + protected double knnInCluster = 0.0; /** - * number of dimensions + * knn indices (for debugging only) */ - private int numDims; + protected ArrayList<Integer> knnIndices; + + public CMMPoint(DataPoint point, int id) { + // make a copy, but keep reference + super(point, point.getTimestamp()); + p = point; + pID = id; + trueClass = (int) point.classValue(); + } /** - * mapping between true cluster ID/class label of the original ground truth - * and the internal cluster ID/working class label. + * Retruns the current working label of the cluster the point belongs to. + * The label can change due to merging of clusters. * - * different original cluster IDs might map to the same new cluster ID due to merging of two clusters + * @return the current working class label */ - private HashMap<Integer, Integer> mapTrueLabelToWorkLabel; + protected int workclass() { + if (trueClass == -1) + return -1; + else + return mapTrueLabelToWorkLabel.get(trueClass); + } + } - /** - * log of how clusters have been merged (for debugging) - */ - private int[] mergeMap; + /** + * Main class to model the new clusters that will be the output of the cluster + * analysis + * + */ + protected class GTCluster { + /** points that are per definition in the cluster */ + private ArrayList<Integer> points = new ArrayList<Integer>(); /** - * number of non-noise points that will create an error due to the underlying clustering model - * (e.g. 
point being covered by two clusters representing different classes) + * a new GT cluster consists of one or more "old" GT clusters. + * Connected/overlapping clusters cannot be merged directly because of the + * underlying cluster model. E.g. for merging two spherical clusters the new + * cluster sphere can cover a lot more space then two separate smaller + * spheres. To keep the original coverage we need to keep the orignal + * clusters and merge them on an abstract level. */ - private int noiseErrorByModel; + private ArrayList<Integer> clusterRepresentations = new ArrayList<Integer>(); - /** - * number of noise points that will create an error due to the underlying clustering model - * (e.g. noise point being covered by a cluster) - */ - private int pointErrorByModel; - - /** - * CMM debug mode - */ - private boolean debug = false; + /** current work class (changes when merging) */ + private int workclass; - - /******* CMM parameter ***********/ + /** original work class */ + private final int orgWorkClass; - /** - * defines how many nearest neighbors will be used - */ - private int knnNeighbourhood = 2; + /** original class label */ + private final int label; - /** - * the threshold which defines when ground truth clusters will be merged. - * set to 1 to disable merging - */ - private double tauConnection = 0.5; - - /** - * experimental (default: disabled) - * separate k for points to cluster and cluster to cluster - */ - private double clusterConnectionMaxPoints = knnNeighbourhood; - - /** - * experimental (default: disabled) - * use exponential connectivity function to model different behavior: - * closer points will have a stronger connection compared to the linear function. - * Use ConnRefXValue and ConnX to better parameterize lambda, which controls - * the decay of the connectivity - */ - private boolean useExpConnectivity = false; - private double lambdaConnRefXValue = 0.01; - private double lambdaConnX = 4; - private double lamdaConn; - - - /******************************************/ - - - /** - * Wrapper class for data points to store CMM relevant attributes - * - */ - protected class CMMPoint extends DataPoint{ - /** - * Reference to original point - */ - protected DataPoint p = null; - - /** - * point ID - */ - protected int pID = 0; - - - /** - * true class label - */ - protected int trueClass = -1; - - - /** - * the connectivity of the point to its cluster - */ - protected double connectivity = 1.0; - - - /** - * knn distnace within own cluster - */ - protected double knnInCluster = 0.0; - - - /** - * knn indices (for debugging only) - */ - protected ArrayList<Integer> knnIndices; - - public CMMPoint(DataPoint point, int id) { - //make a copy, but keep reference - super(point,point.getTimestamp()); - p = point; - pID = id; - trueClass = (int)point.classValue(); - } + /** clusters that have been merged into this cluster (debugging) */ + private ArrayList<Integer> mergedWorkLabels = null; - - /** - * Retruns the current working label of the cluster the point belongs to. - * The label can change due to merging of clusters. 
- * - * @return the current working class label - */ - protected int workclass(){ - if(trueClass == -1 ) - return -1; - else - return mapTrueLabelToWorkLabel.get(trueClass); - } + /** average knn distance of all points in the cluster */ + private double knnMeanAvg = 0; + + /** average deviation of knn distance of all points */ + private double knnDevAvg = 0; + + /** connectivity of the cluster to all other clusters */ + private ArrayList<Double> connections = new ArrayList<Double>(); + + private GTCluster(int workclass, int label, int gtClusteringID) { + this.orgWorkClass = workclass; + this.workclass = workclass; + this.label = label; + this.clusterRepresentations.add(gtClusteringID); } - - /** - * Main class to model the new clusters that will be the output of the cluster analysis - * + * The original class label the cluster represents + * + * @return original class label */ - protected class GTCluster{ - /** points that are per definition in the cluster */ - private ArrayList<Integer> points = new ArrayList<Integer>(); - - /** a new GT cluster consists of one or more "old" GT clusters. - * Connected/overlapping clusters cannot be merged directly because of the - * underlying cluster model. E.g. for merging two spherical clusters the new - * cluster sphere can cover a lot more space then two separate smaller spheres. - * To keep the original coverage we need to keep the orignal clusters and merge - * them on an abstract level. */ - private ArrayList<Integer> clusterRepresentations = new ArrayList<Integer>(); - - /** current work class (changes when merging) */ - private int workclass; - - /** original work class */ - private final int orgWorkClass; - - /** original class label*/ - private final int label; - - /** clusters that have been merged into this cluster (debugging)*/ - private ArrayList<Integer> mergedWorkLabels = null; - - /** average knn distance of all points in the cluster*/ - private double knnMeanAvg = 0; - - /** average deviation of knn distance of all points*/ - private double knnDevAvg = 0; - - /** connectivity of the cluster to all other clusters */ - private ArrayList<Double> connections = new ArrayList<Double>(); - - - private GTCluster(int workclass, int label, int gtClusteringID) { - this.orgWorkClass = workclass; - this.workclass = workclass; - this.label = label; - this.clusterRepresentations.add(gtClusteringID); - } + protected int getLabel() { + return label; + } - - /** - * The original class label the cluster represents - * @return original class label - */ - protected int getLabel(){ - return label; - } + /** + * Calculate the probability of the point being covered through the cluster + * + * @param point + * to calculate the probability for + * @return probability of the point being covered through the cluster + */ + protected double getInclusionProbability(CMMPoint point) { + double prob = Double.MIN_VALUE; + // check all cluster representatives for coverage + for (int c = 0; c < clusterRepresentations.size(); c++) { + double tmp_prob = gtClustering.get(clusterRepresentations.get(c)).getInclusionProbability(point); + if (tmp_prob > prob) + prob = tmp_prob; + } + return prob; + } - /** - * Calculate the probability of the point being covered through the cluster - * @param point to calculate the probability for - * @return probability of the point being covered through the cluster - */ - protected double getInclusionProbability(CMMPoint point){ - double prob = Double.MIN_VALUE; - //check all cluster representatives for coverage - for (int c = 0; c < 
clusterRepresentations.size(); c++) { - double tmp_prob = gtClustering.get(clusterRepresentations.get(c)).getInclusionProbability(point); - if(tmp_prob > prob) prob = tmp_prob; - } - return prob; + /** + * calculate knn distances of points within own cluster + average knn + * distance and average knn distance deviation of all points + */ + private void calculateKnn() { + for (int p0 : points) { + CMMPoint cmdp = cmmpoints.get(p0); + if (!cmdp.isNoise()) { + AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); + AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); + + // calculate nearest neighbours + getKnnInCluster(cmdp, knnNeighbourhood, points, knnDist, knnPointIndex); + + // TODO: What to do if we have less then k neighbours? + double avgKnn = 0; + for (int i = 0; i < knnDist.size(); i++) { + avgKnn += knnDist.get(i); + } + if (knnDist.size() != 0) + avgKnn /= knnDist.size(); + cmdp.knnInCluster = avgKnn; + cmdp.knnIndices = knnPointIndex; + cmdp.p.setMeasureValue("knnAvg", cmdp.knnInCluster); + + knnMeanAvg += avgKnn; + knnDevAvg += Math.pow(avgKnn, 2); } + } + knnMeanAvg = knnMeanAvg / (double) points.size(); + knnDevAvg = knnDevAvg / (double) points.size(); - - /** - * calculate knn distances of points within own cluster - * + average knn distance and average knn distance deviation of all points - */ - private void calculateKnn(){ - for (int p0 : points) { - CMMPoint cmdp = cmmpoints.get(p0); - if(!cmdp.isNoise()){ - AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); - AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); - - //calculate nearest neighbours - getKnnInCluster(cmdp, knnNeighbourhood, points, knnDist,knnPointIndex); - - //TODO: What to do if we have less then k neighbours? - double avgKnn = 0; - for (int i = 0; i < knnDist.size(); i++) { - avgKnn+= knnDist.get(i); - } - if(knnDist.size()!=0) - avgKnn/=knnDist.size(); - cmdp.knnInCluster = avgKnn; - cmdp.knnIndices = knnPointIndex; - cmdp.p.setMeasureValue("knnAvg", cmdp.knnInCluster); - - knnMeanAvg+=avgKnn; - knnDevAvg+=Math.pow(avgKnn,2); - } - } - knnMeanAvg=knnMeanAvg/(double)points.size(); - knnDevAvg=knnDevAvg/(double)points.size(); - - double variance = knnDevAvg-Math.pow(knnMeanAvg,2.0); - // Due to numerical errors, small negative values can occur. - if (variance <= 0.0) variance = 1e-50; - knnDevAvg = Math.sqrt(variance); + double variance = knnDevAvg - Math.pow(knnMeanAvg, 2.0); + // Due to numerical errors, small negative values can occur. 
+ if (variance <= 0.0) + variance = 1e-50; + knnDevAvg = Math.sqrt(variance); - } + } - - /** - * Calculate the connection of a cluster to this cluster - * @param otherCid cluster id of the other cluster - * @param initial flag for initial run - */ - private void calculateClusterConnection(int otherCid, boolean initial){ - double avgConnection = 0; - if(workclass==otherCid){ - avgConnection = 1; - } - else{ - AutoExpandVector<Double> kmax = new AutoExpandVector<Double>(); - AutoExpandVector<Integer> kmaxIndexes = new AutoExpandVector<Integer>(); - - for(int p : points){ - CMMPoint cmdp = cmmpoints.get(p); - double con_p_Cj = getConnectionValue(cmmpoints.get(p), otherCid); - double connection = cmdp.connectivity * con_p_Cj; - if(initial){ - cmdp.p.setMeasureValue("Connection to C"+otherCid, con_p_Cj); - } - - //connection - if(kmax.size() < clusterConnectionMaxPoints || connection > kmax.get(kmax.size()-1)){ - int index = 0; - while(index < kmax.size() && connection < kmax.get(index)) { - index++; - } - kmax.add(index, connection); - kmaxIndexes.add(index, p); - if(kmax.size() > clusterConnectionMaxPoints){ - kmax.remove(kmax.size()-1); - kmaxIndexes.add(kmaxIndexes.size()-1); - } - } - } - //connection - for (int k = 0; k < kmax.size(); k++) { - avgConnection+= kmax.get(k); - } - avgConnection/=kmax.size(); + /** + * Calculate the connection of a cluster to this cluster + * + * @param otherCid + * cluster id of the other cluster + * @param initial + * flag for initial run + */ + private void calculateClusterConnection(int otherCid, boolean initial) { + double avgConnection = 0; + if (workclass == otherCid) { + avgConnection = 1; + } + else { + AutoExpandVector<Double> kmax = new AutoExpandVector<Double>(); + AutoExpandVector<Integer> kmaxIndexes = new AutoExpandVector<Integer>(); + + for (int p : points) { + CMMPoint cmdp = cmmpoints.get(p); + double con_p_Cj = getConnectionValue(cmmpoints.get(p), otherCid); + double connection = cmdp.connectivity * con_p_Cj; + if (initial) { + cmdp.p.setMeasureValue("Connection to C" + otherCid, con_p_Cj); + } + + // connection + if (kmax.size() < clusterConnectionMaxPoints || connection > kmax.get(kmax.size() - 1)) { + int index = 0; + while (index < kmax.size() && connection < kmax.get(index)) { + index++; } - - if(otherCid<connections.size()){ - connections.set(otherCid, avgConnection); + kmax.add(index, connection); + kmaxIndexes.add(index, p); + if (kmax.size() > clusterConnectionMaxPoints) { + kmax.remove(kmax.size() - 1); + kmaxIndexes.add(kmaxIndexes.size() - 1); } - else - if(connections.size() == otherCid){ - connections.add(avgConnection); - } - else - System.out.println("Something is going really wrong with the connection listing!"+knnNeighbourhood+" "+tauConnection); + } } - - - /** - * Merge a cluster into this cluster - * @param mergeID the ID of the cluster to be merged - */ - private void mergeCluster(int mergeID){ - if(mergeID < gt0Clusters.size()){ - //track merging (debugging) - for (int i = 0; i < numGTClasses; i++) { - if(mergeMap[i]==mergeID) - mergeMap[i]=workclass; - if(mergeMap[i]>mergeID) - mergeMap[i]--; - } - GTCluster gtcMerge = gt0Clusters.get(mergeID); - if(debug) - System.out.println("Merging C"+gtcMerge.workclass+" into C"+workclass+ - " with Con "+connections.get(mergeID)+" / "+gtcMerge.connections.get(workclass)); - - - //update mapTrueLabelToWorkLabel - mapTrueLabelToWorkLabel.put(gtcMerge.label, workclass); - Iterator iterator = mapTrueLabelToWorkLabel.keySet().iterator(); - while (iterator.hasNext()) { - Integer key 
= (Integer)iterator.next(); - //update pointer of already merged cluster - int value = mapTrueLabelToWorkLabel.get(key); - if(value == mergeID) - mapTrueLabelToWorkLabel.put(key, workclass); - if(value > mergeID) - mapTrueLabelToWorkLabel.put(key, value-1); - } - - //merge points from B into A - points.addAll(gtcMerge.points); - clusterRepresentations.addAll(gtcMerge.clusterRepresentations); - if(mergedWorkLabels==null){ - mergedWorkLabels = new ArrayList<Integer>(); - } - mergedWorkLabels.add(gtcMerge.orgWorkClass); - if(gtcMerge.mergedWorkLabels!=null) - mergedWorkLabels.addAll(gtcMerge.mergedWorkLabels); - - gt0Clusters.remove(mergeID); - - //update workclass labels - for(int c=mergeID; c < gt0Clusters.size(); c++){ - gt0Clusters.get(c).workclass = c; - } - - //update knn distances - calculateKnn(); - for(int c=0; c < gt0Clusters.size(); c++){ - gt0Clusters.get(c).connections.remove(mergeID); - - //recalculate connection from other clusters to the new merged one - gt0Clusters.get(c).calculateClusterConnection(workclass,false); - //and from new merged one to other clusters - gt0Clusters.get(workclass).calculateClusterConnection(c,false); - } - } - else{ - System.out.println("Merge indices are not valid"); - } + // connection + for (int k = 0; k < kmax.size(); k++) { + avgConnection += kmax.get(k); } + avgConnection /= kmax.size(); + } + + if (otherCid < connections.size()) { + connections.set(otherCid, avgConnection); + } + else if (connections.size() == otherCid) { + connections.add(avgConnection); + } + else + System.out.println("Something is going really wrong with the connection listing!" + knnNeighbourhood + " " + + tauConnection); } - /** - * @param trueClustering the ground truth clustering - * @param points data points - * @param enableClassMerge allow class merging (should be set to true on default) + * Merge a cluster into this cluster + * + * @param mergeID + * the ID of the cluster to be merged */ - public CMM_GTAnalysis(Clustering trueClustering, ArrayList<DataPoint> points, boolean enableClassMerge){ - if(debug) - System.out.println("GT Analysis Debug Output"); - - noiseErrorByModel = 0; - pointErrorByModel = 0; - if(!enableClassMerge){ - tauConnection = 1.0; - } - - lamdaConn = -Math.log(lambdaConnRefXValue)/Math.log(2)/lambdaConnX; - - this.gtClustering = trueClustering; - - numPoints = points.size(); - numDims = points.get(0).numAttributes()-1; - numGTClusters = gtClustering.size(); - - //init mappings between work and true labels - mapTrueLabelToWorkLabel = new HashMap<Integer, Integer>(); - - //set up base of new clustering - gt0Clusters = new ArrayList<GTCluster>(); - int numWorkClasses = 0; - //create label to worklabel mapping as real labels can be just a set of unordered integers - for (int i = 0; i < numGTClusters; i++) { - int label = (int)gtClustering.get(i).getGroundTruth(); - if(!mapTrueLabelToWorkLabel.containsKey(label)){ - gt0Clusters.add(new GTCluster(numWorkClasses,label,i)); - mapTrueLabelToWorkLabel.put(label,numWorkClasses); - numWorkClasses++; - } - else{ - gt0Clusters.get(mapTrueLabelToWorkLabel.get(label)).clusterRepresentations.add(i); - } - } - numGTClasses = numWorkClasses; - - mergeMap = new int[numGTClasses]; + private void mergeCluster(int mergeID) { + if (mergeID < gt0Clusters.size()) { + // track merging (debugging) for (int i = 0; i < numGTClasses; i++) { - mergeMap[i]=i; - } - - //create cmd point wrapper instances - cmmpoints = new ArrayList<CMMPoint>(); - for (int p = 0; p < points.size(); p++) { - CMMPoint cmdp = new 
CMMPoint(points.get(p), p); - cmmpoints.add(cmdp); + if (mergeMap[i] == mergeID) + mergeMap[i] = workclass; + if (mergeMap[i] > mergeID) + mergeMap[i]--; } - - - //split points up into their GTClusters and Noise (according to class labels) - noise = new ArrayList<Integer>(); - for (int p = 0; p < numPoints; p++) { - if(cmmpoints.get(p).isNoise()){ - noise.add(p); - } - else{ - gt0Clusters.get(cmmpoints.get(p).workclass()).points.add(p); - } + GTCluster gtcMerge = gt0Clusters.get(mergeID); + if (debug) + System.out.println("Merging C" + gtcMerge.workclass + " into C" + workclass + + " with Con " + connections.get(mergeID) + " / " + gtcMerge.connections.get(workclass)); + + // update mapTrueLabelToWorkLabel + mapTrueLabelToWorkLabel.put(gtcMerge.label, workclass); + Iterator iterator = mapTrueLabelToWorkLabel.keySet().iterator(); + while (iterator.hasNext()) { + Integer key = (Integer) iterator.next(); + // update pointer of already merged cluster + int value = mapTrueLabelToWorkLabel.get(key); + if (value == mergeID) + mapTrueLabelToWorkLabel.put(key, workclass); + if (value > mergeID) + mapTrueLabelToWorkLabel.put(key, value - 1); } - //calculate initial knnMean and knnDev - for (GTCluster gtc : gt0Clusters) { - gtc.calculateKnn(); + // merge points from B into A + points.addAll(gtcMerge.points); + clusterRepresentations.addAll(gtcMerge.clusterRepresentations); + if (mergedWorkLabels == null) { + mergedWorkLabels = new ArrayList<Integer>(); } + mergedWorkLabels.add(gtcMerge.orgWorkClass); + if (gtcMerge.mergedWorkLabels != null) + mergedWorkLabels.addAll(gtcMerge.mergedWorkLabels); - //calculate cluster connections - calculateGTClusterConnections(); + gt0Clusters.remove(mergeID); - //calculate point connections with own clusters - calculateGTPointQualities(); - - if(debug) - System.out.println("GT Analysis Debug End"); - - } - - /** - * Calculate the connection of a point to a cluster - * - * @param cmmp the point to calculate the connection for - * @param clusterID the corresponding cluster - * @return the connection value - */ - //TODO: Cache the connection value for a point to the different clusters??? - protected double getConnectionValue(CMMPoint cmmp, int clusterID){ - AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); - AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); - - //calculate the knn distance of the point to the cluster - getKnnInCluster(cmmp, knnNeighbourhood, gt0Clusters.get(clusterID).points, knnDist, knnPointIndex); - - //TODO: What to do if we have less then k neighbors? - double avgDist = 0; - for (int i = 0; i < knnDist.size(); i++) { - avgDist+= knnDist.get(i); + // update workclass labels + for (int c = mergeID; c < gt0Clusters.size(); c++) { + gt0Clusters.get(c).workclass = c; } - //what to do if we only have a single point??? 
- if(knnDist.size()!=0) - avgDist/=knnDist.size(); - else - return 0; - - //get the upper knn distance of the cluster - double upperKnn = gt0Clusters.get(clusterID).knnMeanAvg + gt0Clusters.get(clusterID).knnDevAvg; - - /* calculate the connectivity based on knn distance of the point within the cluster - and the upper knn distance of the cluster*/ - if(avgDist < upperKnn){ - return 1; - } - else{ - //value that should be reached at upperKnn distance - //Choose connection formula - double conn; - if(useExpConnectivity) - conn = Math.pow(2,-lamdaConn*(avgDist-upperKnn)/upperKnn); - else - conn = upperKnn/avgDist; - if(Double.isNaN(conn)) - System.out.println("Connectivity NaN at "+cmmp.p.getTimestamp()); + // update knn distances + calculateKnn(); + for (int c = 0; c < gt0Clusters.size(); c++) { + gt0Clusters.get(c).connections.remove(mergeID); - return conn; + // recalculate connection from other clusters to the new merged one + gt0Clusters.get(c).calculateClusterConnection(workclass, false); + // and from new merged one to other clusters + gt0Clusters.get(workclass).calculateClusterConnection(c, false); } + } + else { + System.out.println("Merge indices are not valid"); + } } - - - /** - * @param cmmp point to calculate knn distance for - * @param k number of nearest neighbors to look for - * @param pointIDs list of point IDs to check - * @param knnDist sorted list of smallest knn distances (can already be filled to make updates possible) - * @param knnPointIndex list of corresponding knn indices - */ - private void getKnnInCluster(CMMPoint cmmp, int k, - ArrayList<Integer> pointIDs, - AutoExpandVector<Double> knnDist, - AutoExpandVector<Integer> knnPointIndex) { - - //iterate over every point in the choosen cluster, cal distance and insert into list - for (int p1 = 0; p1 < pointIDs.size(); p1++) { - int pid = pointIDs.get(p1); - if(cmmp.pID == pid) continue; - double dist = distance(cmmp,cmmpoints.get(pid)); - if(knnDist.size() < k || dist < knnDist.get(knnDist.size()-1)){ - int index = 0; - while(index < knnDist.size() && dist > knnDist.get(index)) { - index++; - } - knnDist.add(index, dist); - knnPointIndex.add(index,pid); - if(knnDist.size() > k){ - knnDist.remove(knnDist.size()-1); - knnPointIndex.remove(knnPointIndex.size()-1); - } - } - } + } + + /** + * @param trueClustering + * the ground truth clustering + * @param points + * data points + * @param enableClassMerge + * allow class merging (should be set to true on default) + */ + public CMM_GTAnalysis(Clustering trueClustering, ArrayList<DataPoint> points, boolean enableClassMerge) { + if (debug) + System.out.println("GT Analysis Debug Output"); + + noiseErrorByModel = 0; + pointErrorByModel = 0; + if (!enableClassMerge) { + tauConnection = 1.0; } - - - /** - * calculate initial connectivities - */ - private void calculateGTPointQualities(){ - for (int p = 0; p < numPoints; p++) { - CMMPoint cmdp = cmmpoints.get(p); - if(!cmdp.isNoise()){ - cmdp.connectivity = getConnectionValue(cmdp, cmdp.workclass()); - cmdp.p.setMeasureValue("Connectivity", cmdp.connectivity); - } - } + lamdaConn = -Math.log(lambdaConnRefXValue) / Math.log(2) / lambdaConnX; + + this.gtClustering = trueClustering; + + numPoints = points.size(); + numDims = points.get(0).numAttributes() - 1; + numGTClusters = gtClustering.size(); + + // init mappings between work and true labels + mapTrueLabelToWorkLabel = new HashMap<Integer, Integer>(); + + // set up base of new clustering + gt0Clusters = new ArrayList<GTCluster>(); + int numWorkClasses = 0; + // create label 
to worklabel mapping as real labels can be just a set of + // unordered integers + for (int i = 0; i < numGTClusters; i++) { + int label = (int) gtClustering.get(i).getGroundTruth(); + if (!mapTrueLabelToWorkLabel.containsKey(label)) { + gt0Clusters.add(new GTCluster(numWorkClasses, label, i)); + mapTrueLabelToWorkLabel.put(label, numWorkClasses); + numWorkClasses++; + } + else { + gt0Clusters.get(mapTrueLabelToWorkLabel.get(label)).clusterRepresentations.add(i); + } } + numGTClasses = numWorkClasses; - - - /** - * Calculate connections between clusters and merge clusters accordingly as - * long as connections exceed threshold - */ - private void calculateGTClusterConnections(){ - for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { - for (int c1 = 0; c1 < gt0Clusters.size(); c1++) { - gt0Clusters.get(c0).calculateClusterConnection(c1, true); - } - } - - boolean changedConnection = true; - while(changedConnection){ - if(debug){ - System.out.println("Cluster Connection"); - for (int c = 0; c < gt0Clusters.size(); c++) { - System.out.print("C"+gt0Clusters.get(c).label+" --> "); - for (int c1 = 0; c1 < gt0Clusters.get(c).connections.size(); c1++) { - System.out.print(" C"+gt0Clusters.get(c1).label+": "+gt0Clusters.get(c).connections.get(c1)); - } - System.out.println(""); - } - System.out.println(""); - } - - double max = 0; - int maxIndexI = -1; - int maxIndexJ = -1; - - changedConnection = false; - for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { - for (int c1 = c0+1; c1 < gt0Clusters.size(); c1++) { - if(c0==c1) continue; - double min =Math.min(gt0Clusters.get(c0).connections.get(c1), gt0Clusters.get(c1).connections.get(c0)); - if(min > max){ - max = min; - maxIndexI = c0; - maxIndexJ = c1; - } - } - } - if(maxIndexI!=-1 && max > tauConnection){ - gt0Clusters.get(maxIndexI).mergeCluster(maxIndexJ); - if(debug) - System.out.println("Merging "+maxIndexI+" and "+maxIndexJ+" because of connection "+max); - - changedConnection = true; - } - } - numGT0Classes = gt0Clusters.size(); + mergeMap = new int[numGTClasses]; + for (int i = 0; i < numGTClasses; i++) { + mergeMap[i] = i; } - - /** - * Calculates how well the original clusters are separable. 
- * Small values indicate bad separability, values close to 1 indicate good separability - * @return index of seperability - */ - public double getClassSeparability(){ -// int totalConn = numGTClasses*(numGTClasses-1)/2; -// int mergedConn = 0; -// for(GTCluster gt : gt0Clusters){ -// int merged = gt.clusterRepresentations.size(); -// if(merged > 1) -// mergedConn+=merged * (merged-1)/2; -// } -// if(totalConn == 0) -// return 0; -// else -// return 1-mergedConn/(double)totalConn; - return numGT0Classes/(double)numGTClasses; + // create cmd point wrapper instances + cmmpoints = new ArrayList<CMMPoint>(); + for (int p = 0; p < points.size(); p++) { + CMMPoint cmdp = new CMMPoint(points.get(p), p); + cmmpoints.add(cmdp); + } + // split points up into their GTClusters and Noise (according to class + // labels) + noise = new ArrayList<Integer>(); + for (int p = 0; p < numPoints; p++) { + if (cmmpoints.get(p).isNoise()) { + noise.add(p); + } + else { + gt0Clusters.get(cmmpoints.get(p).workclass()).points.add(p); + } } - - /** - * Calculates how well noise is separable from the given clusters - * Small values indicate bad separability, values close to 1 indicate good separability - * @return index of noise separability - */ - public double getNoiseSeparability(){ - if(noise.isEmpty()) - return 1; - - double connectivity = 0; - for(int p : noise){ - CMMPoint npoint = cmmpoints.get(p); - double maxConnection = 0; - - //TODO: some kind of pruning possible. what about weighting? - for (int c = 0; c < gt0Clusters.size(); c++) { - double connection = getConnectionValue(npoint, c); - if(connection > maxConnection) - maxConnection = connection; - } - connectivity+=maxConnection; - npoint.p.setMeasureValue("MaxConnection", maxConnection); - } + // calculate initial knnMean and knnDev + for (GTCluster gtc : gt0Clusters) { + gtc.calculateKnn(); + } - return 1-(connectivity / noise.size()); + // calculate cluster connections + calculateGTClusterConnections(); + + // calculate point connections with own clusters + calculateGTPointQualities(); + + if (debug) + System.out.println("GT Analysis Debug End"); + + } + + /** + * Calculate the connection of a point to a cluster + * + * @param cmmp + * the point to calculate the connection for + * @param clusterID + * the corresponding cluster + * @return the connection value + */ + // TODO: Cache the connection value for a point to the different clusters??? + protected double getConnectionValue(CMMPoint cmmp, int clusterID) { + AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); + AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); + + // calculate the knn distance of the point to the cluster + getKnnInCluster(cmmp, knnNeighbourhood, gt0Clusters.get(clusterID).points, knnDist, knnPointIndex); + + // TODO: What to do if we have less then k neighbors? + double avgDist = 0; + for (int i = 0; i < knnDist.size(); i++) { + avgDist += knnDist.get(i); } + // what to do if we only have a single point??? 
+ if (knnDist.size() != 0) + avgDist /= knnDist.size(); + else + return 0; - - /** - * Calculates the relative number of errors being caused by the underlying cluster model - * @return quality of the model + // get the upper knn distance of the cluster + double upperKnn = gt0Clusters.get(clusterID).knnMeanAvg + gt0Clusters.get(clusterID).knnDevAvg; + + /* + * calculate the connectivity based on knn distance of the point within the + * cluster and the upper knn distance of the cluster */ - public double getModelQuality(){ - for(int p = 0; p < numPoints; p++){ - CMMPoint cmdp = cmmpoints.get(p); - for(int hc = 0; hc < numGTClusters;hc++){ - if(gtClustering.get(hc).getGroundTruth() != cmdp.trueClass){ - if(gtClustering.get(hc).getInclusionProbability(cmdp) >= 1){ - if(!cmdp.isNoise()) - pointErrorByModel++; - else - noiseErrorByModel++; - break; - } - } - } + if (avgDist < upperKnn) { + return 1; + } + else { + // value that should be reached at upperKnn distance + // Choose connection formula + double conn; + if (useExpConnectivity) + conn = Math.pow(2, -lamdaConn * (avgDist - upperKnn) / upperKnn); + else + conn = upperKnn / avgDist; + + if (Double.isNaN(conn)) + System.out.println("Connectivity NaN at " + cmmp.p.getTimestamp()); + + return conn; + } + } + + /** + * @param cmmp + * point to calculate knn distance for + * @param k + * number of nearest neighbors to look for + * @param pointIDs + * list of point IDs to check + * @param knnDist + * sorted list of smallest knn distances (can already be filled to + * make updates possible) + * @param knnPointIndex + * list of corresponding knn indices + */ + private void getKnnInCluster(CMMPoint cmmp, int k, + ArrayList<Integer> pointIDs, + AutoExpandVector<Double> knnDist, + AutoExpandVector<Integer> knnPointIndex) { + + // iterate over every point in the choosen cluster, cal distance and insert + // into list + for (int p1 = 0; p1 < pointIDs.size(); p1++) { + int pid = pointIDs.get(p1); + if (cmmp.pID == pid) + continue; + double dist = distance(cmmp, cmmpoints.get(pid)); + if (knnDist.size() < k || dist < knnDist.get(knnDist.size() - 1)) { + int index = 0; + while (index < knnDist.size() && dist > knnDist.get(index)) { + index++; } - if(debug) - System.out.println("Error by model: noise "+noiseErrorByModel+" point "+pointErrorByModel); - - return 1-((pointErrorByModel + noiseErrorByModel)/(double) numPoints); + knnDist.add(index, dist); + knnPointIndex.add(index, pid); + if (knnDist.size() > k) { + knnDist.remove(knnDist.size() - 1); + knnPointIndex.remove(knnPointIndex.size() - 1); + } + } } - - - /** - * Get CMM internal point - * @param index of the point - * @return cmm point - */ - protected CMMPoint getPoint(int index){ - return cmmpoints.get(index); + } + + /** + * calculate initial connectivities + */ + private void calculateGTPointQualities() { + for (int p = 0; p < numPoints; p++) { + CMMPoint cmdp = cmmpoints.get(p); + if (!cmdp.isNoise()) { + cmdp.connectivity = getConnectionValue(cmdp, cmdp.workclass()); + cmdp.p.setMeasureValue("Connectivity", cmdp.connectivity); + } } - - - /** - * Return cluster - * @param index of the cluster to return - * @return cluster - */ - protected GTCluster getGT0Cluster(int index){ - return gt0Clusters.get(index); + } + + /** + * Calculate connections between clusters and merge clusters accordingly as + * long as connections exceed threshold + */ + private void calculateGTClusterConnections() { + for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { + for (int c1 = 0; c1 < gt0Clusters.size(); c1++) { + 
gt0Clusters.get(c0).calculateClusterConnection(c1, true); + } } - /** - * Number of classes/clusters of the new clustering - * @return number of new clusters - */ - protected int getNumberOfGT0Classes() { - return numGT0Classes; + boolean changedConnection = true; + while (changedConnection) { + if (debug) { + System.out.println("Cluster Connection"); + for (int c = 0; c < gt0Clusters.size(); c++) { + System.out.print("C" + gt0Clusters.get(c).label + " --> "); + for (int c1 = 0; c1 < gt0Clusters.get(c).connections.size(); c1++) { + System.out.print(" C" + gt0Clusters.get(c1).label + ": " + gt0Clusters.get(c).connections.get(c1)); + } + System.out.println(""); + } + System.out.println(""); + } + + double max = 0; + int maxIndexI = -1; + int maxIndexJ = -1; + + changedConnection = false; + for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { + for (int c1 = c0 + 1; c1 < gt0Clusters.size(); c1++) { + if (c0 == c1) + continue; + double min = Math.min(gt0Clusters.get(c0).connections.get(c1), gt0Clusters.get(c1).connections.get(c0)); + if (min > max) { + max = min; + maxIndexI = c0; + maxIndexJ = c1; + } + } + } + if (maxIndexI != -1 && max > tauConnection) { + gt0Clusters.get(maxIndexI).mergeCluster(maxIndexJ); + if (debug) + System.out.println("Merging " + maxIndexI + " and " + maxIndexJ + " because of connection " + max); + + changedConnection = true; + } } - - /** - * Calculates Euclidian distance - * @param inst1 point as double array - * @param inst2 point as double array - * @return euclidian distance - */ - private double distance(Instance inst1, Instance inst2){ - return distance(inst1, inst2.toDoubleArray()); - + numGT0Classes = gt0Clusters.size(); + } + + /** + * Calculates how well the original clusters are separable. Small values + * indicate bad separability, values close to 1 indicate good separability + * + * @return index of seperability + */ + public double getClassSeparability() { + // int totalConn = numGTClasses*(numGTClasses-1)/2; + // int mergedConn = 0; + // for(GTCluster gt : gt0Clusters){ + // int merged = gt.clusterRepresentations.size(); + // if(merged > 1) + // mergedConn+=merged * (merged-1)/2; + // } + // if(totalConn == 0) + // return 0; + // else + // return 1-mergedConn/(double)totalConn; + return numGT0Classes / (double) numGTClasses; + + } + + /** + * Calculates how well noise is separable from the given clusters Small values + * indicate bad separability, values close to 1 indicate good separability + * + * @return index of noise separability + */ + public double getNoiseSeparability() { + if (noise.isEmpty()) + return 1; + + double connectivity = 0; + for (int p : noise) { + CMMPoint npoint = cmmpoints.get(p); + double maxConnection = 0; + + // TODO: some kind of pruning possible. what about weighting? 
+ for (int c = 0; c < gt0Clusters.size(); c++) { + double connection = getConnectionValue(npoint, c); + if (connection > maxConnection) + maxConnection = connection; + } + connectivity += maxConnection; + npoint.p.setMeasureValue("MaxConnection", maxConnection); } - - /** - * Calculates Euclidian distance - * @param inst1 point as an instance - * @param inst2 point as double array - * @return euclidian distance - */ - private double distance(Instance inst1, double[] inst2){ - double distance = 0.0; - for (int i = 0; i < numDims; i++) { - double d = inst1.value(i) - inst2[i]; - distance += d * d; + + return 1 - (connectivity / noise.size()); + } + + /** + * Calculates the relative number of errors being caused by the underlying + * cluster model + * + * @return quality of the model + */ + public double getModelQuality() { + for (int p = 0; p < numPoints; p++) { + CMMPoint cmdp = cmmpoints.get(p); + for (int hc = 0; hc < numGTClusters; hc++) { + if (gtClustering.get(hc).getGroundTruth() != cmdp.trueClass) { + if (gtClustering.get(hc).getInclusionProbability(cmdp) >= 1) { + if (!cmdp.isNoise()) + pointErrorByModel++; + else + noiseErrorByModel++; + break; + } } - return Math.sqrt(distance); + } } - - /** - * String with main CMM parameters - * @return main CMM parameter - */ - public String getParameterString(){ - String para = ""; - para+="k="+knnNeighbourhood+";"; - if(useExpConnectivity){ - para+="lambdaConnX="+lambdaConnX+";"; - para+="lambdaConn="+lamdaConn+";"; - para+="lambdaConnRef="+lambdaConnRefXValue+";"; - } - para+="m="+clusterConnectionMaxPoints+";"; - para+="tauConn="+tauConnection+";"; + if (debug) + System.out.println("Error by model: noise " + noiseErrorByModel + " point " + pointErrorByModel); + + return 1 - ((pointErrorByModel + noiseErrorByModel) / (double) numPoints); + } + + /** + * Get CMM internal point + * + * @param index + * of the point + * @return cmm point + */ + protected CMMPoint getPoint(int index) { + return cmmpoints.get(index); + } + + /** + * Return cluster + * + * @param index + * of the cluster to return + * @return cluster + */ + protected GTCluster getGT0Cluster(int index) { + return gt0Clusters.get(index); + } + + /** + * Number of classes/clusters of the new clustering + * + * @return number of new clusters + */ + protected int getNumberOfGT0Classes() { + return numGT0Classes; + } + + /** + * Calculates Euclidian distance + * + * @param inst1 + * point as double array + * @param inst2 + * point as double array + * @return euclidian distance + */ + private double distance(Instance inst1, Instance inst2) { + return distance(inst1, inst2.toDoubleArray()); + + } + + /** + * Calculates Euclidian distance + * + * @param inst1 + * point as an instance + * @param inst2 + * point as double array + * @return euclidian distance + */ + private double distance(Instance inst1, double[] inst2) { + double distance = 0.0; + for (int i = 0; i < numDims; i++) { + double d = inst1.value(i) - inst2[i]; + distance += d * d; + } + return Math.sqrt(distance); + } + + /** + * String with main CMM parameters + * + * @return main CMM parameter + */ + public String getParameterString() { + String para = ""; + para += "k=" + knnNeighbourhood + ";"; + if (useExpConnectivity) { + para += "lambdaConnX=" + lambdaConnX + ";"; + para += "lambdaConn=" + lamdaConn + ";"; + para += "lambdaConnRef=" + lambdaConnRefXValue + ";"; + } + para += "m=" + clusterConnectionMaxPoints + ";"; + para += "tauConn=" + tauConnection + ";"; - return para; - } + return para; + } } - -
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java index 0d311e4..1a44542 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/EntropyCollection.java @@ -30,145 +30,146 @@ import com.yahoo.labs.samoa.moa.core.DataPoint; import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix; -public class EntropyCollection extends MeasureCollection{ - - private static final Logger logger = LoggerFactory.getLogger(EntropyCollection.class); - - @Override - protected String[] getNames() { - return new String[]{"GT cross entropy","FC cross entropy","Homogeneity","Completeness","V-Measure","VarInformation"}; - } - - @Override - protected boolean[] getDefaultEnabled() { - return new boolean[]{false, false, false, false, false, false}; +public class EntropyCollection extends MeasureCollection { + + private static final Logger logger = LoggerFactory.getLogger(EntropyCollection.class); + + @Override + protected String[] getNames() { + return new String[] { "GT cross entropy", "FC cross entropy", "Homogeneity", "Completeness", "V-Measure", + "VarInformation" }; + } + + @Override + protected boolean[] getDefaultEnabled() { + return new boolean[] { false, false, false, false, false, false }; + } + + @Override + public void evaluateClustering(Clustering fclustering, Clustering hClustering, ArrayList<DataPoint> points) + throws Exception { + + MembershipMatrix mm = new MembershipMatrix(fclustering, points); + int numClasses = mm.getNumClasses(); + int numCluster = fclustering.size() + 1; + int n = mm.getTotalEntries(); + + double FCentropy = 0; + if (numCluster > 1) { + for (int fc = 0; fc < numCluster; fc++) { + double weight = mm.getClusterSum(fc) / (double) n; + if (weight > 0) + FCentropy += weight * Math.log10(weight); + } + FCentropy /= (-1 * Math.log10(numCluster)); } - @Override - public void evaluateClustering(Clustering fclustering, Clustering hClustering, ArrayList<DataPoint> points) throws Exception { + logger.debug("FC entropy: {}", FCentropy); - MembershipMatrix mm = new MembershipMatrix(fclustering, points); - int numClasses = mm.getNumClasses(); - int numCluster = fclustering.size()+1; - int n = mm.getTotalEntries(); - - - double FCentropy = 0; - if(numCluster > 1){ - for (int fc = 0; fc < numCluster; fc++){ - double weight = mm.getClusterSum(fc)/(double)n; - if(weight > 0) - FCentropy+= weight * Math.log10(weight); - } - FCentropy/=(-1*Math.log10(numCluster)); - } + double GTentropy = 0; + if (numClasses > 1) { + for (int hc = 0; hc < numClasses; hc++) { + double weight = mm.getClassSum(hc) / (double) n; + if (weight > 0) + GTentropy += weight * Math.log10(weight); + } + GTentropy /= (-1 * Math.log10(numClasses)); + } - logger.debug("FC entropy: {}", FCentropy); + logger.debug("GT entropy: {}", GTentropy); - double GTentropy = 0; - if(numClasses > 1){ - for (int hc = 0; hc < numClasses; hc++){ - double weight = mm.getClassSum(hc)/(double)n; - if(weight > 0) - GTentropy+= weight * Math.log10(weight); - } - GTentropy/=(-1*Math.log10(numClasses)); - } + // 
cluster based entropy + double FCcrossEntropy = 0; - logger.debug("GT entropy: {}", GTentropy); - - //cluster based entropy - double FCcrossEntropy = 0; - - for (int fc = 0; fc < numCluster; fc++){ - double e = 0; - int clusterWeight = mm.getClusterSum(fc); - if(clusterWeight>0){ - for (int hc = 0; hc < numClasses; hc++) { - double p = mm.getClusterClassWeight(fc, hc)/(double)clusterWeight; - if(p!=0){ - e+=p * Math.log10(p); - } - } - FCcrossEntropy+=((clusterWeight/(double)n) * e); - } - } - if(numCluster > 1){ - FCcrossEntropy/=-1*Math.log10(numCluster); + for (int fc = 0; fc < numCluster; fc++) { + double e = 0; + int clusterWeight = mm.getClusterSum(fc); + if (clusterWeight > 0) { + for (int hc = 0; hc < numClasses; hc++) { + double p = mm.getClusterClassWeight(fc, hc) / (double) clusterWeight; + if (p != 0) { + e += p * Math.log10(p); + } } + FCcrossEntropy += ((clusterWeight / (double) n) * e); + } + } + if (numCluster > 1) { + FCcrossEntropy /= -1 * Math.log10(numCluster); + } - addValue("FC cross entropy", 1-FCcrossEntropy); - logger.debug("FC cross entropy: {}", 1 - FCcrossEntropy); - - //class based entropy - double GTcrossEntropy = 0; - for (int hc = 0; hc < numClasses; hc++){ - double e = 0; - int classWeight = mm.getClassSum(hc); - if(classWeight>0){ - for (int fc = 0; fc < numCluster; fc++) { - double p = mm.getClusterClassWeight(fc, hc)/(double)classWeight; - if(p!=0){ - e+=p * Math.log10(p); - } - } - } - GTcrossEntropy+=((classWeight/(double)n) * e); + addValue("FC cross entropy", 1 - FCcrossEntropy); + logger.debug("FC cross entropy: {}", 1 - FCcrossEntropy); + + // class based entropy + double GTcrossEntropy = 0; + for (int hc = 0; hc < numClasses; hc++) { + double e = 0; + int classWeight = mm.getClassSum(hc); + if (classWeight > 0) { + for (int fc = 0; fc < numCluster; fc++) { + double p = mm.getClusterClassWeight(fc, hc) / (double) classWeight; + if (p != 0) { + e += p * Math.log10(p); + } } - if(numClasses > 1) - GTcrossEntropy/=-1*Math.log10(numClasses); - addValue("GT cross entropy", 1-GTcrossEntropy); - logger.debug("GT cross entropy: {}", 1 - GTcrossEntropy); - - double homogeneity; - if(FCentropy == 0) - homogeneity = 1; - else - homogeneity = 1 - FCcrossEntropy/FCentropy; - - //TODO set err values for now, needs to be debugged - if(homogeneity > 1 || homogeneity < 0) - addValue("Homogeneity",-1); - else - addValue("Homogeneity",homogeneity); - - double completeness; - if(GTentropy == 0) - completeness = 1; - else - completeness = 1 - GTcrossEntropy/GTentropy; - addValue("Completeness",completeness); - - double beta = 1; - double vmeasure = (1+ beta)*homogeneity*completeness/(beta *homogeneity+completeness); - - if(vmeasure > 1 || homogeneity < 0) - addValue("V-Measure",-1); - else - addValue("V-Measure",vmeasure); - - - - double mutual = 0; - for (int i = 0; i < numCluster; i++){ - for (int j = 0; j < numClasses; j++) { - if(mm.getClusterClassWeight(i, j)==0) continue; - double m = Math.log10(mm.getClusterClassWeight(i, j)/(double)mm.getClusterSum(i)/(double)mm.getClassSum(j)*(double)n); - m*= mm.getClusterClassWeight(i, j)/(double)n; - logger.debug("( {} / {}): ",m, m); - mutual+=m; - } - } - if(numClasses > 1) - mutual/=Math.log10(numClasses); + } + GTcrossEntropy += ((classWeight / (double) n) * e); + } + if (numClasses > 1) + GTcrossEntropy /= -1 * Math.log10(numClasses); + addValue("GT cross entropy", 1 - GTcrossEntropy); + logger.debug("GT cross entropy: {}", 1 - GTcrossEntropy); + + double homogeneity; + if (FCentropy == 0) + homogeneity = 1; + else + 
homogeneity = 1 - FCcrossEntropy / FCentropy; + + // TODO set err values for now, needs to be debugged + if (homogeneity > 1 || homogeneity < 0) + addValue("Homogeneity", -1); + else + addValue("Homogeneity", homogeneity); + + double completeness; + if (GTentropy == 0) + completeness = 1; + else + completeness = 1 - GTcrossEntropy / GTentropy; + addValue("Completeness", completeness); + + double beta = 1; + double vmeasure = (1 + beta) * homogeneity * completeness / (beta * homogeneity + completeness); + + if (vmeasure > 1 || homogeneity < 0) + addValue("V-Measure", -1); + else + addValue("V-Measure", vmeasure); + + double mutual = 0; + for (int i = 0; i < numCluster; i++) { + for (int j = 0; j < numClasses; j++) { + if (mm.getClusterClassWeight(i, j) == 0) + continue; + double m = Math.log10(mm.getClusterClassWeight(i, j) / (double) mm.getClusterSum(i) + / (double) mm.getClassSum(j) * (double) n); + m *= mm.getClusterClassWeight(i, j) / (double) n; + logger.debug("( {} / {}): ", m, m); + mutual += m; + } + } + if (numClasses > 1) + mutual /= Math.log10(numClasses); - double varInfo = 1; - if(FCentropy + GTentropy > 0) - varInfo = 2*mutual/(FCentropy + GTentropy); + double varInfo = 1; + if (FCentropy + GTentropy > 0) + varInfo = 2 * mutual / (FCentropy + GTentropy); - logger.debug("mutual: {} / VI: {}", mutual, varInfo); - addValue("VarInformation", varInfo); + logger.debug("mutual: {} / VI: {}", mutual, varInfo); + addValue("VarInformation", varInfo); - } + } } http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java index 6533f36..f62b6bb 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/F1.java @@ -26,90 +26,85 @@ import com.yahoo.labs.samoa.moa.evaluation.MembershipMatrix; import com.yahoo.labs.samoa.moa.core.DataPoint; import java.util.ArrayList; +public class F1 extends MeasureCollection { -public class F1 extends MeasureCollection{ + @Override + protected String[] getNames() { + return new String[] { "F1-P", "F1-R", "Purity" }; + } - @Override - protected String[] getNames() { - return new String[]{"F1-P","F1-R","Purity"}; - } - - public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) { - - if (clustering.size()<0){ - addValue(0,0); - addValue(1,0); - return; - } - - MembershipMatrix mm = new MembershipMatrix(clustering, points); - //System.out.println(mm.toString()); - - int numClasses = mm.getNumClasses(); - if(mm.hasNoiseClass()) - numClasses--; - - - - //F1 as defined in P3C, try using F1 optimization - double F1_P = 0.0; - double purity = 0; - int realClusters = 0; - for (int i = 0; i < clustering.size(); i++) { - int max_weight = 0; - int max_weight_index = -1; + public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) { - //find max index - for (int j = 0; j < numClasses; j++) { - if(mm.getClusterClassWeight(i, j) > max_weight){ - max_weight = mm.getClusterClassWeight(i, j); - max_weight_index = j; - } - } - if(max_weight_index!=-1){ - realClusters++; - double precision = mm.getClusterClassWeight(i, max_weight_index)/(double)mm.getClusterSum(i); - 
double recall = mm.getClusterClassWeight(i, max_weight_index)/(double) mm.getClassSum(max_weight_index); - double f1 = 0; - if(precision > 0 || recall > 0){ - f1 = 2*precision*recall/(precision+recall); - } - F1_P += f1; - purity += precision; + if (clustering.size() < 0) { + addValue(0, 0); + addValue(1, 0); + return; + } - //TODO should we move setMeasure stuff into the Cluster interface? - clustering.get(i).setMeasureValue("F1-P", Double.toString(f1)); - } + MembershipMatrix mm = new MembershipMatrix(clustering, points); + // System.out.println(mm.toString()); + + int numClasses = mm.getNumClasses(); + if (mm.hasNoiseClass()) + numClasses--; + + // F1 as defined in P3C, try using F1 optimization + double F1_P = 0.0; + double purity = 0; + int realClusters = 0; + for (int i = 0; i < clustering.size(); i++) { + int max_weight = 0; + int max_weight_index = -1; + + // find max index + for (int j = 0; j < numClasses; j++) { + if (mm.getClusterClassWeight(i, j) > max_weight) { + max_weight = mm.getClusterClassWeight(i, j); + max_weight_index = j; } - if(realClusters > 0){ - F1_P/=realClusters; - purity/=realClusters; + } + if (max_weight_index != -1) { + realClusters++; + double precision = mm.getClusterClassWeight(i, max_weight_index) / (double) mm.getClusterSum(i); + double recall = mm.getClusterClassWeight(i, max_weight_index) / (double) mm.getClassSum(max_weight_index); + double f1 = 0; + if (precision > 0 || recall > 0) { + f1 = 2 * precision * recall / (precision + recall); } - addValue("F1-P",F1_P); - addValue("Purity",purity); - + F1_P += f1; + purity += precision; - - //F1 as defined in .... mainly maximizes F1 for each class - double F1_R = 0.0; - for (int j = 0; j < numClasses; j++) { - double max_f1 = 0; - for (int i = 0; i < clustering.size(); i++) { - double precision = mm.getClusterClassWeight(i, j)/(double)mm.getClusterSum(i); - double recall = mm.getClusterClassWeight(i, j)/(double)mm.getClassSum(j); - double f1 = 0; - if(precision > 0 || recall > 0){ - f1 = 2*precision*recall/(precision+recall); - } - if(max_f1 < f1){ - max_f1 = f1; - } - } - F1_R+= max_f1; + // TODO should we move setMeasure stuff into the Cluster interface? + clustering.get(i).setMeasureValue("F1-P", Double.toString(f1)); + } + } + if (realClusters > 0) { + F1_P /= realClusters; + purity /= realClusters; + } + addValue("F1-P", F1_P); + addValue("Purity", purity); + + // F1 as defined in .... 
mainly maximizes F1 for each class + double F1_R = 0.0; + for (int j = 0; j < numClasses; j++) { + double max_f1 = 0; + for (int i = 0; i < clustering.size(); i++) { + double precision = mm.getClusterClassWeight(i, j) / (double) mm.getClusterSum(i); + double recall = mm.getClusterClassWeight(i, j) / (double) mm.getClassSum(j); + double f1 = 0; + if (precision > 0 || recall > 0) { + f1 = 2 * precision * recall / (precision + recall); } - F1_R/=numClasses; - - addValue("F1-R",F1_R); + if (max_f1 < f1) { + max_f1 = f1; + } + } + F1_R += max_f1; } + F1_R /= numClasses; + + addValue("F1-R", F1_R); + } } http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java index 7f23c1b..287af06 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/General.java @@ -20,7 +20,6 @@ package com.yahoo.labs.samoa.evaluation.measures; * #L% */ - import com.yahoo.labs.samoa.instances.Instance; import com.yahoo.labs.samoa.moa.cluster.Clustering; import com.yahoo.labs.samoa.moa.cluster.SphereCluster; @@ -28,164 +27,166 @@ import com.yahoo.labs.samoa.moa.evaluation.MeasureCollection; import com.yahoo.labs.samoa.moa.core.DataPoint; import java.util.ArrayList; - -public class General extends MeasureCollection{ - private int numPoints; - private int numFClusters; - private int numDims; - private double pointInclusionProbThreshold = 0.8; - private Clustering clustering; - private ArrayList<DataPoint> points; - - - public General() { - super(); +public class General extends MeasureCollection { + private int numPoints; + private int numFClusters; + private int numDims; + private double pointInclusionProbThreshold = 0.8; + private Clustering clustering; + private ArrayList<DataPoint> points; + + public General() { + super(); + } + + @Override + protected String[] getNames() { + // String[] names = + // {"GPrecision","GRecall","Redundancy","Overlap","numCluster","numClasses","Compactness"}; + return new String[] { "GPrecision", "GRecall", "Redundancy", "numCluster", "numClasses" }; + } + + // @Override + // protected boolean[] getDefaultEnabled() { + // boolean [] defaults = {false, false, false, false, false ,false}; + // return defaults; + // } + + @Override + public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) + throws Exception { + + this.points = points; + this.clustering = clustering; + numPoints = points.size(); + numFClusters = clustering.size(); + numDims = points.get(0).numAttributes() - 1; + + int totalRedundancy = 0; + int trueCoverage = 0; + int totalCoverage = 0; + + int numNoise = 0; + for (int p = 0; p < numPoints; p++) { + int coverage = 0; + for (int c = 0; c < numFClusters; c++) { + // contained in cluster c? + if (clustering.get(c).getInclusionProbability(points.get(p)) >= pointInclusionProbThreshold) { + coverage++; + } + } + + if (points.get(p).classValue() == -1) { + numNoise++; + } + else { + if (coverage > 0) + trueCoverage++; + } + + if (coverage > 0) + totalCoverage++; // points covered by clustering (incl. 
noise) + if (coverage > 1) + totalRedundancy++; // include noise } - - @Override - protected String[] getNames() { - //String[] names = {"GPrecision","GRecall","Redundancy","Overlap","numCluster","numClasses","Compactness"}; - return new String[]{"GPrecision","GRecall","Redundancy","numCluster","numClasses"}; + addValue("numCluster", clustering.size()); + addValue("numClasses", trueClustering.size()); + addValue("Redundancy", ((double) totalRedundancy / (double) numPoints)); + addValue("GPrecision", (totalCoverage == 0 ? 0 : ((double) trueCoverage / (double) (totalCoverage)))); + addValue("GRecall", ((double) trueCoverage / (double) (numPoints - numNoise))); + // if(isEnabled(3)){ + // addValue("Compactness", computeCompactness()); + // } + // if(isEnabled(3)){ + // addValue("Overlap", computeOverlap()); + // } + } + + private double computeOverlap() { + for (int c = 0; c < numFClusters; c++) { + if (!(clustering.get(c) instanceof SphereCluster)) { + System.out.println("Overlap only supports Sphere Cluster. Found: " + clustering.get(c).getClass()); + return Double.NaN; + } } -// @Override -// protected boolean[] getDefaultEnabled() { -// boolean [] defaults = {false, false, false, false, false ,false}; -// return defaults; -// } - - @Override - public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception{ - - this.points = points; - this.clustering = clustering; - numPoints = points.size(); - numFClusters = clustering.size(); - numDims = points.get(0).numAttributes()-1; - - - int totalRedundancy = 0; - int trueCoverage = 0; - int totalCoverage = 0; - - int numNoise = 0; - for (int p = 0; p < numPoints; p++) { - int coverage = 0; - for (int c = 0; c < numFClusters; c++) { - //contained in cluster c? - if(clustering.get(c).getInclusionProbability(points.get(p)) >= pointInclusionProbThreshold){ - coverage++; - } - } - - if(points.get(p).classValue()==-1){ - numNoise++; - } - else{ - if(coverage>0) trueCoverage++; - } - - if(coverage>0) totalCoverage++; //points covered by clustering (incl. noise) - if(coverage>1) totalRedundancy++; //include noise + boolean[] overlap = new boolean[numFClusters]; + + for (int c0 = 0; c0 < numFClusters; c0++) { + if (overlap[c0]) + continue; + SphereCluster s0 = (SphereCluster) clustering.get(c0); + for (int c1 = c0; c1 < clustering.size(); c1++) { + if (c1 == c0) + continue; + SphereCluster s1 = (SphereCluster) clustering.get(c1); + if (s0.overlapRadiusDegree(s1) > 0) { + overlap[c0] = overlap[c1] = true; } - - addValue("numCluster", clustering.size()); - addValue("numClasses", trueClustering.size()); - addValue("Redundancy", ((double)totalRedundancy/(double)numPoints)); - addValue("GPrecision", (totalCoverage==0?0:((double)trueCoverage/(double)(totalCoverage)))); - addValue("GRecall", ((double)trueCoverage/(double)(numPoints-numNoise))); -// if(isEnabled(3)){ -// addValue("Compactness", computeCompactness()); -// } -// if(isEnabled(3)){ -// addValue("Overlap", computeOverlap()); -// } + } } - private double computeOverlap(){ - for (int c = 0; c < numFClusters; c++) { - if(!(clustering.get(c) instanceof SphereCluster)){ - System.out.println("Overlap only supports Sphere Cluster. 
Found: "+clustering.get(c).getClass()); - return Double.NaN; - } - } - - boolean[] overlap = new boolean[numFClusters]; - - for (int c0 = 0; c0 < numFClusters; c0++) { - if(overlap[c0]) continue; - SphereCluster s0 = (SphereCluster)clustering.get(c0); - for (int c1 = c0; c1 < clustering.size(); c1++) { - if(c1 == c0) continue; - SphereCluster s1 = (SphereCluster)clustering.get(c1); - if(s0.overlapRadiusDegree(s1) > 0){ - overlap[c0] = overlap[c1] = true; - } - } - } - - double totalOverlap = 0; - for (int c0 = 0; c0 < numFClusters; c0++) { - if(overlap[c0]) - totalOverlap++; - } - -// if(totalOverlap/(double)numFClusters > .8) RunVisualizer.pause(); - if(numFClusters>0) totalOverlap/=(double)numFClusters; - return totalOverlap; + double totalOverlap = 0; + for (int c0 = 0; c0 < numFClusters; c0++) { + if (overlap[c0]) + totalOverlap++; } + // if(totalOverlap/(double)numFClusters > .8) RunVisualizer.pause(); + if (numFClusters > 0) + totalOverlap /= (double) numFClusters; + return totalOverlap; + } + + private double computeCompactness() { + if (numFClusters == 0) + return 0; + for (int c = 0; c < numFClusters; c++) { + if (!(clustering.get(c) instanceof SphereCluster)) { + System.out.println("Compactness only supports Sphere Cluster. Found: " + clustering.get(c).getClass()); + return Double.NaN; + } + } - private double computeCompactness(){ - if(numFClusters == 0) return 0; - for (int c = 0; c < numFClusters; c++) { - if(!(clustering.get(c) instanceof SphereCluster)){ - System.out.println("Compactness only supports Sphere Cluster. Found: "+clustering.get(c).getClass()); - return Double.NaN; - } + // TODO weight radius by number of dimensions + double totalCompactness = 0; + for (int c = 0; c < numFClusters; c++) { + ArrayList<Instance> containedPoints = new ArrayList<Instance>(); + for (int p = 0; p < numPoints; p++) { + // p in c + if (clustering.get(c).getInclusionProbability(points.get(p)) >= pointInclusionProbThreshold) { + containedPoints.add(points.get(p)); } - - //TODO weight radius by number of dimensions - double totalCompactness = 0; - for (int c = 0; c < numFClusters; c++) { - ArrayList<Instance> containedPoints = new ArrayList<Instance>(); - for (int p = 0; p < numPoints; p++) { - //p in c - if(clustering.get(c).getInclusionProbability(points.get(p)) >= pointInclusionProbThreshold){ - containedPoints.add(points.get(p)); - } - } - double compactness = 0; - if(containedPoints.size()>1){ - //cluster not empty - SphereCluster minEnclosingCluster = new SphereCluster(containedPoints, numDims); - double minRadius = minEnclosingCluster.getRadius(); - double cfRadius = ((SphereCluster)clustering.get(c)).getRadius(); - if(Math.abs(minRadius-cfRadius) < 0.1e-10){ - compactness = 1; - } - else - if(minRadius < cfRadius) - compactness = minRadius/cfRadius; - else{ - System.out.println("Optimal radius bigger then real one ("+(cfRadius-minRadius)+"), this is really wrong"); - compactness = 1; - } - } - else{ - double cfRadius = ((SphereCluster)clustering.get(c)).getRadius(); - if(cfRadius==0) compactness = 1; - } - - //weight by weight of cluster??? 
- totalCompactness+=compactness; - clustering.get(c).setMeasureValue("Compactness", Double.toString(compactness)); + } + double compactness = 0; + if (containedPoints.size() > 1) { + // cluster not empty + SphereCluster minEnclosingCluster = new SphereCluster(containedPoints, numDims); + double minRadius = minEnclosingCluster.getRadius(); + double cfRadius = ((SphereCluster) clustering.get(c)).getRadius(); + if (Math.abs(minRadius - cfRadius) < 0.1e-10) { + compactness = 1; } - return (totalCompactness/numFClusters); + else if (minRadius < cfRadius) + compactness = minRadius / cfRadius; + else { + System.out.println("Optimal radius bigger then real one (" + (cfRadius - minRadius) + + "), this is really wrong"); + compactness = 1; + } + } + else { + double cfRadius = ((SphereCluster) clustering.get(c)).getRadius(); + if (cfRadius == 0) + compactness = 1; + } + + // weight by weight of cluster??? + totalCompactness += compactness; + clustering.get(c).setMeasureValue("Compactness", Double.toString(compactness)); } - + return (totalCompactness / numFClusters); + } } - - http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java index 4f57788..ac25888 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/evaluation/measures/SSQ.java @@ -28,69 +28,70 @@ import com.yahoo.labs.samoa.instances.Instance; public class SSQ extends MeasureCollection { - public SSQ() { - super(); - } - - @Override - public String[] getNames() { - return new String[]{"SSQ"}; - } + public SSQ() { + super(); + } - @Override - protected boolean[] getDefaultEnabled() { - return new boolean[]{false}; - } + @Override + public String[] getNames() { + return new String[] { "SSQ" }; + } - // TODO Work on this later - //@Override - public void evaluateClusteringSamoa(Clustering clustering, - Clustering trueClsutering, ArrayList<Instance> points) { - double sum = 0.0; - for (Instance point : points) { - // don't include noise - if (point.classValue() == -1) { - continue; - } + @Override + protected boolean[] getDefaultEnabled() { + return new boolean[] { false }; + } - double minDistance = Double.MAX_VALUE; - for (int c = 0; c < clustering.size(); c++) { - double distance = 0.0; - double[] center = clustering.get(c).getCenter(); - for (int i = 0; i < center.length; i++) { - double d = point.value(i) - center[i]; - distance += d * d; - } - minDistance = Math.min(distance, minDistance); - } + // TODO Work on this later + // @Override + public void evaluateClusteringSamoa(Clustering clustering, + Clustering trueClsutering, ArrayList<Instance> points) { + double sum = 0.0; + for (Instance point : points) { + // don't include noise + if (point.classValue() == -1) { + continue; + } - sum += minDistance; + double minDistance = Double.MAX_VALUE; + for (int c = 0; c < clustering.size(); c++) { + double distance = 0.0; + double[] center = clustering.get(c).getCenter(); + for (int i = 0; i < center.length; i++) { + double d = point.value(i) - center[i]; + distance += d * d; } + minDistance = Math.min(distance, minDistance); + } - addValue(0, sum); + sum += minDistance; } - @Override - public void evaluateClustering(Clustering clustering, Clustering 
trueClsutering, ArrayList<DataPoint> points) { - double sum = 0.0; - for (int p = 0; p < points.size(); p++) { - //don't include noise - if(points.get(p).classValue()==-1) continue; + addValue(0, sum); + } - double minDistance = Double.MAX_VALUE; - for (int c = 0; c < clustering.size(); c++) { - double distance = 0.0; - double[] center = clustering.get(c).getCenter(); - for (int i = 0; i < center.length; i++) { - double d = points.get(p).value(i) - center[i]; - distance += d * d; - } - minDistance = Math.min(distance, minDistance); - } - - sum+=minDistance; + @Override + public void evaluateClustering(Clustering clustering, Clustering trueClsutering, ArrayList<DataPoint> points) { + double sum = 0.0; + for (int p = 0; p < points.size(); p++) { + // don't include noise + if (points.get(p).classValue() == -1) + continue; + + double minDistance = Double.MAX_VALUE; + for (int c = 0; c < clustering.size(); c++) { + double distance = 0.0; + double[] center = clustering.get(c).getCenter(); + for (int i = 0; i < center.length; i++) { + double d = points.get(p).value(i) - center[i]; + distance += d * d; } - - addValue(0,sum); + minDistance = Math.min(distance, minDistance); + } + + sum += minDistance; } + + addValue(0, sum); + } }
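For readers scanning the reformatted hunks above, the quantities these measure classes report can be restated compactly. This is only a summary of what the existing code in CMM_GTAnalysis.java and F1.java computes, written in terms of the membership-matrix weights w_ij (mm.getClusterClassWeight(i, j)), clusterSum_i, classSum_j and the total point count n:

    Homogeneity    = 1 - FCcrossEntropy / FCentropy          (reported as -1 when it falls outside [0, 1])
    Completeness   = 1 - GTcrossEntropy / GTentropy          (1 when GTentropy == 0)
    V-Measure      = (1 + beta) * Homogeneity * Completeness / (beta * Homogeneity + Completeness),  beta = 1
    mutual         = sum_{i,j} (w_ij / n) * log10( w_ij * n / (clusterSum_i * classSum_j) ),
                     divided by log10(numClasses) when numClasses > 1
    VarInformation = 2 * mutual / (FCentropy + GTentropy)    (1 when both entropies are 0)

    precision_i = w_{i,jmax} / clusterSum_i,  recall_i = w_{i,jmax} / classSum_jmax,
                  where jmax is the heaviest class inside cluster i
    F1-P   = average of 2 * precision_i * recall_i / (precision_i + recall_i) over clusters with a non-empty jmax
    Purity = average of precision_i over the same clusters
    F1-R   = average over classes of the best per-cluster F1 achieved for that class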
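The SSQ loop at the end of the diff can also be exercised outside SAMOA. The sketch below mirrors SSQ.evaluateClustering: for every non-noise point, take the squared Euclidean distance to the nearest cluster center and sum these minima. Plain double[] arrays stand in for DataPoint and Clustering here, and the names SSQSketch/ssq are illustrative only, not part of the SAMOA API.

import java.util.Arrays;

public final class SSQSketch {

  // points[p]     = attribute values of point p
  // classLabel[p] = class of point p, -1 marks noise (excluded, as in the patched code)
  // centers[c]    = center of cluster c
  static double ssq(double[][] points, int[] classLabel, double[][] centers) {
    double sum = 0.0;
    for (int p = 0; p < points.length; p++) {
      if (classLabel[p] == -1) {
        continue; // skip noise points
      }
      double minDistance = Double.MAX_VALUE;
      for (double[] center : centers) {
        double distance = 0.0;
        for (int i = 0; i < center.length; i++) {
          double d = points[p][i] - center[i];
          distance += d * d; // squared Euclidean distance to this center
        }
        minDistance = Math.min(distance, minDistance);
      }
      sum += minDistance; // distance to the closest center only
    }
    return sum;
  }

  public static void main(String[] args) {
    double[][] pts = { { 0.0, 0.0 }, { 1.0, 1.0 }, { 5.0, 5.0 } };
    int[] labels = { 0, 0, -1 };                 // third point is noise and is ignored
    double[][] centers = { { 0.5, 0.5 } };
    System.out.println("points:  " + Arrays.deepToString(pts));
    System.out.println("SSQ = " + ssq(pts, labels, centers)); // 0.5 + 0.5 = 1.0
  }
}

Note that, like the SAMOA class, this sketch identifies noise purely by the class label (classValue() == -1 in the original), so the measure is independent of whether the clustering itself marks noise.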