http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Clustering.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Clustering.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Clustering.java index 5f76ba5..b7c9862 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Clustering.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Clustering.java @@ -29,248 +29,242 @@ import com.yahoo.labs.samoa.moa.core.DataPoint; import com.yahoo.labs.samoa.instances.Attribute; import com.yahoo.labs.samoa.instances.Instance; -public class Clustering extends AbstractMOAObject{ +public class Clustering extends AbstractMOAObject { - private AutoExpandVector<Cluster> clusters; + private AutoExpandVector<Cluster> clusters; - public Clustering() { - this.clusters = new AutoExpandVector<Cluster>(); - } + public Clustering() { + this.clusters = new AutoExpandVector<Cluster>(); + } - public Clustering( Cluster[] clusters ) { - this.clusters = new AutoExpandVector<Cluster>(); - for (int i = 0; i < clusters.length; i++) { - this.clusters.add(clusters[i]); - } + public Clustering(Cluster[] clusters) { + this.clusters = new AutoExpandVector<Cluster>(); + for (int i = 0; i < clusters.length; i++) { + this.clusters.add(clusters[i]); } + } - public Clustering(List<? extends Instance> points){ - HashMap<Integer, Integer> labelMap = classValues(points); - int dim = points.get(0).dataset().numAttributes()-1; + public Clustering(List<? extends Instance> points) { + HashMap<Integer, Integer> labelMap = classValues(points); + int dim = points.get(0).dataset().numAttributes() - 1; - int numClasses = labelMap.size(); - int noiseLabel; - - Attribute classLabel = points.get(0).dataset().classAttribute(); - int lastLabelIndex = classLabel.numValues() - 1; - if (classLabel.value(lastLabelIndex) == "noise") { - noiseLabel = lastLabelIndex; - } else { - noiseLabel = -1; - } + int numClasses = labelMap.size(); + int noiseLabel; - ArrayList<Instance>[] sorted_points = (ArrayList<Instance>[]) new ArrayList[numClasses]; - for (int i = 0; i < numClasses; i++) { - sorted_points[i] = new ArrayList<Instance>(); - } - for (Instance point : points) { - int clusterid = (int)point.classValue(); - if(clusterid == noiseLabel) continue; - sorted_points[labelMap.get(clusterid)].add((Instance)point); - } - this.clusters = new AutoExpandVector<Cluster>(); - for (int i = 0; i < numClasses; i++) { - if(sorted_points[i].size()>0){ - SphereCluster s = new SphereCluster(sorted_points[i],dim); - s.setId(sorted_points[i].get(0).classValue()); - s.setGroundTruth(sorted_points[i].get(0).classValue()); - clusters.add(s); - } - } + Attribute classLabel = points.get(0).dataset().classAttribute(); + int lastLabelIndex = classLabel.numValues() - 1; + if (classLabel.value(lastLabelIndex) == "noise") { + noiseLabel = lastLabelIndex; + } else { + noiseLabel = -1; } - public Clustering(ArrayList<DataPoint> points, double overlapThreshold, int initMinPoints){ - HashMap<Integer, Integer> labelMap = Clustering.classValues(points); - int dim = points.get(0).dataset().numAttributes()-1; + ArrayList<Instance>[] sorted_points = (ArrayList<Instance>[]) new ArrayList[numClasses]; + for (int i = 0; i < numClasses; i++) { + sorted_points[i] = new ArrayList<Instance>(); + } + for (Instance point : points) { + int clusterid = (int) point.classValue(); + if (clusterid == noiseLabel) + continue; + sorted_points[labelMap.get(clusterid)].add((Instance) point); + } + this.clusters = new AutoExpandVector<Cluster>(); + for (int i = 0; i < numClasses; i++) { + if (sorted_points[i].size() > 0) { + SphereCluster s = new SphereCluster(sorted_points[i], dim); + s.setId(sorted_points[i].get(0).classValue()); + s.setGroundTruth(sorted_points[i].get(0).classValue()); + clusters.add(s); + } + } + } - int numClasses = labelMap.size(); - int num = 0; + public Clustering(ArrayList<DataPoint> points, double overlapThreshold, int initMinPoints) { + HashMap<Integer, Integer> labelMap = Clustering.classValues(points); + int dim = points.get(0).dataset().numAttributes() - 1; - ArrayList<DataPoint>[] sorted_points = (ArrayList<DataPoint>[]) new ArrayList[numClasses]; - for (int i = 0; i < numClasses; i++) { - sorted_points[i] = new ArrayList<DataPoint>(); - } - for (DataPoint point : points) { - int clusterid = (int)point.classValue(); - if(clusterid == -1) continue; - sorted_points[labelMap.get(clusterid)].add(point); - num++; - } + int numClasses = labelMap.size(); + int num = 0; - clusters = new AutoExpandVector<Cluster>(); - int microID = 0; - for (int i = 0; i < numClasses; i++) { - ArrayList<SphereCluster> microByClass = new ArrayList<SphereCluster>(); - ArrayList<DataPoint> pointInCluster = new ArrayList<DataPoint>(); - ArrayList<ArrayList<Instance>> pointInMicroClusters = new ArrayList(); + ArrayList<DataPoint>[] sorted_points = (ArrayList<DataPoint>[]) new ArrayList[numClasses]; + for (int i = 0; i < numClasses; i++) { + sorted_points[i] = new ArrayList<DataPoint>(); + } + for (DataPoint point : points) { + int clusterid = (int) point.classValue(); + if (clusterid == -1) + continue; + sorted_points[labelMap.get(clusterid)].add(point); + num++; + } - pointInCluster.addAll(sorted_points[i]); - while(pointInCluster.size()>0){ - ArrayList<Instance> micro_points = new ArrayList<Instance>(); - for (int j = 0; j < initMinPoints && !pointInCluster.isEmpty(); j++) { - micro_points.add((Instance)pointInCluster.get(0)); - pointInCluster.remove(0); - } - if(micro_points.size() > 0){ - SphereCluster s = new SphereCluster(micro_points,dim); - for (int c = 0; c < microByClass.size(); c++) { - if(((SphereCluster)microByClass.get(c)).overlapRadiusDegree(s) > overlapThreshold ){ - micro_points.addAll(pointInMicroClusters.get(c)); - s = new SphereCluster(micro_points,dim); - pointInMicroClusters.remove(c); - microByClass.remove(c); - //System.out.println("Removing redundant cluster based on radius overlap"+c); - } - } - for (int j = 0; j < pointInCluster.size(); j++) { - Instance instance = pointInCluster.get(j); - if(s.getInclusionProbability(instance) > 0.8){ - pointInCluster.remove(j); - micro_points.add(instance); - } - } - s.setWeight(micro_points.size()); - microByClass.add(s); - pointInMicroClusters.add(micro_points); - microID++; - } - } - // - boolean changed = true; - while(changed){ - changed = false; - for(int c = 0; c < microByClass.size(); c++) { - for(int c1 = c+1; c1 < microByClass.size(); c1++) { - double overlap = microByClass.get(c).overlapRadiusDegree(microByClass.get(c1)); -// System.out.println("Overlap C"+(clustering.size()+c)+" ->C"+(clustering.size()+c1)+": "+overlap); - if(overlap > overlapThreshold){ - pointInMicroClusters.get(c).addAll(pointInMicroClusters.get(c1)); - SphereCluster s = new SphereCluster(pointInMicroClusters.get(c),dim); - microByClass.set(c, s); - pointInMicroClusters.remove(c1); - microByClass.remove(c1); - changed = true; - break; - } - } - } + clusters = new AutoExpandVector<Cluster>(); + int microID = 0; + for (int i = 0; i < numClasses; i++) { + ArrayList<SphereCluster> microByClass = new ArrayList<SphereCluster>(); + ArrayList<DataPoint> pointInCluster = new ArrayList<DataPoint>(); + ArrayList<ArrayList<Instance>> pointInMicroClusters = new ArrayList(); + + pointInCluster.addAll(sorted_points[i]); + while (pointInCluster.size() > 0) { + ArrayList<Instance> micro_points = new ArrayList<Instance>(); + for (int j = 0; j < initMinPoints && !pointInCluster.isEmpty(); j++) { + micro_points.add((Instance) pointInCluster.get(0)); + pointInCluster.remove(0); + } + if (micro_points.size() > 0) { + SphereCluster s = new SphereCluster(micro_points, dim); + for (int c = 0; c < microByClass.size(); c++) { + if (((SphereCluster) microByClass.get(c)).overlapRadiusDegree(s) > overlapThreshold) { + micro_points.addAll(pointInMicroClusters.get(c)); + s = new SphereCluster(micro_points, dim); + pointInMicroClusters.remove(c); + microByClass.remove(c); + // System.out.println("Removing redundant cluster based on radius overlap"+c); } - for (int j = 0; j < microByClass.size(); j++) { - microByClass.get(j).setGroundTruth(sorted_points[i].get(0).classValue()); - clusters.add(microByClass.get(j)); + } + for (int j = 0; j < pointInCluster.size(); j++) { + Instance instance = pointInCluster.get(j); + if (s.getInclusionProbability(instance) > 0.8) { + pointInCluster.remove(j); + micro_points.add(instance); } - - } - for (int j = 0; j < clusters.size(); j++) { - clusters.get(j).setId(j); + } + s.setWeight(micro_points.size()); + microByClass.add(s); + pointInMicroClusters.add(micro_points); + microID++; } - - } - - /** - * @param points - * @return an array with the min and max class label value - */ - public static HashMap<Integer, Integer> classValues(List<? extends Instance> points){ - HashMap<Integer,Integer> classes = new HashMap<Integer, Integer>(); - int workcluster = 0; - boolean hasnoise = false; - for (int i = 0; i < points.size(); i++) { - int label = (int) points.get(i).classValue(); - if(label == -1){ - hasnoise = true; - } - else{ - if(!classes.containsKey(label)){ - classes.put(label,workcluster); - workcluster++; - } + } + // + boolean changed = true; + while (changed) { + changed = false; + for (int c = 0; c < microByClass.size(); c++) { + for (int c1 = c + 1; c1 < microByClass.size(); c1++) { + double overlap = microByClass.get(c).overlapRadiusDegree(microByClass.get(c1)); + // System.out.println("Overlap C"+(clustering.size()+c)+" ->C"+(clustering.size()+c1)+": "+overlap); + if (overlap > overlapThreshold) { + pointInMicroClusters.get(c).addAll(pointInMicroClusters.get(c1)); + SphereCluster s = new SphereCluster(pointInMicroClusters.get(c), dim); + microByClass.set(c, s); + pointInMicroClusters.remove(c1); + microByClass.remove(c1); + changed = true; + break; } + } } - if(hasnoise) - classes.put(-1,workcluster); - return classes; - } + } + for (int j = 0; j < microByClass.size(); j++) { + microByClass.get(j).setGroundTruth(sorted_points[i].get(0).classValue()); + clusters.add(microByClass.get(j)); + } - public Clustering(AutoExpandVector<Cluster> clusters) { - this.clusters = clusters; } - - - /** - * add a cluster to the clustering - */ - public void add(Cluster cluster){ - clusters.add(cluster); - } - - /** - * remove a cluster from the clustering - */ - public void remove(int index){ - if(index < clusters.size()){ - clusters.remove(index); - } + for (int j = 0; j < clusters.size(); j++) { + clusters.get(j).setId(j); } - /** - * remove a cluster from the clustering - */ - public Cluster get(int index){ - if(index < clusters.size()){ - return clusters.get(index); + } + + /** + * @param points + * @return an array with the min and max class label value + */ + public static HashMap<Integer, Integer> classValues(List<? extends Instance> points) { + HashMap<Integer, Integer> classes = new HashMap<Integer, Integer>(); + int workcluster = 0; + boolean hasnoise = false; + for (int i = 0; i < points.size(); i++) { + int label = (int) points.get(i).classValue(); + if (label == -1) { + hasnoise = true; + } + else { + if (!classes.containsKey(label)) { + classes.put(label, workcluster); + workcluster++; } - return null; - } - - /** - * @return the <code>Clustering</code> as an AutoExpandVector - */ - public AutoExpandVector<Cluster> getClustering() { - return clusters; - } - - /** - * @return A deepcopy of the <code>Clustering</code> as an AutoExpandVector - */ - public AutoExpandVector<Cluster> getClusteringCopy() { - return (AutoExpandVector<Cluster>)clusters.copy(); - } - - - /** - * @return the number of clusters - */ - public int size() { - return clusters.size(); + } } - - /** - * @return the number of dimensions of this clustering - */ - public int dimension() { - assert (clusters.size() != 0); - return clusters.get(0).getCenter().length; + if (hasnoise) + classes.put(-1, workcluster); + return classes; + } + + public Clustering(AutoExpandVector<Cluster> clusters) { + this.clusters = clusters; + } + + /** + * add a cluster to the clustering + */ + public void add(Cluster cluster) { + clusters.add(cluster); + } + + /** + * remove a cluster from the clustering + */ + public void remove(int index) { + if (index < clusters.size()) { + clusters.remove(index); } - - @Override - public void getDescription(StringBuilder sb, int i) { - sb.append("Clustering Object"); + } + + /** + * remove a cluster from the clustering + */ + public Cluster get(int index) { + if (index < clusters.size()) { + return clusters.get(index); } - - - - public double getMaxInclusionProbability(Instance point) { - double maxInclusion = 0.0; - for (int i = 0; i < clusters.size(); i++) { - maxInclusion = Math.max(clusters.get(i).getInclusionProbability(point), - maxInclusion); - } - return maxInclusion; + return null; + } + + /** + * @return the <code>Clustering</code> as an AutoExpandVector + */ + public AutoExpandVector<Cluster> getClustering() { + return clusters; + } + + /** + * @return A deepcopy of the <code>Clustering</code> as an AutoExpandVector + */ + public AutoExpandVector<Cluster> getClusteringCopy() { + return (AutoExpandVector<Cluster>) clusters.copy(); + } + + /** + * @return the number of clusters + */ + public int size() { + return clusters.size(); + } + + /** + * @return the number of dimensions of this clustering + */ + public int dimension() { + assert (clusters.size() != 0); + return clusters.get(0).getCenter().length; + } + + @Override + public void getDescription(StringBuilder sb, int i) { + sb.append("Clustering Object"); + } + + public double getMaxInclusionProbability(Instance point) { + double maxInclusion = 0.0; + for (int i = 0; i < clusters.size(); i++) { + maxInclusion = Math.max(clusters.get(i).getInclusionProbability(point), + maxInclusion); } - - - - + return maxInclusion; + } }
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Miniball.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Miniball.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Miniball.java index 45c947e..f27a6ad 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Miniball.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/Miniball.java @@ -27,58 +27,58 @@ import java.util.List; public class Miniball { - private int dimension; - private com.dreizak.miniball.highdim.Miniball mb; - private PointStorage pointSet; + private int dimension; + private com.dreizak.miniball.highdim.Miniball mb; + private PointStorage pointSet; - public Miniball(int dimension) { - this.dimension = dimension; - } + public Miniball(int dimension) { + this.dimension = dimension; + } - void clear() { - this.pointSet = new PointStorage(this.dimension); - } + void clear() { + this.pointSet = new PointStorage(this.dimension); + } - void check_in(double[] array) { - this.pointSet.add(array); - } + void check_in(double[] array) { + this.pointSet.add(array); + } - double[] center() { - return this.mb.center(); - } + double[] center() { + return this.mb.center(); + } - double radius() { - return this.mb.radius(); - } + double radius() { + return this.mb.radius(); + } - void build() { - this.mb = new com.dreizak.miniball.highdim.Miniball(this.pointSet); - } + void build() { + this.mb = new com.dreizak.miniball.highdim.Miniball(this.pointSet); + } - public class PointStorage implements PointSet { + public class PointStorage implements PointSet { - protected int dimension; - protected List<double[]> L; + protected int dimension; + protected List<double[]> L; - public PointStorage(int dimension) { - this.dimension = dimension; - this.L = new ArrayList<double[]>(); - } + public PointStorage(int dimension) { + this.dimension = dimension; + this.L = new ArrayList<double[]>(); + } - public void add(double[] array) { - this.L.add(array); - } + public void add(double[] array) { + this.L.add(array); + } - public int size() { - return L.size(); - } + public int size() { + return L.size(); + } - public int dimension() { - return dimension; - } + public int dimension() { + return dimension; + } - public double coord(int point, int coordinate) { - return L.get(point)[coordinate]; - } + public double coord(int point, int coordinate) { + return L.get(point)[coordinate]; } + } } http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/SphereCluster.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/SphereCluster.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/SphereCluster.java index 558b0f1..4a5cc97 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/SphereCluster.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/cluster/SphereCluster.java @@ -1,4 +1,3 @@ - package com.yahoo.labs.samoa.moa.cluster; /* @@ -28,346 +27,341 @@ import java.util.Random; /** * A simple implementation of the <code>Cluster</code> interface representing - * spherical clusters. The inclusion probability is one inside the sphere and zero - * everywhere else. - * + * spherical clusters. The inclusion probability is one inside the sphere and + * zero everywhere else. + * */ public class SphereCluster extends Cluster { - private static final long serialVersionUID = 1L; - - private double[] center; - private double radius; - private double weight; - - - public SphereCluster(double[] center, double radius) { - this( center, radius, 1.0 ); - } - - public SphereCluster() { - } - - public SphereCluster( double[] center, double radius, double weightedSize) { - this(); - this.center = center; - this.radius = radius; - this.weight = weightedSize; - } - - public SphereCluster(int dimensions, double radius, Random random) { - this(); - this.center = new double[dimensions]; - this.radius = radius; - - // Position randomly but keep hypersphere inside the boundaries - double interval = 1.0 - 2 * radius; - for (int i = 0; i < center.length; i++) { - this.center[i] = (random.nextDouble() * interval) + radius; - } - this.weight = 0.0; - } - - - public SphereCluster(List<?extends Instance> instances, int dimension){ - this(); - if(instances == null || instances.size() <= 0) - return; - - weight = instances.size(); - - Miniball mb = new Miniball(dimension); - mb.clear(); - - for (Instance instance : instances) { - mb.check_in(instance.toDoubleArray()); - } - - mb.build(); - center = mb.center(); - radius = mb.radius(); - mb.clear(); - } - - - /** - * Checks whether two <code>SphereCluster</code> overlap based on radius - * NOTE: overlapRadiusDegree only calculates the overlap based - * on the centers and the radi, so not the real overlap - * - * TODO: should we do this by MC to get the real overlap??? - * - * @param other - * @return - */ - - public double overlapRadiusDegree(SphereCluster other) { - - - double[] center0 = getCenter(); - double radius0 = getRadius(); - - double[] center1 = other.getCenter(); - double radius1 = other.getRadius(); - - double radiusBig; - double radiusSmall; - if(radius0 < radius1){ - radiusBig = radius1; - radiusSmall = radius0; - } - else{ - radiusBig = radius0; - radiusSmall = radius1; - } - - double dist = 0; - for (int i = 0; i < center0.length; i++) { - double delta = center0[i] - center1[i]; - dist += delta * delta; - } - dist = Math.sqrt(dist); - - if(dist > radiusSmall + radiusBig) - return 0; - if(dist + radiusSmall <= radiusBig){ - //one lies within the other - return 1; - } - else{ - return (radiusSmall+radiusBig-dist)/(2*radiusSmall); - } - } - - public void combine(SphereCluster cluster) { - double[] center = getCenter(); - double[] newcenter = new double[center.length]; - double[] other_center = cluster.getCenter(); - double other_weight = cluster.getWeight(); - double other_radius = cluster.getRadius(); - - for (int i = 0; i < center.length; i++) { - newcenter[i] = (center[i]*getWeight()+other_center[i]*other_weight)/(getWeight()+other_weight); - } - - center = newcenter; - double r_0 = getRadius() + Math.abs(distance(center, newcenter)); - double r_1 = other_radius + Math.abs(distance(other_center, newcenter)); - radius = Math.max(r_0, r_1); - weight+= other_weight; - } - - public void merge(SphereCluster cluster) { - double[] c0 = getCenter(); - double w0 = getWeight(); - double r0 = getRadius(); - - double[] c1 = cluster.getCenter(); - double w1 = cluster.getWeight(); - double r1 = cluster.getRadius(); - - //vector - double[] v = new double[c0.length]; - //center distance - double d = 0; - - for (int i = 0; i < c0.length; i++) { - v[i] = c0[i] - c1[i]; - d += v[i] * v[i]; - } - d = Math.sqrt(d); - - - - double r; - double[] c = new double[c0.length]; - - //one lays within the others - if(d + r0 <= r1 || d + r1 <= r0){ - if(d + r0 <= r1){ - r = r1; - c = c1; - } - else{ - r = r0; - c = c0; - } - } - else{ - r = (r0 + r1 + d)/2.0; - for (int i = 0; i < c.length; i++) { - c[i] = c1[i] - v[i]/d * (r1-r); - } - } - - setCenter(c); - setRadius(r); - setWeight(w0+w1); - - } - - @Override - public double[] getCenter() { - double[] copy = new double[center.length]; - System.arraycopy(center, 0, copy, 0, center.length); - return copy; - } - - public void setCenter(double[] center) { - this.center = center; - } - - public double getRadius() { - return radius; - } - - public void setRadius( double radius ) { - this.radius = radius; - } - - @Override - public double getWeight() { - return weight; - } - - public void setWeight( double weight ) { - this.weight = weight; - } - - @Override - public double getInclusionProbability(Instance instance) { - if (getCenterDistance(instance) <= getRadius()) { - return 1.0; - } - return 0.0; - } - - public double getCenterDistance(Instance instance) { - double distance = 0.0; - //get the center through getCenter so subclass have a chance - double[] center = getCenter(); - for (int i = 0; i < center.length; i++) { - double d = center[i] - instance.value(i); - distance += d * d; - } - return Math.sqrt(distance); - } - - public double getCenterDistance(SphereCluster other) { - return distance(getCenter(), other.getCenter()); - } - - /* - * the minimal distance between the surface of two clusters. - * is negative if the two clusters overlap - */ - public double getHullDistance(SphereCluster other) { - double distance; - //get the center through getCenter so subclass have a chance - double[] center0 = getCenter(); - double[] center1 = other.getCenter(); - distance = distance(center0, center1); - - distance = distance - getRadius() - other.getRadius(); - return distance; - } - - /* + private static final long serialVersionUID = 1L; + + private double[] center; + private double radius; + private double weight; + + public SphereCluster(double[] center, double radius) { + this(center, radius, 1.0); + } + + public SphereCluster() { + } + + public SphereCluster(double[] center, double radius, double weightedSize) { + this(); + this.center = center; + this.radius = radius; + this.weight = weightedSize; + } + + public SphereCluster(int dimensions, double radius, Random random) { + this(); + this.center = new double[dimensions]; + this.radius = radius; + + // Position randomly but keep hypersphere inside the boundaries + double interval = 1.0 - 2 * radius; + for (int i = 0; i < center.length; i++) { + this.center[i] = (random.nextDouble() * interval) + radius; + } + this.weight = 0.0; + } + + public SphereCluster(List<? extends Instance> instances, int dimension) { + this(); + if (instances == null || instances.size() <= 0) + return; + + weight = instances.size(); + + Miniball mb = new Miniball(dimension); + mb.clear(); + + for (Instance instance : instances) { + mb.check_in(instance.toDoubleArray()); + } + + mb.build(); + center = mb.center(); + radius = mb.radius(); + mb.clear(); + } + + /** + * Checks whether two <code>SphereCluster</code> overlap based on radius NOTE: + * overlapRadiusDegree only calculates the overlap based on the centers and + * the radi, so not the real overlap + * + * TODO: should we do this by MC to get the real overlap??? + * + * @param other + * @return + */ + + public double overlapRadiusDegree(SphereCluster other) { + + double[] center0 = getCenter(); + double radius0 = getRadius(); + + double[] center1 = other.getCenter(); + double radius1 = other.getRadius(); + + double radiusBig; + double radiusSmall; + if (radius0 < radius1) { + radiusBig = radius1; + radiusSmall = radius0; + } + else { + radiusBig = radius0; + radiusSmall = radius1; + } + + double dist = 0; + for (int i = 0; i < center0.length; i++) { + double delta = center0[i] - center1[i]; + dist += delta * delta; + } + dist = Math.sqrt(dist); + + if (dist > radiusSmall + radiusBig) + return 0; + if (dist + radiusSmall <= radiusBig) { + // one lies within the other + return 1; + } + else { + return (radiusSmall + radiusBig - dist) / (2 * radiusSmall); + } + } + + public void combine(SphereCluster cluster) { + double[] center = getCenter(); + double[] newcenter = new double[center.length]; + double[] other_center = cluster.getCenter(); + double other_weight = cluster.getWeight(); + double other_radius = cluster.getRadius(); + + for (int i = 0; i < center.length; i++) { + newcenter[i] = (center[i] * getWeight() + other_center[i] * other_weight) / (getWeight() + other_weight); + } + + center = newcenter; + double r_0 = getRadius() + Math.abs(distance(center, newcenter)); + double r_1 = other_radius + Math.abs(distance(other_center, newcenter)); + radius = Math.max(r_0, r_1); + weight += other_weight; + } + + public void merge(SphereCluster cluster) { + double[] c0 = getCenter(); + double w0 = getWeight(); + double r0 = getRadius(); + + double[] c1 = cluster.getCenter(); + double w1 = cluster.getWeight(); + double r1 = cluster.getRadius(); + + // vector + double[] v = new double[c0.length]; + // center distance + double d = 0; + + for (int i = 0; i < c0.length; i++) { + v[i] = c0[i] - c1[i]; + d += v[i] * v[i]; + } + d = Math.sqrt(d); + + double r; + double[] c = new double[c0.length]; + + // one lays within the others + if (d + r0 <= r1 || d + r1 <= r0) { + if (d + r0 <= r1) { + r = r1; + c = c1; + } + else { + r = r0; + c = c0; + } + } + else { + r = (r0 + r1 + d) / 2.0; + for (int i = 0; i < c.length; i++) { + c[i] = c1[i] - v[i] / d * (r1 - r); + } + } + + setCenter(c); + setRadius(r); + setWeight(w0 + w1); + + } + + @Override + public double[] getCenter() { + double[] copy = new double[center.length]; + System.arraycopy(center, 0, copy, 0, center.length); + return copy; + } + + public void setCenter(double[] center) { + this.center = center; + } + + public double getRadius() { + return radius; + } + + public void setRadius(double radius) { + this.radius = radius; + } + + @Override + public double getWeight() { + return weight; + } + + public void setWeight(double weight) { + this.weight = weight; + } + + @Override + public double getInclusionProbability(Instance instance) { + if (getCenterDistance(instance) <= getRadius()) { + return 1.0; + } + return 0.0; + } + + public double getCenterDistance(Instance instance) { + double distance = 0.0; + // get the center through getCenter so subclass have a chance + double[] center = getCenter(); + for (int i = 0; i < center.length; i++) { + double d = center[i] - instance.value(i); + distance += d * d; + } + return Math.sqrt(distance); + } + + public double getCenterDistance(SphereCluster other) { + return distance(getCenter(), other.getCenter()); + } + + /* + * the minimal distance between the surface of two clusters. is negative if + * the two clusters overlap + */ + public double getHullDistance(SphereCluster other) { + double distance; + // get the center through getCenter so subclass have a chance + double[] center0 = getCenter(); + double[] center1 = other.getCenter(); + distance = distance(center0, center1); + + distance = distance - getRadius() - other.getRadius(); + return distance; + } + + /* */ - /** - * When a clusters looses points the new minimal bounding sphere can be - * partly outside of the originating cluster. If a another cluster is - * right next to the original cluster (without overlapping), the new - * cluster can be overlapping with this second cluster. OverlapSave - * will tell you if the current cluster can degenerate so much that it - * overlaps with cluster 'other' - * - * @param other the potentially overlapping cluster - * @return true if cluster can potentially overlap - */ - public boolean overlapSave(SphereCluster other){ - //use basic geometry to figure out the maximal degenerated cluster - //comes down to Max(radius *(sin alpha + cos alpha)) which is - double minDist = Math.sqrt(2)*(getRadius() + other.getRadius()); - double diff = getCenterDistance(other) - minDist; - - return diff > 0; - } - - private double distance(double[] v1, double[] v2){ - double distance = 0.0; - double[] center = getCenter(); - for (int i = 0; i < center.length; i++) { - double d = v1[i] - v2[i]; - distance += d * d; - } - return Math.sqrt(distance); - } - - public double[] getDistanceVector(Instance instance){ - return distanceVector(getCenter(), instance.toDoubleArray()); - } - - public double[] getDistanceVector(SphereCluster other){ - return distanceVector(getCenter(), other.getCenter()); - } - - private double[] distanceVector(double[] v1, double[] v2){ - double[] v = new double[v1.length]; - for (int i = 0; i < v1.length; i++) { - v[i] = v2[i] - v1[i]; - } - return v; - } - - - /** - * Samples this cluster by returning a point from inside it. - * @param random a random number source - * @return a point that lies inside this cluster - */ - public Instance sample(Random random) { - // Create sample in hypersphere coordinates - //get the center through getCenter so subclass have a chance - double[] center = getCenter(); - - final int dimensions = center.length; - - final double sin[] = new double[dimensions - 1]; - final double cos[] = new double[dimensions - 1]; - final double length = random.nextDouble() * getRadius(); - - double lastValue = 1.0; - for (int i = 0; i < dimensions-1; i++) { - double angle = random.nextDouble() * 2 * Math.PI; - sin[i] = lastValue * Math.sin( angle ); // Store cumulative values - cos[i] = Math.cos( angle ); - lastValue = sin[i]; - } - - // Calculate cartesian coordinates - double res[] = new double[dimensions]; - - // First value uses only cosines - res[0] = center[0] + length*cos[0]; - - // Loop through 'middle' coordinates which use cosines and sines - for (int i = 1; i < dimensions-1; i++) { - res[i] = center[i] + length*sin[i-1]*cos[i]; - } - - // Last value uses only sines - res[dimensions-1] = center[dimensions-1] + length*sin[dimensions-2]; - - return new DenseInstance(1.0, res); - } - - @Override - protected void getClusterSpecificInfo(List<String> infoTitle, List<String> infoValue) { - super.getClusterSpecificInfo(infoTitle, infoValue); - infoTitle.add("Radius"); - infoValue.add(Double.toString(getRadius())); - } - + /** + * When a clusters looses points the new minimal bounding sphere can be partly + * outside of the originating cluster. If a another cluster is right next to + * the original cluster (without overlapping), the new cluster can be + * overlapping with this second cluster. OverlapSave will tell you if the + * current cluster can degenerate so much that it overlaps with cluster + * 'other' + * + * @param other + * the potentially overlapping cluster + * @return true if cluster can potentially overlap + */ + public boolean overlapSave(SphereCluster other) { + // use basic geometry to figure out the maximal degenerated cluster + // comes down to Max(radius *(sin alpha + cos alpha)) which is + double minDist = Math.sqrt(2) * (getRadius() + other.getRadius()); + double diff = getCenterDistance(other) - minDist; + + return diff > 0; + } + + private double distance(double[] v1, double[] v2) { + double distance = 0.0; + double[] center = getCenter(); + for (int i = 0; i < center.length; i++) { + double d = v1[i] - v2[i]; + distance += d * d; + } + return Math.sqrt(distance); + } + + public double[] getDistanceVector(Instance instance) { + return distanceVector(getCenter(), instance.toDoubleArray()); + } + + public double[] getDistanceVector(SphereCluster other) { + return distanceVector(getCenter(), other.getCenter()); + } + + private double[] distanceVector(double[] v1, double[] v2) { + double[] v = new double[v1.length]; + for (int i = 0; i < v1.length; i++) { + v[i] = v2[i] - v1[i]; + } + return v; + } + + /** + * Samples this cluster by returning a point from inside it. + * + * @param random + * a random number source + * @return a point that lies inside this cluster + */ + public Instance sample(Random random) { + // Create sample in hypersphere coordinates + // get the center through getCenter so subclass have a chance + double[] center = getCenter(); + + final int dimensions = center.length; + + final double sin[] = new double[dimensions - 1]; + final double cos[] = new double[dimensions - 1]; + final double length = random.nextDouble() * getRadius(); + + double lastValue = 1.0; + for (int i = 0; i < dimensions - 1; i++) { + double angle = random.nextDouble() * 2 * Math.PI; + sin[i] = lastValue * Math.sin(angle); // Store cumulative values + cos[i] = Math.cos(angle); + lastValue = sin[i]; + } + + // Calculate cartesian coordinates + double res[] = new double[dimensions]; + + // First value uses only cosines + res[0] = center[0] + length * cos[0]; + + // Loop through 'middle' coordinates which use cosines and sines + for (int i = 1; i < dimensions - 1; i++) { + res[i] = center[i] + length * sin[i - 1] * cos[i]; + } + + // Last value uses only sines + res[dimensions - 1] = center[dimensions - 1] + length * sin[dimensions - 2]; + + return new DenseInstance(1.0, res); + } + + @Override + protected void getClusterSpecificInfo(List<String> infoTitle, List<String> infoValue) { + super.getClusterSpecificInfo(infoTitle, infoValue); + infoTitle.add("Radius"); + infoValue.add(Double.toString(getRadius())); + } } http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/AbstractClusterer.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/AbstractClusterer.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/AbstractClusterer.java index a1d80d1..b023648 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/AbstractClusterer.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/AbstractClusterer.java @@ -37,262 +37,262 @@ import com.yahoo.labs.samoa.instances.Instance; import com.yahoo.labs.samoa.instances.Instances; public abstract class AbstractClusterer extends AbstractOptionHandler - implements Clusterer { - - @Override - public String getPurposeString() { - return "MOA Clusterer: " + getClass().getCanonicalName(); - } + implements Clusterer { + + @Override + public String getPurposeString() { + return "MOA Clusterer: " + getClass().getCanonicalName(); + } + + protected InstancesHeader modelContext; + + protected double trainingWeightSeenByModel = 0.0; + + protected int randomSeed = 1; + + protected IntOption randomSeedOption; + + public FlagOption evaluateMicroClusteringOption; + + protected Random clustererRandom; + + protected Clustering clustering; + + public AbstractClusterer() { + if (isRandomizable()) { + this.randomSeedOption = new IntOption("randomSeed", 'r', + "Seed for random behaviour of the Clusterer.", 1); + } + + if (implementsMicroClusterer()) { + this.evaluateMicroClusteringOption = + new FlagOption("evaluateMicroClustering", 'M', + "Evaluate the underlying microclustering instead of the macro clustering"); + } + } + + @Override + public void prepareForUseImpl(TaskMonitor monitor, + ObjectRepository repository) { + if (this.randomSeedOption != null) { + this.randomSeed = this.randomSeedOption.getValue(); + } + if (!trainingHasStarted()) { + resetLearning(); + } + clustering = new Clustering(); + } + + public void setModelContext(InstancesHeader ih) { + if ((ih != null) && (ih.classIndex() < 0)) { + throw new IllegalArgumentException( + "Context for a Clusterer must include a class to learn"); + } + if (trainingHasStarted() + && (this.modelContext != null) + && ((ih == null) || !contextIsCompatible(this.modelContext, ih))) { + throw new IllegalArgumentException( + "New context is not compatible with existing model"); + } + this.modelContext = ih; + } + + public InstancesHeader getModelContext() { + return this.modelContext; + } + + public void setRandomSeed(int s) { + this.randomSeed = s; + if (this.randomSeedOption != null) { + // keep option consistent + this.randomSeedOption.setValue(s); + } + } + + public boolean trainingHasStarted() { + return this.trainingWeightSeenByModel > 0.0; + } + + public double trainingWeightSeenByModel() { + return this.trainingWeightSeenByModel; + } + + public void resetLearning() { + this.trainingWeightSeenByModel = 0.0; + if (isRandomizable()) { + this.clustererRandom = new Random(this.randomSeed); + } + resetLearningImpl(); + } + + public void trainOnInstance(Instance inst) { + if (inst.weight() > 0.0) { + this.trainingWeightSeenByModel += inst.weight(); + trainOnInstanceImpl(inst); + } + } + + public Measurement[] getModelMeasurements() { + List<Measurement> measurementList = new LinkedList<Measurement>(); + measurementList.add(new Measurement("model training instances", + trainingWeightSeenByModel())); + measurementList.add(new Measurement("model serialized size (bytes)", + measureByteSize())); + Measurement[] modelMeasurements = getModelMeasurementsImpl(); + if (modelMeasurements != null) { + for (Measurement measurement : modelMeasurements) { + measurementList.add(measurement); + } + } + // add average of sub-model measurements + Clusterer[] subModels = getSubClusterers(); + if ((subModels != null) && (subModels.length > 0)) { + List<Measurement[]> subMeasurements = new LinkedList<Measurement[]>(); + for (Clusterer subModel : subModels) { + if (subModel != null) { + subMeasurements.add(subModel.getModelMeasurements()); + } + } + Measurement[] avgMeasurements = Measurement + .averageMeasurements(subMeasurements + .toArray(new Measurement[subMeasurements.size()][])); + for (Measurement measurement : avgMeasurements) { + measurementList.add(measurement); + } + } + return measurementList.toArray(new Measurement[measurementList.size()]); + } + + public void getDescription(StringBuilder out, int indent) { + StringUtils.appendIndented(out, indent, "Model type: "); + out.append(this.getClass().getName()); + StringUtils.appendNewline(out); + Measurement.getMeasurementsDescription(getModelMeasurements(), out, + indent); + StringUtils.appendNewlineIndented(out, indent, "Model description:"); + StringUtils.appendNewline(out); + if (trainingHasStarted()) { + getModelDescription(out, indent); + } else { + StringUtils.appendIndented(out, indent, + "Model has not been trained."); + } + } + + public Clusterer[] getSubClusterers() { + return null; + } + + @Override + public Clusterer copy() { + return (Clusterer) super.copy(); + } + + // public boolean correctlyClassifies(Instance inst) { + // return Utils.maxIndex(getVotesForInstance(inst)) == (int) inst + // .classValue(); + // } + + public String getClassNameString() { + return InstancesHeader.getClassNameString(this.modelContext); + } + + public String getClassLabelString(int classLabelIndex) { + return InstancesHeader.getClassLabelString(this.modelContext, + classLabelIndex); + } + + public String getAttributeNameString(int attIndex) { + return InstancesHeader.getAttributeNameString(this.modelContext, + attIndex); + } + + public String getNominalValueString(int attIndex, int valIndex) { + return InstancesHeader.getNominalValueString(this.modelContext, + attIndex, valIndex); + } + + // originalContext notnull + // newContext notnull + public static boolean contextIsCompatible(InstancesHeader originalContext, + InstancesHeader newContext) { + // rule 1: num classes can increase but never decrease + // rule 2: num attributes can increase but never decrease + // rule 3: num nominal attribute values can increase but never decrease + // rule 4: attribute types must stay in the same order (although class + // can + // move; is always skipped over) + // attribute names are free to change, but should always still represent + // the original attributes + if (newContext.numClasses() < originalContext.numClasses()) { + return false; // rule 1 + } + if (newContext.numAttributes() < originalContext.numAttributes()) { + return false; // rule 2 + } + int oPos = 0; + int nPos = 0; + while (oPos < originalContext.numAttributes()) { + if (oPos == originalContext.classIndex()) { + oPos++; + if (!(oPos < originalContext.numAttributes())) { + break; + } + } + if (nPos == newContext.classIndex()) { + nPos++; + } + if (originalContext.attribute(oPos).isNominal()) { + if (!newContext.attribute(nPos).isNominal()) { + return false; // rule 4 + } + if (newContext.attribute(nPos).numValues() < originalContext + .attribute(oPos).numValues()) { + return false; // rule 3 + } + } else { + assert (originalContext.attribute(oPos).isNumeric()); + if (!newContext.attribute(nPos).isNumeric()) { + return false; // rule 4 + } + } + oPos++; + nPos++; + } + return true; // all checks clear + } - protected InstancesHeader modelContext; + // reason for ...Impl methods: + // ease programmer burden by not requiring them to remember calls to super + // in overridden methods & will produce compiler errors if not overridden - protected double trainingWeightSeenByModel = 0.0; + public abstract void resetLearningImpl(); - protected int randomSeed = 1; + public abstract void trainOnInstanceImpl(Instance inst); - protected IntOption randomSeedOption; + protected abstract Measurement[] getModelMeasurementsImpl(); - public FlagOption evaluateMicroClusteringOption; + public abstract void getModelDescription(StringBuilder out, int indent); - protected Random clustererRandom; + protected static int modelAttIndexToInstanceAttIndex(int index, + Instance inst) { + return inst.classIndex() > index ? index : index + 1; + } - protected Clustering clustering; - - public AbstractClusterer() { - if (isRandomizable()) { - this.randomSeedOption = new IntOption("randomSeed", 'r', - "Seed for random behaviour of the Clusterer.", 1); - } + protected static int modelAttIndexToInstanceAttIndex(int index, + Instances insts) { + return insts.classIndex() > index ? index : index + 1; + } - if( implementsMicroClusterer()){ - this.evaluateMicroClusteringOption = - new FlagOption("evaluateMicroClustering", 'M', - "Evaluate the underlying microclustering instead of the macro clustering"); - } - } - - @Override - public void prepareForUseImpl(TaskMonitor monitor, - ObjectRepository repository) { - if (this.randomSeedOption != null) { - this.randomSeed = this.randomSeedOption.getValue(); - } - if (!trainingHasStarted()) { - resetLearning(); - } - clustering = new Clustering(); - } - - public void setModelContext(InstancesHeader ih) { - if ((ih != null) && (ih.classIndex() < 0)) { - throw new IllegalArgumentException( - "Context for a Clusterer must include a class to learn"); - } - if (trainingHasStarted() - && (this.modelContext != null) - && ((ih == null) || !contextIsCompatible(this.modelContext, ih))) { - throw new IllegalArgumentException( - "New context is not compatible with existing model"); - } - this.modelContext = ih; - } - - public InstancesHeader getModelContext() { - return this.modelContext; - } - - public void setRandomSeed(int s) { - this.randomSeed = s; - if (this.randomSeedOption != null) { - // keep option consistent - this.randomSeedOption.setValue(s); - } - } - - public boolean trainingHasStarted() { - return this.trainingWeightSeenByModel > 0.0; - } - - public double trainingWeightSeenByModel() { - return this.trainingWeightSeenByModel; - } - - public void resetLearning() { - this.trainingWeightSeenByModel = 0.0; - if (isRandomizable()) { - this.clustererRandom = new Random(this.randomSeed); - } - resetLearningImpl(); - } - - public void trainOnInstance(Instance inst) { - if (inst.weight() > 0.0) { - this.trainingWeightSeenByModel += inst.weight(); - trainOnInstanceImpl(inst); - } - } - - public Measurement[] getModelMeasurements() { - List<Measurement> measurementList = new LinkedList<Measurement>(); - measurementList.add(new Measurement("model training instances", - trainingWeightSeenByModel())); - measurementList.add(new Measurement("model serialized size (bytes)", - measureByteSize())); - Measurement[] modelMeasurements = getModelMeasurementsImpl(); - if (modelMeasurements != null) { - for (Measurement measurement : modelMeasurements) { - measurementList.add(measurement); - } - } - // add average of sub-model measurements - Clusterer[] subModels = getSubClusterers(); - if ((subModels != null) && (subModels.length > 0)) { - List<Measurement[]> subMeasurements = new LinkedList<Measurement[]>(); - for (Clusterer subModel : subModels) { - if (subModel != null) { - subMeasurements.add(subModel.getModelMeasurements()); - } - } - Measurement[] avgMeasurements = Measurement - .averageMeasurements(subMeasurements - .toArray(new Measurement[subMeasurements.size()][])); - for (Measurement measurement : avgMeasurements) { - measurementList.add(measurement); - } - } - return measurementList.toArray(new Measurement[measurementList.size()]); - } - - public void getDescription(StringBuilder out, int indent) { - StringUtils.appendIndented(out, indent, "Model type: "); - out.append(this.getClass().getName()); - StringUtils.appendNewline(out); - Measurement.getMeasurementsDescription(getModelMeasurements(), out, - indent); - StringUtils.appendNewlineIndented(out, indent, "Model description:"); - StringUtils.appendNewline(out); - if (trainingHasStarted()) { - getModelDescription(out, indent); - } else { - StringUtils.appendIndented(out, indent, - "Model has not been trained."); - } - } - - public Clusterer[] getSubClusterers() { - return null; - } - - @Override - public Clusterer copy() { - return (Clusterer) super.copy(); - } - -// public boolean correctlyClassifies(Instance inst) { -// return Utils.maxIndex(getVotesForInstance(inst)) == (int) inst -// .classValue(); -// } - - public String getClassNameString() { - return InstancesHeader.getClassNameString(this.modelContext); - } - - public String getClassLabelString(int classLabelIndex) { - return InstancesHeader.getClassLabelString(this.modelContext, - classLabelIndex); - } - - public String getAttributeNameString(int attIndex) { - return InstancesHeader.getAttributeNameString(this.modelContext, - attIndex); - } - - public String getNominalValueString(int attIndex, int valIndex) { - return InstancesHeader.getNominalValueString(this.modelContext, - attIndex, valIndex); - } - - // originalContext notnull - // newContext notnull - public static boolean contextIsCompatible(InstancesHeader originalContext, - InstancesHeader newContext) { - // rule 1: num classes can increase but never decrease - // rule 2: num attributes can increase but never decrease - // rule 3: num nominal attribute values can increase but never decrease - // rule 4: attribute types must stay in the same order (although class - // can - // move; is always skipped over) - // attribute names are free to change, but should always still represent - // the original attributes - if (newContext.numClasses() < originalContext.numClasses()) { - return false; // rule 1 - } - if (newContext.numAttributes() < originalContext.numAttributes()) { - return false; // rule 2 - } - int oPos = 0; - int nPos = 0; - while (oPos < originalContext.numAttributes()) { - if (oPos == originalContext.classIndex()) { - oPos++; - if (!(oPos < originalContext.numAttributes())) { - break; - } - } - if (nPos == newContext.classIndex()) { - nPos++; - } - if (originalContext.attribute(oPos).isNominal()) { - if (!newContext.attribute(nPos).isNominal()) { - return false; // rule 4 - } - if (newContext.attribute(nPos).numValues() < originalContext - .attribute(oPos).numValues()) { - return false; // rule 3 - } - } else { - assert (originalContext.attribute(oPos).isNumeric()); - if (!newContext.attribute(nPos).isNumeric()) { - return false; // rule 4 - } - } - oPos++; - nPos++; - } - return true; // all checks clear - } - - // reason for ...Impl methods: - // ease programmer burden by not requiring them to remember calls to super - // in overridden methods & will produce compiler errors if not overridden - - public abstract void resetLearningImpl(); - - public abstract void trainOnInstanceImpl(Instance inst); - - protected abstract Measurement[] getModelMeasurementsImpl(); - - public abstract void getModelDescription(StringBuilder out, int indent); - - protected static int modelAttIndexToInstanceAttIndex(int index, - Instance inst) { - return inst.classIndex() > index ? index : index + 1; - } - - protected static int modelAttIndexToInstanceAttIndex(int index, - Instances insts) { - return insts.classIndex() > index ? index : index + 1; - } - - public boolean implementsMicroClusterer(){ - return false; - } + public boolean implementsMicroClusterer() { + return false; + } - public boolean keepClassLabel(){ - return false; - } - - public Clustering getMicroClusteringResult(){ - return null; - }; + public boolean keepClassLabel() { + return false; + } + + public Clustering getMicroClusteringResult() { + return null; + }; } http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/ClusterGenerator.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/ClusterGenerator.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/ClusterGenerator.java index 6056514..6e01f86 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/ClusterGenerator.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/ClusterGenerator.java @@ -1,4 +1,3 @@ - package com.yahoo.labs.samoa.moa.clusterers; /* @@ -32,337 +31,326 @@ import com.yahoo.labs.samoa.moa.core.Measurement; import com.yahoo.labs.samoa.moa.core.DataPoint; import com.yahoo.labs.samoa.instances.Instance; -public class ClusterGenerator extends AbstractClusterer{ +public class ClusterGenerator extends AbstractClusterer { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; - public IntOption timeWindowOption = new IntOption("timeWindow", - 't', "Rang of the window.", 1000); + public IntOption timeWindowOption = new IntOption("timeWindow", + 't', "Rang of the window.", 1000); - public FloatOption radiusDecreaseOption = new FloatOption("radiusDecrease", 'r', - "The average radii of the centroids in the model.", 0, 0, 1); + public FloatOption radiusDecreaseOption = new FloatOption("radiusDecrease", 'r', + "The average radii of the centroids in the model.", 0, 0, 1); - public FloatOption radiusIncreaseOption = new FloatOption("radiusIncrease", 'R', - "The average radii of the centroids in the model.", 0, 0, 1); + public FloatOption radiusIncreaseOption = new FloatOption("radiusIncrease", 'R', + "The average radii of the centroids in the model.", 0, 0, 1); - public FloatOption positionOffsetOption = new FloatOption("positionOffset", 'p', - "The average radii of the centroids in the model.", 0, 0, 1); + public FloatOption positionOffsetOption = new FloatOption("positionOffset", 'p', + "The average radii of the centroids in the model.", 0, 0, 1); - public FloatOption clusterRemoveOption = new FloatOption("clusterRemove", 'D', - "Deletes complete clusters from the clustering.", 0, 0, 1); + public FloatOption clusterRemoveOption = new FloatOption("clusterRemove", 'D', + "Deletes complete clusters from the clustering.", 0, 0, 1); - public FloatOption joinClustersOption = new FloatOption("joinClusters", 'j', - "Join two clusters if their hull distance is less minRadius times this factor.", 0, 0, 1); + public FloatOption joinClustersOption = new FloatOption("joinClusters", 'j', + "Join two clusters if their hull distance is less minRadius times this factor.", 0, 0, 1); - public FloatOption clusterAddOption = new FloatOption("clusterAdd", 'A', - "Adds additional clusters.", 0, 0, 1); + public FloatOption clusterAddOption = new FloatOption("clusterAdd", 'A', + "Adds additional clusters.", 0, 0, 1); - private static double err_intervall_width = 0.0; - private ArrayList<DataPoint> points; - private int instanceCounter; - private int windowCounter; - private Random random; - private Clustering sourceClustering = null; + private static double err_intervall_width = 0.0; + private ArrayList<DataPoint> points; + private int instanceCounter; + private int windowCounter; + private Random random; + private Clustering sourceClustering = null; - @Override - public void resetLearningImpl() { - points = new ArrayList<DataPoint>(); - instanceCounter = 0; - windowCounter = 0; - random = new Random(227); + @Override + public void resetLearningImpl() { + points = new ArrayList<DataPoint>(); + instanceCounter = 0; + windowCounter = 0; + random = new Random(227); - //joinClustersOption.set(); - //evaluateMicroClusteringOption.set(); - } + // joinClustersOption.set(); + // evaluateMicroClusteringOption.set(); + } - @Override - public void trainOnInstanceImpl(Instance inst) { - if(windowCounter >= timeWindowOption.getValue()){ - points.clear(); - windowCounter = 0; - } - windowCounter++; - instanceCounter++; - points.add( new DataPoint(inst,instanceCounter)); + @Override + public void trainOnInstanceImpl(Instance inst) { + if (windowCounter >= timeWindowOption.getValue()) { + points.clear(); + windowCounter = 0; } - - @Override - public boolean implementsMicroClusterer() { - return true; + windowCounter++; + instanceCounter++; + points.add(new DataPoint(inst, instanceCounter)); + } + + @Override + public boolean implementsMicroClusterer() { + return true; + } + + public void setSourceClustering(Clustering source) { + sourceClustering = source; + } + + @Override + public Clustering getMicroClusteringResult() { + // System.out.println("Numcluster:"+clustering.size()+" / "+num); + // Clustering source_clustering = new Clustering(points, overlapThreshold, + // microInitMinPoints); + if (sourceClustering == null) { + + System.out.println("You need to set a source clustering for the ClusterGenerator to work"); + return null; } - - - public void setSourceClustering(Clustering source){ - sourceClustering = source; - } - - @Override - public Clustering getMicroClusteringResult() { - //System.out.println("Numcluster:"+clustering.size()+" / "+num); - //Clustering source_clustering = new Clustering(points, overlapThreshold, microInitMinPoints); - if(sourceClustering == null){ - - System.out.println("You need to set a source clustering for the ClusterGenerator to work"); - return null; - } - return alterClustering(sourceClustering); + return alterClustering(sourceClustering); + } + + public Clustering getClusteringResult() { + sourceClustering = new Clustering(points); + // if(sourceClustering == null){ + // System.out.println("You need to set a source clustering for the ClusterGenerator to work"); + // return null; + // } + return alterClustering(sourceClustering); + } + + private Clustering alterClustering(Clustering scclustering) { + // percentage of the radius that will be cut off + // 0: no changes to radius + // 1: radius of 0 + double errLevelRadiusDecrease = radiusDecreaseOption.getValue(); + + // 0: no changes to radius + // 1: radius 100% bigger + double errLevelRadiusIncrease = radiusIncreaseOption.getValue(); + + // 0: no changes + // 1: distance between centers is 2 * original radius + double errLevelPosition = positionOffsetOption.getValue(); + + int numRemoveCluster = (int) (clusterRemoveOption.getValue() * scclustering.size()); + + int numAddCluster = (int) (clusterAddOption.getValue() * scclustering.size()); + + for (int c = 0; c < numRemoveCluster; c++) { + int delId = random.nextInt(scclustering.size()); + scclustering.remove(delId); } - - - public Clustering getClusteringResult(){ - sourceClustering = new Clustering(points); -// if(sourceClustering == null){ -// System.out.println("You need to set a source clustering for the ClusterGenerator to work"); -// return null; -// } - return alterClustering(sourceClustering); + int numCluster = scclustering.size(); + double[] err_seeds = new double[numCluster]; + double err_seed_sum = 0.0; + double tmp_seed; + for (int i = 0; i < numCluster; i++) { + tmp_seed = random.nextDouble(); + err_seeds[i] = err_seed_sum + tmp_seed; + err_seed_sum += tmp_seed; } + double sumWeight = 0; + for (int i = 0; i < numCluster; i++) { + sumWeight += scclustering.get(i).getWeight(); + } - private Clustering alterClustering(Clustering scclustering){ - //percentage of the radius that will be cut off - //0: no changes to radius - //1: radius of 0 - double errLevelRadiusDecrease = radiusDecreaseOption.getValue(); - - //0: no changes to radius - //1: radius 100% bigger - double errLevelRadiusIncrease = radiusIncreaseOption.getValue(); - - //0: no changes - //1: distance between centers is 2 * original radius - double errLevelPosition = positionOffsetOption.getValue(); - - - int numRemoveCluster = (int)(clusterRemoveOption.getValue()*scclustering.size()); + Clustering clustering = new Clustering(); + + for (int i = 0; i < numCluster; i++) { + if (!(scclustering.get(i) instanceof SphereCluster)) { + System.out.println("Not a Sphere Cluster"); + continue; + } + SphereCluster sourceCluster = (SphereCluster) scclustering.get(i); + double[] center = Arrays.copyOf(sourceCluster.getCenter(), sourceCluster.getCenter().length); + double weight = sourceCluster.getWeight(); + double radius = sourceCluster.getRadius(); + + // move cluster center + if (errLevelPosition > 0) { + double errOffset = random.nextDouble() * err_intervall_width / 2.0; + double errOffsetDirection = ((random.nextBoolean()) ? 1 : -1); + double level = errLevelPosition + errOffsetDirection * errOffset; + double[] vector = new double[center.length]; + double vectorLength = 0; + for (int d = 0; d < center.length; d++) { + vector[d] = (random.nextBoolean() ? 1 : -1) * random.nextDouble(); + vectorLength += Math.pow(vector[d], 2); + } + vectorLength = Math.sqrt(vectorLength); - int numAddCluster = (int)(clusterAddOption.getValue()*scclustering.size()); + // max is when clusters are next to each other + double length = 2 * radius * level; - for (int c = 0; c < numRemoveCluster; c++) { - int delId = random.nextInt(scclustering.size()); - scclustering.remove(delId); + for (int d = 0; d < center.length; d++) { + // normalize length and then strecht to reach error position + vector[d] = vector[d] / vectorLength * length; } - - int numCluster = scclustering.size(); - double[] err_seeds = new double[numCluster]; - double err_seed_sum = 0.0; - double tmp_seed; - for (int i = 0; i < numCluster; i++) { - tmp_seed = random.nextDouble(); - err_seeds[i] = err_seed_sum + tmp_seed; - err_seed_sum+= tmp_seed; + // System.out.println("Center "+Arrays.toString(center)); + // System.out.println("Vector "+Arrays.toString(vector)); + // check if error position is within bounds + double[] newCenter = new double[center.length]; + for (int d = 0; d < center.length; d++) { + // check bounds, otherwise flip vector + if (center[d] + vector[d] >= 0 && center[d] + vector[d] <= 1) { + newCenter[d] = center[d] + vector[d]; + } + else { + newCenter[d] = center[d] + (-1) * vector[d]; + } } - - double sumWeight = 0; - for (int i = 0; i <numCluster; i++) { - sumWeight+= scclustering.get(i).getWeight(); + center = newCenter; + for (int d = 0; d < center.length; d++) { + if (newCenter[d] >= 0 && newCenter[d] <= 1) { + } + else { + System.out + .println("This shouldnt have happend, Cluster center out of bounds:" + Arrays.toString(newCenter)); + } } + // System.out.println("new Center "+Arrays.toString(newCenter)); - Clustering clustering = new Clustering(); - - for (int i = 0; i <numCluster; i++) { - if(!(scclustering.get(i) instanceof SphereCluster)){ - System.out.println("Not a Sphere Cluster"); - continue; - } - SphereCluster sourceCluster = (SphereCluster)scclustering.get(i); - double[] center = Arrays.copyOf(sourceCluster.getCenter(),sourceCluster.getCenter().length); - double weight = sourceCluster.getWeight(); - double radius = sourceCluster.getRadius(); - - //move cluster center - if(errLevelPosition >0){ - double errOffset = random.nextDouble()*err_intervall_width/2.0; - double errOffsetDirection = ((random.nextBoolean())? 1 : -1); - double level = errLevelPosition + errOffsetDirection * errOffset; - double[] vector = new double[center.length]; - double vectorLength = 0; - for (int d = 0; d < center.length; d++) { - vector[d] = (random.nextBoolean()?1:-1)*random.nextDouble(); - vectorLength += Math.pow(vector[d],2); - } - vectorLength = Math.sqrt(vectorLength); - - - //max is when clusters are next to each other - double length = 2 * radius * level; - - for (int d = 0; d < center.length; d++) { - //normalize length and then strecht to reach error position - vector[d]=vector[d]/vectorLength*length; - } -// System.out.println("Center "+Arrays.toString(center)); -// System.out.println("Vector "+Arrays.toString(vector)); - //check if error position is within bounds - double [] newCenter = new double[center.length]; - for (int d = 0; d < center.length; d++) { - //check bounds, otherwise flip vector - if(center[d] + vector[d] >= 0 && center[d] + vector[d] <= 1){ - newCenter[d] = center[d] + vector[d]; - } - else{ - newCenter[d] = center[d] + (-1)*vector[d]; - } - } - center = newCenter; - for (int d = 0; d < center.length; d++) { - if(newCenter[d] >= 0 && newCenter[d] <= 1){ - } - else{ - System.out.println("This shouldnt have happend, Cluster center out of bounds:"+Arrays.toString(newCenter)); - } - } - //System.out.println("new Center "+Arrays.toString(newCenter)); - - } - - //alter radius - if(errLevelRadiusDecrease > 0 || errLevelRadiusIncrease > 0){ - double errOffset = random.nextDouble()*err_intervall_width/2.0; - int errOffsetDirection = ((random.nextBoolean())? 1 : -1); - - if(errLevelRadiusDecrease > 0 && (errLevelRadiusIncrease == 0 || random.nextBoolean())){ - double level = (errLevelRadiusDecrease + errOffsetDirection * errOffset);//*sourceCluster.getWeight()/sumWeight; - level = (level<0)?0:level; - level = (level>1)?1:level; - radius*=(1-level); - } - else{ - double level = errLevelRadiusIncrease + errOffsetDirection * errOffset; - level = (level<0)?0:level; - level = (level>1)?1:level; - radius+=radius*level; - } - } - - SphereCluster newCluster = new SphereCluster(center, radius, weight); - newCluster.setMeasureValue("Source Cluster", "C"+sourceCluster.getId()); - - clustering.add(newCluster); - } + } - if(joinClustersOption.getValue() > 0){ - clustering = joinClusters(clustering); - } + // alter radius + if (errLevelRadiusDecrease > 0 || errLevelRadiusIncrease > 0) { + double errOffset = random.nextDouble() * err_intervall_width / 2.0; + int errOffsetDirection = ((random.nextBoolean()) ? 1 : -1); - //add new clusters by copying clusters and set a random center - for (int c = 0; c < numAddCluster; c++) { - int copyId = random.nextInt(clustering.size()); - SphereCluster scorg = (SphereCluster)clustering.get(copyId); - int dim = scorg.getCenter().length; - double[] center = new double [dim]; - double radius = scorg.getRadius(); - - boolean outofbounds = true; - int tryCounter = 0; - while(outofbounds && tryCounter < 20){ - tryCounter++; - outofbounds = false; - for (int j = 0; j < center.length; j++) { - center[j] = random.nextDouble(); - if(center[j]- radius < 0 || center[j] + radius > 1){ - outofbounds = true; - break; - } - } - } - if(outofbounds){ - System.out.println("Coludn't place additional cluster"); - } - else{ - SphereCluster scnew = new SphereCluster(center, radius, scorg.getWeight()/2); - scorg.setWeight(scorg.getWeight()-scnew.getWeight()); - clustering.add(scnew); - } + if (errLevelRadiusDecrease > 0 && (errLevelRadiusIncrease == 0 || random.nextBoolean())) { + double level = (errLevelRadiusDecrease + errOffsetDirection * errOffset);// *sourceCluster.getWeight()/sumWeight; + level = (level < 0) ? 0 : level; + level = (level > 1) ? 1 : level; + radius *= (1 - level); + } + else { + double level = errLevelRadiusIncrease + errOffsetDirection * errOffset; + level = (level < 0) ? 0 : level; + level = (level > 1) ? 1 : level; + radius += radius * level; } + } - return clustering; + SphereCluster newCluster = new SphereCluster(center, radius, weight); + newCluster.setMeasureValue("Source Cluster", "C" + sourceCluster.getId()); + clustering.add(newCluster); } + if (joinClustersOption.getValue() > 0) { + clustering = joinClusters(clustering); + } - - private Clustering joinClusters(Clustering clustering){ - - double radiusFactor = joinClustersOption.getValue(); - boolean[] merged = new boolean[clustering.size()]; - - Clustering mclustering = new Clustering(); - - if(radiusFactor >0){ - for (int c1 = 0; c1 < clustering.size(); c1++) { - SphereCluster sc1 = (SphereCluster) clustering.get(c1); - double minDist = Double.MAX_VALUE; - double minOver = 1; - int maxindexCon = -1; - int maxindexOver = -1; - for (int c2 = 0; c2 < clustering.size(); c2++) { - SphereCluster sc2 = (SphereCluster) clustering.get(c2); -// double over = sc1.overlapRadiusDegree(sc2); -// if(over > 0 && over < minOver){ -// minOver = over; -// maxindexOver = c2; -// } - double dist = sc1.getHullDistance(sc2); - double threshold = Math.min(sc1.getRadius(), sc2.getRadius())*radiusFactor; - if(dist > 0 && dist < minDist && dist < threshold){ - minDist = dist; - maxindexCon = c2; - } - } - int maxindex = -1; - if(maxindexOver!=-1) - maxindex = maxindexOver; - else - maxindex = maxindexCon; - - if(maxindex!=-1 && !merged[c1]){ - merged[c1]=true; - merged[maxindex]=true; - SphereCluster scnew = new SphereCluster(sc1.getCenter(),sc1.getRadius(),sc1.getWeight()); - SphereCluster sc2 = (SphereCluster) clustering.get(maxindex); - scnew.merge(sc2); - mclustering.add(scnew); - } - } + // add new clusters by copying clusters and set a random center + for (int c = 0; c < numAddCluster; c++) { + int copyId = random.nextInt(clustering.size()); + SphereCluster scorg = (SphereCluster) clustering.get(copyId); + int dim = scorg.getCenter().length; + double[] center = new double[dim]; + double radius = scorg.getRadius(); + + boolean outofbounds = true; + int tryCounter = 0; + while (outofbounds && tryCounter < 20) { + tryCounter++; + outofbounds = false; + for (int j = 0; j < center.length; j++) { + center[j] = random.nextDouble(); + if (center[j] - radius < 0 || center[j] + radius > 1) { + outofbounds = true; + break; + } } + } + if (outofbounds) { + System.out.println("Coludn't place additional cluster"); + } + else { + SphereCluster scnew = new SphereCluster(center, radius, scorg.getWeight() / 2); + scorg.setWeight(scorg.getWeight() - scnew.getWeight()); + clustering.add(scnew); + } + } - for (int i = 0; i < merged.length; i++) { - if(!merged[i]) - mclustering.add(clustering.get(i)); + return clustering; + + } + + private Clustering joinClusters(Clustering clustering) { + + double radiusFactor = joinClustersOption.getValue(); + boolean[] merged = new boolean[clustering.size()]; + + Clustering mclustering = new Clustering(); + + if (radiusFactor > 0) { + for (int c1 = 0; c1 < clustering.size(); c1++) { + SphereCluster sc1 = (SphereCluster) clustering.get(c1); + double minDist = Double.MAX_VALUE; + double minOver = 1; + int maxindexCon = -1; + int maxindexOver = -1; + for (int c2 = 0; c2 < clustering.size(); c2++) { + SphereCluster sc2 = (SphereCluster) clustering.get(c2); + // double over = sc1.overlapRadiusDegree(sc2); + // if(over > 0 && over < minOver){ + // minOver = over; + // maxindexOver = c2; + // } + double dist = sc1.getHullDistance(sc2); + double threshold = Math.min(sc1.getRadius(), sc2.getRadius()) * radiusFactor; + if (dist > 0 && dist < minDist && dist < threshold) { + minDist = dist; + maxindexCon = c2; + } } + int maxindex = -1; + if (maxindexOver != -1) + maxindex = maxindexOver; + else + maxindex = maxindexCon; + + if (maxindex != -1 && !merged[c1]) { + merged[c1] = true; + merged[maxindex] = true; + SphereCluster scnew = new SphereCluster(sc1.getCenter(), sc1.getRadius(), sc1.getWeight()); + SphereCluster sc2 = (SphereCluster) clustering.get(maxindex); + scnew.merge(sc2); + mclustering.add(scnew); + } + } + } - - return mclustering; - + for (int i = 0; i < merged.length; i++) { + if (!merged[i]) + mclustering.add(clustering.get(i)); } + return mclustering; + } - @Override - protected Measurement[] getModelMeasurementsImpl() { - throw new UnsupportedOperationException("Not supported yet."); - } + @Override + protected Measurement[] getModelMeasurementsImpl() { + throw new UnsupportedOperationException("Not supported yet."); + } - @Override - public void getModelDescription(StringBuilder out, int indent) { - throw new UnsupportedOperationException("Not supported yet."); - } + @Override + public void getModelDescription(StringBuilder out, int indent) { + throw new UnsupportedOperationException("Not supported yet."); + } - @Override - public boolean isRandomizable() { - return false; - } + @Override + public boolean isRandomizable() { + return false; + } - @Override - public boolean keepClassLabel(){ - return true; - } + @Override + public boolean keepClassLabel() { + return true; + } - public double[] getVotesForInstance(Instance inst) { - return null; - } + public double[] getVotesForInstance(Instance inst) { + return null; + } } - - http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/23a35dbe/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/Clusterer.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/Clusterer.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/Clusterer.java index 76d7d50..480da5f 100644 --- a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/Clusterer.java +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/clusterers/Clusterer.java @@ -29,36 +29,36 @@ import com.yahoo.labs.samoa.instances.Instance; public interface Clusterer extends MOAObject, OptionHandler { - public void setModelContext(InstancesHeader ih); + public void setModelContext(InstancesHeader ih); - public InstancesHeader getModelContext(); + public InstancesHeader getModelContext(); - public boolean isRandomizable(); + public boolean isRandomizable(); - public void setRandomSeed(int s); + public void setRandomSeed(int s); - public boolean trainingHasStarted(); + public boolean trainingHasStarted(); - public double trainingWeightSeenByModel(); + public double trainingWeightSeenByModel(); - public void resetLearning(); + public void resetLearning(); - public void trainOnInstance(Instance inst); + public void trainOnInstance(Instance inst); - public double[] getVotesForInstance(Instance inst); + public double[] getVotesForInstance(Instance inst); - public Measurement[] getModelMeasurements(); + public Measurement[] getModelMeasurements(); - public Clusterer[] getSubClusterers(); + public Clusterer[] getSubClusterers(); - public Clusterer copy(); + public Clusterer copy(); - public Clustering getClusteringResult(); + public Clustering getClusteringResult(); - public boolean implementsMicroClusterer(); + public boolean implementsMicroClusterer(); - public Clustering getMicroClusteringResult(); - - public boolean keepClassLabel(); + public Clustering getMicroClusteringResult(); + + public boolean keepClassLabel(); }
