// http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/AbstractClassifier.java
// ----------------------------------------------------------------------
// diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/AbstractClassifier.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/AbstractClassifier.java
// new file mode 100644
// index 0000000..09de49e
// --- /dev/null
// +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/AbstractClassifier.java
// @@ -0,0 +1,378 @@

package com.yahoo.labs.samoa.moa.classifiers;

/*
 * #%L
 * SAMOA
 * %%
 * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Random;

import com.github.javacliparser.IntOption;
import com.yahoo.labs.samoa.instances.Instance;
import com.yahoo.labs.samoa.instances.InstancesHeader;
import com.yahoo.labs.samoa.moa.MOAObject;
import com.yahoo.labs.samoa.moa.core.Example;
import com.yahoo.labs.samoa.moa.core.Measurement;
import com.yahoo.labs.samoa.moa.core.ObjectRepository;
import com.yahoo.labs.samoa.moa.core.StringUtils;
import com.yahoo.labs.samoa.moa.core.Utils;
import com.yahoo.labs.samoa.moa.learners.Learner;
import com.yahoo.labs.samoa.moa.options.AbstractOptionHandler;
import com.yahoo.labs.samoa.moa.tasks.TaskMonitor;

/**
 * Base class for classifiers. It keeps the bookkeeping that is common to all
 * of them (model context, accumulated training weight, optional random seed)
 * and delegates the learner-specific work to the abstract {@code ...Impl}
 * methods declared at the bottom of the class.
 */
public abstract class AbstractClassifier extends AbstractOptionHandler implements Classifier {

  @Override
  public String getPurposeString() {
    return "MOA Classifier: " + getClass().getCanonicalName();
  }

  /** Header of the instances of the data stream */
  protected InstancesHeader modelContext;

  /** Sum of the weights of the instances trained by this model */
  protected double trainingWeightSeenByModel = 0.0;

  /** Random seed used in randomizable learners */
  protected int randomSeed = 1;

  /** Option for randomizable learners to change the random seed; {@code null} when not randomizable */
  protected IntOption randomSeedOption;

  /** Random generator used in randomizable learners; (re)created by {@link #resetLearning()} */
  public Random classifierRandom;

  /**
   * Creates a classifier and sets up the random seed option if the
   * classifier is randomizable.
   */
  public AbstractClassifier() {
    if (isRandomizable()) {
      this.randomSeedOption = new IntOption("randomSeed", 'r',
          "Seed for random behaviour of the classifier.", 1);
    }
  }

  /**
   * Copies the seed from the option (when the option exists) and resets the
   * learner unless training has already started.
   *
   * @param monitor the task monitor (not used here)
   * @param repository the object repository (not used here)
   */
  @Override
  public void prepareForUseImpl(TaskMonitor monitor,
      ObjectRepository repository) {
    if (this.randomSeedOption != null) {
      this.randomSeed = this.randomSeedOption.getValue();
    }
    if (!trainingHasStarted()) {
      resetLearning();
    }
  }

  /**
   * Unwraps the example and delegates to {@link #getVotesForInstance(Instance)}.
   *
   * @param example the example wrapping the instance to classify
   * @return the votes produced for the wrapped instance
   */
  @Override
  public double[] getVotesForInstance(Example<Instance> example) {
    return getVotesForInstance(example.getData());
  }

  @Override
  public abstract double[] getVotesForInstance(Instance inst);

  /**
   * Sets the header describing the instances this model works on.
   *
   * @param ih the new header; may be {@code null} only while no compatible
   *        context has to be kept (see the exceptions below)
   * @throws IllegalArgumentException if the header has no class attribute, or
   *         if training has started and the new header is {@code null} or not
   *         compatible with the current one
   */
  @Override
  public void setModelContext(InstancesHeader ih) {
    if ((ih != null) && (ih.classIndex() < 0)) {
      throw new IllegalArgumentException(
          "Context for a classifier must include a class to learn");
    }
    if (trainingHasStarted()
        && (this.modelContext != null)
        && ((ih == null) || !contextIsCompatible(this.modelContext, ih))) {
      throw new IllegalArgumentException(
          "New context is not compatible with existing model");
    }
    this.modelContext = ih;
  }

  @Override
  public InstancesHeader getModelContext() {
    return this.modelContext;
  }

  /**
   * Sets the random seed and mirrors it into the seed option when present.
   *
   * @param s the new seed
   */
  @Override
  public void setRandomSeed(int s) {
    this.randomSeed = s;
    if (this.randomSeedOption != null) {
      // keep option consistent
      this.randomSeedOption.setValue(s);
    }
  }

  /**
   * @return true once a positive training weight has been accumulated
   */
  @Override
  public boolean trainingHasStarted() {
    return this.trainingWeightSeenByModel > 0.0;
  }

  @Override
  public double trainingWeightSeenByModel() {
    return this.trainingWeightSeenByModel;
  }

  /**
   * Clears the accumulated training weight, re-seeds the random generator for
   * randomizable learners and then calls {@link #resetLearningImpl()}.
   */
  @Override
  public void resetLearning() {
    this.trainingWeightSeenByModel = 0.0;
    if (isRandomizable()) {
      this.classifierRandom = new Random(this.randomSeed);
    }
    resetLearningImpl();
  }

  /**
   * Trains on the instance when its weight is positive; instances with a
   * weight of zero or less are ignored.
   *
   * @param inst the instance to be used for training
   */
  @Override
  public void trainOnInstance(Instance inst) {
    if (inst.weight() > 0.0) {
      this.trainingWeightSeenByModel += inst.weight();
      trainOnInstanceImpl(inst);
    }
  }

  /**
   * Collects the generic measurements (training weight, serialized size), the
   * learner-specific ones from {@link #getModelMeasurementsImpl()} and the
   * averaged measurements of any non-null sub-learners.
   *
   * @return the combined measurements, in that order
   */
  @Override
  public Measurement[] getModelMeasurements() {
    List<Measurement> measurementList = new LinkedList<>();
    measurementList.add(new Measurement("model training instances",
        trainingWeightSeenByModel()));
    measurementList.add(new Measurement("model serialized size (bytes)",
        measureByteSize()));
    Measurement[] modelMeasurements = getModelMeasurementsImpl();
    if (modelMeasurements != null) {
      measurementList.addAll(Arrays.asList(modelMeasurements));
    }
    // add average of sub-model measurements
    Learner[] subModels = getSublearners();
    if ((subModels != null) && (subModels.length > 0)) {
      List<Measurement[]> subMeasurements = new LinkedList<>();
      for (Learner subModel : subModels) {
        if (subModel != null) {
          subMeasurements.add(subModel.getModelMeasurements());
        }
      }
      Measurement[] avgMeasurements = Measurement.averageMeasurements(
          subMeasurements.toArray(new Measurement[subMeasurements.size()][]));
      measurementList.addAll(Arrays.asList(avgMeasurements));
    }
    return measurementList.toArray(new Measurement[measurementList.size()]);
  }

  /**
   * Appends the model type, its measurements and, once training has started,
   * the model description.
   *
   * @param out the stringbuilder to add the description
   * @param indent the number of characters to indent
   */
  @Override
  public void getDescription(StringBuilder out, int indent) {
    StringUtils.appendIndented(out, indent, "Model type: ");
    out.append(this.getClass().getName());
    StringUtils.appendNewline(out);
    Measurement.getMeasurementsDescription(getModelMeasurements(), out,
        indent);
    StringUtils.appendNewlineIndented(out, indent, "Model description:");
    StringUtils.appendNewline(out);
    if (trainingHasStarted()) {
      getModelDescription(out, indent);
    } else {
      StringUtils.appendIndented(out, indent,
          "Model has not been trained.");
    }
  }

  /**
   * @return {@code null} by default; this base class has no sub-learners
   */
  @Override
  public Learner[] getSublearners() {
    return null;
  }

  /**
   * @return {@code null} by default; this base class has no sub-classifiers
   */
  @Override
  public Classifier[] getSubClassifiers() {
    return null;
  }

  @Override
  public Classifier copy() {
    return (Classifier) super.copy();
  }

  /**
   * @return this classifier itself
   */
  @Override
  public MOAObject getModel() {
    return this;
  }

  /**
   * Unwraps the example and delegates to {@link #trainOnInstance(Instance)}.
   *
   * @param example the example wrapping the training instance
   */
  @Override
  public void trainOnInstance(Example<Instance> example) {
    trainOnInstance(example.getData());
  }

  /**
   * @param inst the instance to be classified
   * @return true if the index of the largest vote equals the instance's
   *         class value truncated to an int
   */
  @Override
  public boolean correctlyClassifies(Instance inst) {
    return Utils.maxIndex(getVotesForInstance(inst)) == (int) inst.classValue();
  }

  /**
   * Gets the name of the attribute of the class from the header.
   *
   * @return the string with name of the attribute of the class
   */
  public String getClassNameString() {
    return InstancesHeader.getClassNameString(this.modelContext);
  }

  /**
   * Gets the name of a label of the class from the header.
   *
   * @param classLabelIndex the label index
   * @return the name of the label of the class
   */
  public String getClassLabelString(int classLabelIndex) {
    return InstancesHeader.getClassLabelString(this.modelContext,
        classLabelIndex);
  }

  /**
   * Gets the name of an attribute from the header.
   *
   * @param attIndex the attribute index
   * @return the name of the attribute
   */
  public String getAttributeNameString(int attIndex) {
    return InstancesHeader.getAttributeNameString(this.modelContext, attIndex);
  }

  /**
   * Gets the name of a value of an attribute from the header.
   *
   * @param attIndex the attribute index
   * @param valIndex the value of the attribute
   * @return the name of the value of the attribute
   */
  public String getNominalValueString(int attIndex, int valIndex) {
    return InstancesHeader.getNominalValueString(this.modelContext, attIndex, valIndex);
  }

  /**
   * Returns if two contexts or headers of instances are compatible.<br><br>
   *
   * Two contexts are compatible if they follow the following rules:<br>
   * Rule 1: num classes can increase but never decrease<br>
   * Rule 2: num attributes can increase but never decrease<br>
   * Rule 3: num nominal attribute values can increase but never decrease<br>
   * Rule 4: attribute types must stay in the same order (although class
   * can move; is always skipped over)<br><br>
   *
   * Attribute names are free to change, but should always still represent
   * the original attributes.<br><br>
   *
   * If the new header runs out of non-class attributes before the original
   * one does, the headers are reported as incompatible (rule 2) rather than
   * reading past the end of the new header.
   *
   * @param originalContext the first context to compare
   * @param newContext the second context to compare
   * @return true if the two contexts are compatible.
   */
  public static boolean contextIsCompatible(InstancesHeader originalContext,
      InstancesHeader newContext) {

    if (newContext.numClasses() < originalContext.numClasses()) {
      return false; // rule 1
    }
    if (newContext.numAttributes() < originalContext.numAttributes()) {
      return false; // rule 2
    }
    int oPos = 0;
    int nPos = 0;
    while (oPos < originalContext.numAttributes()) {
      // skip the class attribute in the original header
      if (oPos == originalContext.classIndex()) {
        oPos++;
        if (!(oPos < originalContext.numAttributes())) {
          break;
        }
      }
      // skip the class attribute in the new header
      if (nPos == newContext.classIndex()) {
        nPos++;
      }
      if (nPos >= newContext.numAttributes()) {
        // The new header has fewer non-class attributes than the original
        // (possible when only one of the headers has a class attribute).
        return false; // rule 2
      }
      if (originalContext.attribute(oPos).isNominal()) {
        if (!newContext.attribute(nPos).isNominal()) {
          return false; // rule 4
        }
        if (newContext.attribute(nPos).numValues() < originalContext.attribute(oPos).numValues()) {
          return false; // rule 3
        }
      } else {
        assert (originalContext.attribute(oPos).isNumeric());
        if (!newContext.attribute(nPos).isNumeric()) {
          return false; // rule 4
        }
      }
      oPos++;
      nPos++;
    }
    return true; // all checks clear
  }

  /**
   * Resets this classifier. It must be similar to
   * starting a new classifier from scratch. <br><br>
   *
   * The reason for ...Impl methods: ease programmer burden by not requiring
   * them to remember calls to super in overridden methods.
   * Note that this will produce compiler errors if not overridden.
   */
  public abstract void resetLearningImpl();

  /**
   * Trains this classifier incrementally using the given instance.<br><br>
   *
   * The reason for ...Impl methods: ease programmer burden by not requiring
   * them to remember calls to super in overridden methods.
   * Note that this will produce compiler errors if not overridden.
   *
   * @param inst the instance to be used for training
   */
  public abstract void trainOnInstanceImpl(Instance inst);

  /**
   * Gets the current measurements of this classifier.<br><br>
   *
   * The reason for ...Impl methods: ease programmer burden by not requiring
   * them to remember calls to super in overridden methods.
   * Note that this will produce compiler errors if not overridden.
   *
   * @return an array of measurements to be used in evaluation tasks;
   *         {@code null} is tolerated by {@link #getModelMeasurements()}
   */
  protected abstract Measurement[] getModelMeasurementsImpl();

  /**
   * Returns a string representation of the model.
   *
   * @param out the stringbuilder to add the description
   * @param indent the number of characters to indent
   */
  public abstract void getModelDescription(StringBuilder out, int indent);

  /**
   * Gets the index of the attribute in the instance,
   * given the index of the attribute in the learner.
   * Currently this is the identity mapping.
   *
   * @param index the index of the attribute in the learner
   * @return the index in the instance
   */
  protected static int modelAttIndexToInstanceAttIndex(int index) {
    // Previous class-aware mapping, kept for reference:
    // inst.classIndex() > index ? index : index + 1
    return index;
  }
}
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Classifier.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Classifier.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Classifier.java new file mode 100644 index 0000000..efbc918 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Classifier.java @@ -0,0 +1,77 @@ +package com.yahoo.labs.samoa.moa.classifiers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.instances.Instance; +import com.yahoo.labs.samoa.moa.core.Example; +import com.yahoo.labs.samoa.moa.learners.Learner; + +/** + * Classifier interface for incremental classification models. + * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public interface Classifier extends Learner<Example<Instance>> { + + /** + * Gets the classifiers of this ensemble. Returns null if this learner is a + * single learner. + * + * @return an array of the learners of the ensemble + */ + public Classifier[] getSubClassifiers(); + + /** + * Produces a copy of this learner. 
+ * + * @return the copy of this learner + */ + public Classifier copy(); + + /** + * Gets whether this classifier correctly classifies an instance. Uses + * getVotesForInstance to obtain the prediction and the instance to obtain + * its true class. + * + * + * @param inst the instance to be classified + * @return true if the instance is correctly classified + */ + public boolean correctlyClassifies(Instance inst); + + /** + * Trains this learner incrementally using the given example. + * + * @param inst the instance to be used for training + */ + public void trainOnInstance(Instance inst); + + /** + * Predicts the class memberships for a given instance. If an instance is + * unclassified, the returned array elements must be all zero. + * + * @param inst the instance to be classified + * @return an array containing the estimated membership probabilities of the + * test instance in each class + */ + public double[] getVotesForInstance(Instance inst); +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Regressor.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Regressor.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Regressor.java new file mode 100644 index 0000000..758f5c4 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/Regressor.java @@ -0,0 +1,31 @@ +package com.yahoo.labs.samoa.moa.classifiers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +/** + * Marker interface for incremental regression models; it declares no methods of its own. + * It is used only in the GUI Regression Tab. + * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public interface Regressor { + +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/AttributeSplitSuggestion.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/AttributeSplitSuggestion.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/AttributeSplitSuggestion.java new file mode 100644 index 0000000..1ecc9ed --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/AttributeSplitSuggestion.java @@ -0,0 +1,68 @@ +package com.yahoo.labs.samoa.moa.classifiers.core; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.moa.AbstractMOAObject; +import com.yahoo.labs.samoa.moa.classifiers.core.conditionaltests.InstanceConditionalTest; + +/** + * Class for computing attribute split suggestions given a split test. + * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public class AttributeSplitSuggestion extends AbstractMOAObject implements Comparable<AttributeSplitSuggestion> { + + private static final long serialVersionUID = 1L; + + public InstanceConditionalTest splitTest; + + public double[][] resultingClassDistributions; + + public double merit; + + public AttributeSplitSuggestion() {} + + public AttributeSplitSuggestion(InstanceConditionalTest splitTest, + double[][] resultingClassDistributions, double merit) { + this.splitTest = splitTest; + this.resultingClassDistributions = resultingClassDistributions.clone(); + this.merit = merit; + } + + public int numSplits() { + return this.resultingClassDistributions.length; + } + + public double[] resultingClassDistributionFromSplit(int splitIndex) { + return this.resultingClassDistributions[splitIndex].clone(); + } + + @Override + public int compareTo(AttributeSplitSuggestion comp) { + return Double.compare(this.merit, comp.merit); + } + + @Override + public void getDescription(StringBuilder sb, int indent) { + // do nothing + } +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/AttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/AttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/AttributeClassObserver.java new file mode 100644 index 
0000000..d6adc2e --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/AttributeClassObserver.java @@ -0,0 +1,73 @@ +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; +import com.yahoo.labs.samoa.moa.options.OptionHandler; + +/** + * Interface for observing the class data distribution for an attribute. + * This observer monitors the class distribution of a given attribute. + * Used in naive Bayes and decision trees to monitor data statistics on leaves. 
+ * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public interface AttributeClassObserver extends OptionHandler { + + /** + * Updates statistics of this observer given an attribute value, a class + * and the weight of the instance observed + * + * @param attVal the value of the attribute + * @param classVal the class + * @param weight the weight of the instance + */ + public void observeAttributeClass(double attVal, int classVal, double weight); + + /** + * Gets the probability for an attribute value given a class + * + * @param attVal the attribute value + * @param classVal the class + * @return probability for an attribute value given a class + */ + public double probabilityOfAttributeValueGivenClass(double attVal, + int classVal); + + /** + * Gets the best split suggestion given a criterion and a class distribution + * + * @param criterion the split criterion to use + * @param preSplitDist the class distribution before the split + * @param attIndex the attribute index + * @param binaryOnly true to use binary splits + * @return suggestion of best attribute split + */ + public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion( + SplitCriterion criterion, double[] preSplitDist, int attIndex, + boolean binaryOnly); + + + public void observeAttributeTarget(double attVal, double target); + +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserver.java new file mode 100644 index 0000000..e9bb2f9 --- /dev/null +++ 
b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserver.java @@ -0,0 +1,183 @@ +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import java.io.Serializable; +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.conditionaltests.NumericAttributeBinaryTest; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; +import com.yahoo.labs.samoa.moa.core.DoubleVector; +import com.yahoo.labs.samoa.moa.core.ObjectRepository; +import com.yahoo.labs.samoa.moa.options.AbstractOptionHandler; +import com.yahoo.labs.samoa.moa.tasks.TaskMonitor; + +/** + * Class for observing the class data distribution for a numeric attribute using a binary tree. + * This observer monitors the class distribution of a given attribute. + * Used in naive Bayes and decision trees to monitor data statistics on leaves. 
+ * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public class BinaryTreeNumericAttributeClassObserver extends AbstractOptionHandler + implements NumericAttributeClassObserver { + + private static final long serialVersionUID = 1L; + + public class Node implements Serializable { + + private static final long serialVersionUID = 1L; + + public double cut_point; + + public DoubleVector classCountsLeft = new DoubleVector(); + + public DoubleVector classCountsRight = new DoubleVector(); + + public Node left; + + public Node right; + + public Node(double val, int label, double weight) { + this.cut_point = val; + this.classCountsLeft.addToValue(label, weight); + } + + public void insertValue(double val, int label, double weight) { + if (val == this.cut_point) { + this.classCountsLeft.addToValue(label, weight); + } else if (val <= this.cut_point) { + this.classCountsLeft.addToValue(label, weight); + if (this.left == null) { + this.left = new Node(val, label, weight); + } else { + this.left.insertValue(val, label, weight); + } + } else { // val > cut_point + this.classCountsRight.addToValue(label, weight); + if (this.right == null) { + this.right = new Node(val, label, weight); + } else { + this.right.insertValue(val, label, weight); + } + } + } + } + + public Node root = null; + + @Override + public void observeAttributeClass(double attVal, int classVal, double weight) { + if (Double.isNaN(attVal)) { //Instance.isMissingValue(attVal) + } else { + if (this.root == null) { + this.root = new Node(attVal, classVal, weight); + } else { + this.root.insertValue(attVal, classVal, weight); + } + } + } + + @Override + public double probabilityOfAttributeValueGivenClass(double attVal, + int classVal) { + // TODO: NaiveBayes broken until implemented + return 0.0; + } + + @Override + public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion( + SplitCriterion criterion, double[] preSplitDist, int attIndex, + boolean binaryOnly) { + return 
searchForBestSplitOption(this.root, null, null, null, null, false, + criterion, preSplitDist, attIndex); + } + + protected AttributeSplitSuggestion searchForBestSplitOption( + Node currentNode, AttributeSplitSuggestion currentBestOption, + double[] actualParentLeft, + double[] parentLeft, double[] parentRight, boolean leftChild, + SplitCriterion criterion, double[] preSplitDist, int attIndex) { + if (currentNode == null) { + return currentBestOption; + } + DoubleVector leftDist = new DoubleVector(); + DoubleVector rightDist = new DoubleVector(); + if (parentLeft == null) { + leftDist.addValues(currentNode.classCountsLeft); + rightDist.addValues(currentNode.classCountsRight); + } else { + leftDist.addValues(parentLeft); + rightDist.addValues(parentRight); + if (leftChild) { + //get the exact statistics of the parent value + DoubleVector exactParentDist = new DoubleVector(); + exactParentDist.addValues(actualParentLeft); + exactParentDist.subtractValues(currentNode.classCountsLeft); + exactParentDist.subtractValues(currentNode.classCountsRight); + + // move the subtrees + leftDist.subtractValues(currentNode.classCountsRight); + rightDist.addValues(currentNode.classCountsRight); + + // move the exact value from the parent + rightDist.addValues(exactParentDist); + leftDist.subtractValues(exactParentDist); + + } else { + leftDist.addValues(currentNode.classCountsLeft); + rightDist.subtractValues(currentNode.classCountsLeft); + } + } + double[][] postSplitDists = new double[][]{leftDist.getArrayRef(), + rightDist.getArrayRef()}; + double merit = criterion.getMeritOfSplit(preSplitDist, postSplitDists); + if ((currentBestOption == null) || (merit > currentBestOption.merit)) { + currentBestOption = new AttributeSplitSuggestion( + new NumericAttributeBinaryTest(attIndex, + currentNode.cut_point, true), postSplitDists, merit); + + } + currentBestOption = searchForBestSplitOption(currentNode.left, + currentBestOption, currentNode.classCountsLeft.getArrayRef(), 
postSplitDists[0], postSplitDists[1], true, + criterion, preSplitDist, attIndex); + currentBestOption = searchForBestSplitOption(currentNode.right, + currentBestOption, currentNode.classCountsLeft.getArrayRef(), postSplitDists[0], postSplitDists[1], false, + criterion, preSplitDist, attIndex); + return currentBestOption; + } + + @Override + public void getDescription(StringBuilder sb, int indent) { + // TODO Auto-generated method stub + } + + @Override + protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) { + // TODO Auto-generated method stub + } + + @Override + public void observeAttributeTarget(double attVal, double target) { + throw new UnsupportedOperationException("Not supported yet."); + } + +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserverRegression.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserverRegression.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserverRegression.java new file mode 100644 index 0000000..a68cad9 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/BinaryTreeNumericAttributeClassObserverRegression.java @@ -0,0 +1,148 @@ + +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2013 University of Porto, Portugal + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + + +import java.io.Serializable; +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; +import com.yahoo.labs.samoa.moa.core.ObjectRepository; +import com.yahoo.labs.samoa.moa.options.AbstractOptionHandler; +import com.yahoo.labs.samoa.moa.tasks.TaskMonitor; + +/** + * Class for observing the class data distribution for a numeric attribute using a binary tree. + * This observer monitors the class distribution of a given attribute. + * + * <p>Learning Adaptive Model Rules from High-Speed Data Streams, ECML 2013, E. Almeida, C. Ferreira, P. Kosina and J. Gama; </p> + * + * @author E. Almeida, J. Gama + * @version $Revision: 2$ + */ +public class BinaryTreeNumericAttributeClassObserverRegression extends AbstractOptionHandler + implements NumericAttributeClassObserver { + + public static final long serialVersionUID = 1L; + + public class Node implements Serializable { + + private static final long serialVersionUID = 1L; + + public double cut_point; + + public double[] lessThan; //This array maintains statistics for the instance reaching the node with attribute values less than or iqual to the cutpoint. + + public double[] greaterThan; //This array maintains statistics for the instance reaching the node with attribute values greater than to the cutpoint. 
+ + public Node left; + + public Node right; + + public Node(double val, double target) { + this.cut_point = val; + this.lessThan = new double[3]; + this.greaterThan = new double[3]; + this.lessThan[0] = target; //The sum of their target attribute values. + this.lessThan[1] = target * target; //The sum of the squared target attribute values. + this.lessThan[2] = 1.0; //A counter of the number of instances that have reached the node. + this.greaterThan[0] = 0.0; + this.greaterThan[1] = 0.0; + this.greaterThan[2] = 0.0; + } + + public void insertValue(double val, double target) { + if (val == this.cut_point) { + this.lessThan[0] = this.lessThan[0] + target; + this.lessThan[1] = this.lessThan[1] + (target * target); + this.lessThan[2] = this.lessThan[2] + 1; + } else if (val <= this.cut_point) { + this.lessThan[0] = this.lessThan[0] + target; + this.lessThan[1] = this.lessThan[1] + (target * target); + this.lessThan[2] = this.lessThan[2] + 1; + if (this.left == null) { + this.left = new Node(val, target); + } else { + this.left.insertValue(val, target); + } + } else { + this.greaterThan[0] = this.greaterThan[0] + target; + this.greaterThan[1] = this.greaterThan[1] + (target*target); + this.greaterThan[2] = this.greaterThan[2] + 1; + if (this.right == null) { + + this.right = new Node(val, target); + } else { + this.right.insertValue(val, target); + } + } + } + } + + public Node root1 = null; + + public void observeAttributeTarget(double attVal, double target){ + if (!Double.isNaN(attVal)) { + if (this.root1 == null) { + this.root1 = new Node(attVal, target); + } else { + this.root1.insertValue(attVal, target); + } + } + } + + @Override + public void observeAttributeClass(double attVal, int classVal, double weight) { + + } + + @Override + public double probabilityOfAttributeValueGivenClass(double attVal, + int classVal) { + return 0.0; + } + + @Override + public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion( + SplitCriterion criterion, double[] preSplitDist, 
int attIndex, + boolean binaryOnly) { + return searchForBestSplitOption(this.root1, null, null, null, null, false, + criterion, preSplitDist, attIndex); + } + + protected AttributeSplitSuggestion searchForBestSplitOption( + Node currentNode, AttributeSplitSuggestion currentBestOption, + double[] actualParentLeft, + double[] parentLeft, double[] parentRight, boolean leftChild, + SplitCriterion criterion, double[] preSplitDist, int attIndex) { + + return currentBestOption; + } + + @Override + public void getDescription(StringBuilder sb, int indent) { + } + + @Override + protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) { + } +} + http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/DiscreteAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/DiscreteAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/DiscreteAttributeClassObserver.java new file mode 100644 index 0000000..e756fcd --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/DiscreteAttributeClassObserver.java @@ -0,0 +1,34 @@ +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +/** + * Interface for observing the class data distribution for a discrete (nominal) attribute. + * This observer monitors the class distribution of a given attribute. + * Used in naive Bayes and decision trees to monitor data statistics on leaves. + * + * <p>This is a marker interface: its body declares no members, so its whole contract + * is the one inherited from {@code AttributeClassObserver}. + * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public interface DiscreteAttributeClassObserver extends AttributeClassObserver { + + +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/FIMTDDNumericAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/FIMTDDNumericAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/FIMTDDNumericAttributeClassObserver.java new file mode 100644 index 0000000..2434652 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/FIMTDDNumericAttributeClassObserver.java @@ -0,0 +1,240 @@ + +/* Project Knowledge Discovery from Data Streams, FCT LIAAD-INESC TEC, + * + * Contact: [email protected] + */ + +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2013 University of Porto, Portugal + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this
file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import java.io.Serializable; + +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.conditionaltests.NumericAttributeBinaryTest; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; +import com.yahoo.labs.samoa.moa.core.DoubleVector; +import com.yahoo.labs.samoa.moa.core.ObjectRepository; +import com.yahoo.labs.samoa.moa.tasks.TaskMonitor; +
/**
 * Numeric attribute observer that keeps its observations in an extended binary search tree
 * (E-BST) and evaluates candidate splits with the FindBestSplit procedure of E. Ikonomovska et al.
 *
 * <p>Every statistics vector handled here has the same layout: index 0 holds the number of
 * observations, index 1 the sum of their labels and index 2 the sum of their squared labels.
 *
 * <p>The split search keeps its running totals in instance fields and updates them without any
 * synchronization, so one instance must not be searched from several threads at the same time.
 */
public class FIMTDDNumericAttributeClassObserver extends BinaryTreeNumericAttributeClassObserver implements NumericAttributeClassObserver {

  private static final long serialVersionUID = 1L;

  /**
   * One node of the E-BST. {@code leftStatistics} summarises every observation routed through
   * this node whose value is {@code <= cut_point}; {@code rightStatistics} summarises those whose
   * value is {@code > cut_point}.
   */
  protected class Node implements Serializable {

    private static final long serialVersionUID = 1L;

    // The split point to use
    public double cut_point;

    // E-BST statistics
    public DoubleVector leftStatistics = new DoubleVector();
    public DoubleVector rightStatistics = new DoubleVector();

    // Child nodes
    public Node left;
    public Node right;

    /**
     * Creates a node whose cut point is the observed value and records that first observation
     * on the left ({@code <=}) side.
     *
     * @param val attribute value; becomes the cut point
     * @param label target value of the observation
     * @param weight currently unused: the observation is always counted exactly once
     */
    public Node(double val, double label, double weight) {
      this.cut_point = val;
      addObservation(this.leftStatistics, label);
    }

    /**
     * Inserts an observation into the subtree rooted at this node, updating the count, the sum
     * of labels and the sum of squared labels on the side the value falls on.
     *
     * @param val attribute value of the observation
     * @param label target value of the observation
     * @param weight currently unused; only handed on to child nodes
     */
    public void insertValue(double val, double label, double weight) {
      if (val <= this.cut_point) {
        addObservation(this.leftStatistics, label);
        // A value equal to the cut point is fully described by this node; only strictly
        // smaller values travel further down the left branch.
        if (val != this.cut_point) {
          if (this.left == null) {
            this.left = new Node(val, label, weight);
          } else {
            this.left.insertValue(val, label, weight);
          }
        }
      } else { // val > cut_point
        addObservation(this.rightStatistics, label);
        if (this.right == null) {
          this.right = new Node(val, label, weight);
        } else {
          this.right.insertValue(val, label, weight);
        }
      }
    }

    /** Adds one observation with the given label to a (count, sum, sum of squares) vector. */
    private void addObservation(DoubleVector statistics, double label) {
      statistics.addToValue(0, 1);
      statistics.addToValue(1, label);
      statistics.addToValue(2, label * label);
    }
  }

  // Root node of the E-BST structure for this attribute
  public Node root = null;

  // Running totals used by the FindBestSplit traversal; reset at the start of every search
  double sumTotalLeft;
  double sumTotalRight;
  double sumSqTotalLeft;
  double sumSqTotalRight;
  double countRightTotal;
  double countLeftTotal;

  /**
   * Records one observation in the E-BST. Observations whose attribute value is NaN are ignored.
   *
   * @param attVal attribute value
   * @param classVal target value of the observation
   * @param weight handed to the tree, which currently does not use it
   */
  public void observeAttributeClass(double attVal, double classVal, double weight) {
    if (Double.isNaN(attVal)) {
      return;
    }
    if (this.root == null) {
      this.root = new Node(attVal, classVal, weight);
    } else {
      this.root.insertValue(attVal, classVal, weight);
    }
  }

  @Override
  public double probabilityOfAttributeValueGivenClass(double attVal,
      int classVal) {
    // TODO: NaiveBayes broken until implemented
    return 0.0;
  }

  /**
   * Searches the E-BST for the cut point with the highest merit.
   *
   * @param criterion criterion used to score every candidate split
   * @param preSplitDist statistics before the split, read as (count, sum, sum of squares)
   * @param attIndex index of the attribute, stored in the suggested test
   * @param binaryOnly not used by this observer
   * @return the best suggestion found, or {@code null} when nothing could be evaluated
   */
  @Override
  public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion(SplitCriterion criterion, double[] preSplitDist, int attIndex, boolean binaryOnly) {
    // Before the traversal starts, every observation counts as lying to the right of the split
    sumTotalLeft = 0;
    sumTotalRight = preSplitDist[1];
    sumSqTotalLeft = 0;
    sumSqTotalRight = preSplitDist[2];
    countLeftTotal = 0;
    countRightTotal = preSplitDist[0];
    return searchForBestSplitOption(this.root, null, criterion, attIndex);
  }

  /**
   * Implementation of the FindBestSplit algorithm from E.Ikonomovska et al.: an in-order
   * traversal that moves each node's left statistics from the "right" totals to the "left"
   * totals, scores the resulting split, and undoes the move on the way back up.
   *
   * @return {@code currentBestOption}, replaced by a better suggestion when one is found
   */
  protected AttributeSplitSuggestion searchForBestSplitOption(Node currentNode, AttributeSplitSuggestion currentBestOption, SplitCriterion criterion, int attIndex) {
    // Hand back the best option so far when there is no node here or nothing is left on the right
    if (currentNode == null || countRightTotal == 0.0) {
      return currentBestOption;
    }

    if (currentNode.left != null) {
      currentBestOption = searchForBestSplitOption(currentNode.left, currentBestOption, criterion, attIndex);
    }

    final double count = currentNode.leftStatistics.getValue(0);
    final double sum = currentNode.leftStatistics.getValue(1);
    final double sumSq = currentNode.leftStatistics.getValue(2);

    sumTotalLeft += sum;
    sumTotalRight -= sum;
    sumSqTotalLeft += sumSq;
    sumSqTotalRight -= sumSq;
    countLeftTotal += count;
    countRightTotal -= count;

    double[][] postSplitDists = new double[][]{{countLeftTotal, sumTotalLeft, sumSqTotalLeft}, {countRightTotal, sumTotalRight, sumSqTotalRight}};
    double[] preSplitDist = new double[]{(countLeftTotal + countRightTotal), (sumTotalLeft + sumTotalRight), (sumSqTotalLeft + sumSqTotalRight)};
    double merit = criterion.getMeritOfSplit(preSplitDist, postSplitDists);

    if ((currentBestOption == null) || (merit > currentBestOption.merit)) {
      currentBestOption = new AttributeSplitSuggestion(
          new NumericAttributeBinaryTest(attIndex, currentNode.cut_point, true), postSplitDists, merit);
    }

    if (currentNode.right != null) {
      currentBestOption = searchForBestSplitOption(currentNode.right, currentBestOption, criterion, attIndex);
    }

    // Undo this node's contribution so the caller sees the totals it had before the call
    sumTotalLeft -= sum;
    sumTotalRight += sum;
    sumSqTotalLeft -= sumSq;
    sumSqTotalRight += sumSq;
    countLeftTotal -= count;
    countRightTotal += count;

    return currentBestOption;
  }

  /**
   * Removes every node of the E-BST for which the node itself and all of its descendants
   * represent 'bad' split points. A node is bad when
   * {@code merit / lastCheckSDR < lastCheckRatio - 2 * lastCheckE}, where merit is the criterion
   * value computed from the node's own left and right statistics.
   *
   * @param criterion criterion used to score each node's split
   * @param lastCheckRatio ratio term of the threshold above
   * @param lastCheckSDR value the node's merit is divided by
   * @param lastCheckE error term of the threshold above
   */
  public void removeBadSplits(SplitCriterion criterion, double lastCheckRatio, double lastCheckSDR, double lastCheckE) {
    // The recursion can only unlink children, so a bad root has to be dropped here
    if (this.root != null && removeBadSplitNodes(criterion, this.root, lastCheckRatio, lastCheckSDR, lastCheckE)) {
      this.root = null;
    }
  }

  /**
   * Recursive method that first checks all of a node's children before
   * deciding if it is 'bad' and may be removed. Bad subtrees are unlinked from
   * {@code currentNode}; the caller is responsible for unlinking {@code currentNode} itself.
   *
   * @return {@code true} when {@code currentNode} is absent, or when it and both of its
   *     subtrees are bad
   */
  private boolean removeBadSplitNodes(SplitCriterion criterion, Node currentNode, double lastCheckRatio, double lastCheckSDR, double lastCheckE) {
    // A missing subtree never prevents its parent from being removed
    if (currentNode == null) {
      return true;
    }

    // Both subtrees are visited independently so that each can be pruned on its own
    boolean leftIsBad = removeBadSplitNodes(criterion, currentNode.left, lastCheckRatio, lastCheckSDR, lastCheckE);
    if (leftIsBad) {
      currentNode.left = null;
    }
    boolean rightIsBad = removeBadSplitNodes(criterion, currentNode.right, lastCheckRatio, lastCheckSDR, lastCheckE);
    if (rightIsBad) {
      currentNode.right = null;
    }

    // A node with a surviving subtree has to stay, whatever its own merit is
    if (!leftIsBad || !rightIsBad) {
      return false;
    }

    double leftCount = currentNode.leftStatistics.getValue(0);
    double leftSum = currentNode.leftStatistics.getValue(1);
    double leftSumSq = currentNode.leftStatistics.getValue(2);
    double rightCount = currentNode.rightStatistics.getValue(0);
    double rightSum = currentNode.rightStatistics.getValue(1);
    double rightSumSq = currentNode.rightStatistics.getValue(2);

    double[][] postSplitDists = new double[][]{{leftCount, leftSum, leftSumSq}, {rightCount, rightSum, rightSumSq}};
    double[] preSplitDist = new double[]{(leftCount + rightCount), (leftSum + rightSum), (leftSumSq + rightSumSq)};
    double merit = criterion.getMeritOfSplit(preSplitDist, postSplitDists);

    return (merit / lastCheckSDR) < (lastCheckRatio - (2 * lastCheckE));
  }

  @Override
  public void getDescription(StringBuilder sb, int indent) {
    // TODO Auto-generated method stub
  }

  @Override
  protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) {
    // TODO Auto-generated method stub
  }
}
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GaussianNumericAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GaussianNumericAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GaussianNumericAttributeClassObserver.java new file mode 100644 index 0000000..21f58b1 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GaussianNumericAttributeClassObserver.java @@ -0,0 +1,182 @@ +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.moa.core.ObjectRepository; +import com.yahoo.labs.samoa.moa.tasks.TaskMonitor; +import com.yahoo.labs.samoa.moa.core.Utils; + +import java.util.Set; +import java.util.TreeSet; +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.conditionaltests.NumericAttributeBinaryTest; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; + +import com.yahoo.labs.samoa.moa.core.AutoExpandVector; +import com.yahoo.labs.samoa.moa.core.DoubleVector; +import com.yahoo.labs.samoa.moa.core.GaussianEstimator; +import com.yahoo.labs.samoa.moa.options.AbstractOptionHandler; +import com.github.javacliparser.IntOption; +
/**
 * Observer for a numeric attribute that keeps one Gaussian estimator per class, together with
 * the smallest and the largest attribute value seen for that class.
 * Candidate split points are evenly spaced between the overall minimum and maximum.
 * Used in naive Bayes and decision trees to monitor data statistics on leaves.
 *
 * @author Richard Kirkby ([email protected])
 * @version $Revision: 7 $
 */
public class GaussianNumericAttributeClassObserver extends AbstractOptionHandler
    implements NumericAttributeClassObserver {

  private static final long serialVersionUID = 1L;

  protected DoubleVector minValueObservedPerClass = new DoubleVector();

  protected DoubleVector maxValueObservedPerClass = new DoubleVector();

  protected AutoExpandVector<GaussianEstimator> attValDistPerClass = new AutoExpandVector<>();

  public IntOption numBinsOption = new IntOption("numBins", 'n',
      "The number of bins.", 10, 1, Integer.MAX_VALUE);

  /**
   * @param classVal index of the class
   * @return The requested Estimator if it exists, or null if not present.
   */
  public GaussianEstimator getEstimator(int classVal) {
    return this.attValDistPerClass.get(classVal);
  }

  /**
   * Feeds one weighted observation to the estimator of its class and widens that class's
   * observed value range when needed. Missing attribute values are skipped.
   */
  @Override
  public void observeAttributeClass(double attVal, int classVal, double weight) {
    if (Utils.isMissingValue(attVal)) {
      return;
    }
    GaussianEstimator estimator = this.attValDistPerClass.get(classVal);
    if (estimator != null) {
      if (attVal < this.minValueObservedPerClass.getValue(classVal)) {
        this.minValueObservedPerClass.setValue(classVal, attVal);
      }
      if (attVal > this.maxValueObservedPerClass.getValue(classVal)) {
        this.maxValueObservedPerClass.setValue(classVal, attVal);
      }
    } else {
      // First value for this class: it is both the minimum and the maximum seen so far
      estimator = new GaussianEstimator();
      this.attValDistPerClass.set(classVal, estimator);
      this.minValueObservedPerClass.setValue(classVal, attVal);
      this.maxValueObservedPerClass.setValue(classVal, attVal);
    }
    estimator.addObservation(attVal, weight);
  }

  /**
   * @return the density the class's estimator assigns to {@code attVal}, or 0.0 when the class
   *     has no estimator yet
   */
  @Override
  public double probabilityOfAttributeValueGivenClass(double attVal,
      int classVal) {
    GaussianEstimator estimator = this.attValDistPerClass.get(classVal);
    if (estimator == null) {
      return 0.0;
    }
    return estimator.probabilityDensity(attVal);
  }

  /**
   * Scores a binary split at every suggested split point and keeps the first one with the
   * highest merit.
   *
   * @return the best suggestion, or {@code null} when there is no split point to try
   */
  @Override
  public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion(
      SplitCriterion criterion, double[] preSplitDist, int attIndex,
      boolean binaryOnly) {
    double[] candidates = getSplitPointSuggestions();
    AttributeSplitSuggestion best = null;
    for (int c = 0; c < candidates.length; c++) {
      double candidate = candidates[c];
      double[][] postSplitDists = getClassDistsResultingFromBinarySplit(candidate);
      double merit = criterion.getMeritOfSplit(preSplitDist, postSplitDists);
      boolean improves = (best == null) || (merit > best.merit);
      if (improves) {
        best = new AttributeSplitSuggestion(
            new NumericAttributeBinaryTest(attIndex, candidate, true), postSplitDists, merit);
      }
    }
    return best;
  }

  /**
   * @return the distinct candidate split values in ascending order; they lie strictly between
   *     the smallest and the largest value observed over all classes
   */
  public double[] getSplitPointSuggestions() {
    double lowest = Double.POSITIVE_INFINITY;
    double highest = Double.NEGATIVE_INFINITY;
    for (int classIndex = 0; classIndex < this.attValDistPerClass.size(); classIndex++) {
      if (this.attValDistPerClass.get(classIndex) == null) {
        continue;
      }
      if (this.minValueObservedPerClass.getValue(classIndex) < lowest) {
        lowest = this.minValueObservedPerClass.getValue(classIndex);
      }
      if (this.maxValueObservedPerClass.getValue(classIndex) > highest) {
        highest = this.maxValueObservedPerClass.getValue(classIndex);
      }
    }

    // The sorted set removes duplicate candidates and fixes the order of the result
    Set<Double> candidates = new TreeSet<>();
    if (lowest < Double.POSITIVE_INFINITY) {
      double range = highest - lowest;
      for (int i = 0; i < this.numBinsOption.getValue(); i++) {
        double candidate = range / (this.numBinsOption.getValue() + 1.0) * (i + 1)
            + lowest;
        if ((candidate > lowest) && (candidate < highest)) {
          candidates.add(candidate);
        }
      }
    }

    double[] result = new double[candidates.size()];
    int next = 0;
    for (double candidate : candidates) {
      result[next] = candidate;
      next++;
    }
    return result;
  }

  /**
   * Estimates the per-class weight on each side of a binary split.
   * Values equal to {@code splitValue} are assumed to go to the left-hand side.
   *
   * @return a two-row array: left-hand distribution first, right-hand distribution second
   */
  public double[][] getClassDistsResultingFromBinarySplit(double splitValue) {
    DoubleVector leftDist = new DoubleVector();
    DoubleVector rightDist = new DoubleVector();
    for (int classIndex = 0; classIndex < this.attValDistPerClass.size(); classIndex++) {
      GaussianEstimator estimator = this.attValDistPerClass.get(classIndex);
      if (estimator == null) {
        continue;
      }
      if (splitValue < this.minValueObservedPerClass.getValue(classIndex)) {
        // Every value of this class lies above the split
        rightDist.addToValue(classIndex, estimator.getTotalWeightObserved());
      } else if (splitValue >= this.maxValueObservedPerClass.getValue(classIndex)) {
        // Every value of this class lies at or below the split
        leftDist.addToValue(classIndex, estimator.getTotalWeightObserved());
      } else {
        // The first two entries of the estimate go left, the third goes right
        double[] estimate = estimator.estimatedWeight_LessThan_EqualTo_GreaterThan_Value(splitValue);
        leftDist.addToValue(classIndex, estimate[0] + estimate[1]);
        rightDist.addToValue(classIndex, estimate[2]);
      }
    }
    return new double[][]{leftDist.getArrayRef(), rightDist.getArrayRef()};
  }

  @Override
  public void getDescription(StringBuilder sb, int indent) {
    // intentionally produces no description
  }

  @Override
  protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) {
    // nothing to prepare
  }

  /** Regression targets are not handled by this observer. */
  @Override
  public void observeAttributeTarget(double attVal, double target) {
    throw new UnsupportedOperationException("Not supported yet.");
  }
}
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GreenwaldKhannaNumericAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GreenwaldKhannaNumericAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GreenwaldKhannaNumericAttributeClassObserver.java new file mode 100644 index 0000000..3de1146 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/GreenwaldKhannaNumericAttributeClassObserver.java @@ -0,0 +1,126 @@ +package
com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.conditionaltests.NumericAttributeBinaryTest; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; +import com.yahoo.labs.samoa.moa.core.Utils; + +import com.yahoo.labs.samoa.moa.core.AutoExpandVector; +import com.yahoo.labs.samoa.moa.core.DoubleVector; +import com.yahoo.labs.samoa.moa.core.GreenwaldKhannaQuantileSummary; +import com.yahoo.labs.samoa.moa.core.ObjectRepository; +import com.yahoo.labs.samoa.moa.options.AbstractOptionHandler; +import com.github.javacliparser.IntOption; +import com.yahoo.labs.samoa.moa.tasks.TaskMonitor; + +/** + * Class for observing the class data distribution for a numeric attribute using Greenwald and Khanna methodology. + * This observer monitors the class distribution of a given attribute. + * Used in naive Bayes and decision trees to monitor data statistics on leaves. 
+ * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */
public class GreenwaldKhannaNumericAttributeClassObserver extends AbstractOptionHandler implements NumericAttributeClassObserver {

  private static final long serialVersionUID = 1L;

  // One quantile summary per class index; entries stay null until the class is first observed
  protected AutoExpandVector<GreenwaldKhannaQuantileSummary> attValDistPerClass = new AutoExpandVector<>();

  public IntOption numTuplesOption = new IntOption("numTuples", 'n',
      "The number of tuples.", 10, 1, Integer.MAX_VALUE);

  /**
   * Inserts the attribute value into the quantile summary of its class, creating the summary
   * on first use. Missing attribute values are skipped.
   */
  @Override
  public void observeAttributeClass(double attVal, int classVal, double weight) {
    if (Utils.isMissingValue(attVal)) {
      return;
    }
    GreenwaldKhannaQuantileSummary summary = this.attValDistPerClass.get(classVal);
    if (summary == null) {
      summary = new GreenwaldKhannaQuantileSummary(this.numTuplesOption.getValue());
      this.attValDistPerClass.set(classVal, summary);
    }
    // TODO: not taking weight into account
    summary.insert(attVal);
  }

  @Override
  public double probabilityOfAttributeValueGivenClass(double attVal,
      int classVal) {
    // TODO: NaiveBayes broken until implemented
    return 0.0;
  }

  /**
   * Scores a binary split at every cut point suggested by any class's summary and keeps the
   * first one with the highest merit.
   *
   * @return the best suggestion, or {@code null} when no cut point was available
   */
  @Override
  public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion(
      SplitCriterion criterion, double[] preSplitDist, int attIndex,
      boolean binaryOnly) {
    AttributeSplitSuggestion best = null;
    for (GreenwaldKhannaQuantileSummary summary : this.attValDistPerClass) {
      if (summary == null) {
        continue;
      }
      double[] cutpoints = summary.getSuggestedCutpoints();
      for (int c = 0; c < cutpoints.length; c++) {
        double cutpoint = cutpoints[c];
        double[][] postSplitDists = getClassDistsResultingFromBinarySplit(cutpoint);
        double merit = criterion.getMeritOfSplit(preSplitDist, postSplitDists);
        boolean improves = (best == null) || (merit > best.merit);
        if (improves) {
          best = new AttributeSplitSuggestion(
              new NumericAttributeBinaryTest(attIndex, cutpoint, true), postSplitDists, merit);
        }
      }
    }
    return best;
  }

  /**
   * Splits every class's observation count at {@code splitValue}.
   * Values equal to {@code splitValue} are assumed to go to the left-hand side.
   *
   * @return a two-row array: left-hand distribution first, right-hand distribution second
   */
  public double[][] getClassDistsResultingFromBinarySplit(double splitValue) {
    DoubleVector leftDist = new DoubleVector();
    DoubleVector rightDist = new DoubleVector();
    for (int classIndex = 0; classIndex < this.attValDistPerClass.size(); classIndex++) {
      GreenwaldKhannaQuantileSummary summary = this.attValDistPerClass.get(classIndex);
      if (summary == null) {
        continue;
      }
      long below = summary.getCountBelow(splitValue);
      leftDist.addToValue(classIndex, below);
      rightDist.addToValue(classIndex, summary.getTotalCount() - below);
    }
    return new double[][]{leftDist.getArrayRef(), rightDist.getArrayRef()};
  }

  @Override
  public void getDescription(StringBuilder sb, int indent) {
    // TODO Auto-generated method stub
  }

  @Override
  protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) {
    // TODO Auto-generated method stub
  }

  /** Regression targets are not handled by this observer. */
  @Override
  public void observeAttributeTarget(double attVal, double target) {
    throw new UnsupportedOperationException("Not supported yet.");
  }
}
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NominalAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NominalAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NominalAttributeClassObserver.java new file mode 100644 index 0000000..d605e84 --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NominalAttributeClassObserver.java @@ -0,0 +1,178 @@ +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.conditionaltests.NominalAttributeBinaryTest; +import com.yahoo.labs.samoa.moa.classifiers.core.conditionaltests.NominalAttributeMultiwayTest; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; +import com.yahoo.labs.samoa.moa.core.ObjectRepository; +import com.yahoo.labs.samoa.moa.tasks.TaskMonitor; +import com.yahoo.labs.samoa.moa.core.Utils; + +import com.yahoo.labs.samoa.moa.core.AutoExpandVector; +import com.yahoo.labs.samoa.moa.core.DoubleVector; +import com.yahoo.labs.samoa.moa.options.AbstractOptionHandler; + +/** + * Class for observing the class data distribution for a nominal attribute. + * This observer monitors the class distribution of a given attribute. + * Used in naive Bayes and decision trees to monitor data statistics on leaves. 
+ * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */
public class NominalAttributeClassObserver extends AbstractOptionHandler implements DiscreteAttributeClassObserver {

  private static final long serialVersionUID = 1L;

  // Weight of every observation seen, missing attribute values included
  protected double totalWeightObserved = 0.0;

  // Weight of the observations whose attribute value was missing
  protected double missingWeightObserved = 0.0;

  // Indexed by class; each entry holds the weight seen per attribute value of that class
  public AutoExpandVector<DoubleVector> attValDistPerClass = new AutoExpandVector<>();

  /**
   * Adds the weight of one observation to the counter of its (class, attribute value) pair,
   * or to the missing-value weight when the attribute value is missing.
   */
  @Override
  public void observeAttributeClass(double attVal, int classVal, double weight) {
    if (!Utils.isMissingValue(attVal)) {
      int valueIndex = (int) attVal;
      DoubleVector weightPerValue = this.attValDistPerClass.get(classVal);
      if (weightPerValue == null) {
        weightPerValue = new DoubleVector();
        this.attValDistPerClass.set(classVal, weightPerValue);
      }
      weightPerValue.addToValue(valueIndex, weight);
    } else {
      this.missingWeightObserved += weight;
    }
    this.totalWeightObserved += weight;
  }

  /**
   * @return (weight of the value within the class + 1) divided by (total weight of the class +
   *     number of values recorded for the class), or 0.0 when the class was never observed
   */
  @Override
  public double probabilityOfAttributeValueGivenClass(double attVal,
      int classVal) {
    DoubleVector weightPerValue = this.attValDistPerClass.get(classVal);
    if (weightPerValue == null) {
      return 0.0;
    }
    return (weightPerValue.getValue((int) attVal) + 1.0)
        / (weightPerValue.sumOfValues() + weightPerValue.numValues());
  }

  public double totalWeightOfClassObservations() {
    return this.totalWeightObserved;
  }

  public double weightOfObservedMissingValues() {
    return this.missingWeightObserved;
  }

  /**
   * Scores the multiway split (unless {@code binaryOnly} is set) and one binary split per
   * observed attribute value, and keeps the first candidate with the highest merit.
   *
   * @return the best suggestion, or {@code null} when there was no candidate to score
   */
  @Override
  public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion(
      SplitCriterion criterion, double[] preSplitDist, int attIndex,
      boolean binaryOnly) {
    int numValues = getMaxAttValsObserved();
    AttributeSplitSuggestion best = null;
    if (!binaryOnly) {
      double[][] multiwayDists = getClassDistsResultingFromMultiwaySplit(numValues);
      double multiwayMerit = criterion.getMeritOfSplit(preSplitDist, multiwayDists);
      best = new AttributeSplitSuggestion(
          new NominalAttributeMultiwayTest(attIndex), multiwayDists, multiwayMerit);
    }
    int valueIndex = 0;
    while (valueIndex < numValues) {
      double[][] binaryDists = getClassDistsResultingFromBinarySplit(valueIndex);
      double binaryMerit = criterion.getMeritOfSplit(preSplitDist, binaryDists);
      boolean improves = (best == null) || (binaryMerit > best.merit);
      if (improves) {
        best = new AttributeSplitSuggestion(
            new NominalAttributeBinaryTest(attIndex, valueIndex), binaryDists, binaryMerit);
      }
      valueIndex++;
    }
    return best;
  }

  /** @return the largest number of attribute values recorded for any single class */
  public int getMaxAttValsObserved() {
    int most = 0;
    for (DoubleVector weightPerValue : this.attValDistPerClass) {
      if (weightPerValue == null) {
        continue;
      }
      if (weightPerValue.numValues() > most) {
        most = weightPerValue.numValues();
      }
    }
    return most;
  }

  /**
   * @param maxAttValsObserved number of branches of the split
   * @return one class distribution per attribute value
   */
  public double[][] getClassDistsResultingFromMultiwaySplit(
      int maxAttValsObserved) {
    DoubleVector[] perValue = new DoubleVector[maxAttValsObserved];
    for (int v = 0; v < perValue.length; v++) {
      perValue[v] = new DoubleVector();
    }
    for (int classIndex = 0; classIndex < this.attValDistPerClass.size(); classIndex++) {
      DoubleVector weightPerValue = this.attValDistPerClass.get(classIndex);
      if (weightPerValue == null) {
        continue;
      }
      for (int v = 0; v < weightPerValue.numValues(); v++) {
        perValue[v].addToValue(classIndex, weightPerValue.getValue(v));
      }
    }
    double[][] result = new double[maxAttValsObserved][];
    for (int v = 0; v < result.length; v++) {
      result[v] = perValue[v].getArrayRef();
    }
    return result;
  }

  /**
   * @param valIndex attribute value that is separated from all the others
   * @return a two-row array: class distribution of {@code valIndex} first, class distribution
   *     of every other value second
   */
  public double[][] getClassDistsResultingFromBinarySplit(int valIndex) {
    DoubleVector matching = new DoubleVector();
    DoubleVector others = new DoubleVector();
    for (int classIndex = 0; classIndex < this.attValDistPerClass.size(); classIndex++) {
      DoubleVector weightPerValue = this.attValDistPerClass.get(classIndex);
      if (weightPerValue == null) {
        continue;
      }
      for (int v = 0; v < weightPerValue.numValues(); v++) {
        DoubleVector target = (v == valIndex) ? matching : others;
        target.addToValue(classIndex, weightPerValue.getValue(v));
      }
    }
    return new double[][]{matching.getArrayRef(), others.getArrayRef()};
  }

  @Override
  public void getDescription(StringBuilder sb, int indent) {
    // TODO Auto-generated method stub
  }

  @Override
  protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) {
    // TODO Auto-generated method stub
  }

  /** Regression targets are not handled by this observer. */
  @Override
  public void observeAttributeTarget(double attVal, double target) {
    throw new UnsupportedOperationException("Not supported yet.");
  }

}
http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NullAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NullAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NullAttributeClassObserver.java new file mode 100644 index 0000000..def0666 --- /dev/null +++
b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NullAttributeClassObserver.java @@ -0,0 +1,80 @@ +package com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +import com.yahoo.labs.samoa.moa.classifiers.core.AttributeSplitSuggestion; +import com.yahoo.labs.samoa.moa.classifiers.core.splitcriteria.SplitCriterion; +import com.yahoo.labs.samoa.moa.core.ObjectRepository; +import com.yahoo.labs.samoa.moa.options.AbstractOptionHandler; +import com.yahoo.labs.samoa.moa.tasks.TaskMonitor; + +/** + * Class for observing the class data distribution for a null attribute. + * This class is used to disable the observation for an attribute. + * Used in decision trees to monitor data statistics on leaves. 
+ * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public class NullAttributeClassObserver extends AbstractOptionHandler implements AttributeClassObserver { + + private static final long serialVersionUID = 1L; + + /** + * No-op: the observation is discarded and no state is updated. + */ + @Override + public void observeAttributeClass(double attVal, int classVal, double weight) { + } + + /** + * Ignores both arguments. + * + * @return always {@code 0.0} + */ + @Override + public double probabilityOfAttributeValueGivenClass(double attVal, + int classVal) { + return 0.0; + } + + /** + * @return always {@code 0.0} + */ + public double totalWeightOfClassObservations() { + return 0.0; + } + + /** + * @return always {@code 0.0} + */ + public double weightOfObservedMissingValues() { + return 0.0; + } + + /** + * Currently a stub: nothing is appended to {@code sb}. + */ + @Override + public void getDescription(StringBuilder sb, int indent) { + // TODO Auto-generated method stub + } + + /** + * Never proposes a split; all arguments are ignored. + * + * @return always {@code null} + */ + @Override + public AttributeSplitSuggestion getBestEvaluatedSplitSuggestion( + SplitCriterion criterion, double[] preSplitDist, int attIndex, + boolean binaryOnly) { + return null; + } + + /** + * Currently a stub: no preparation work is performed. + */ + @Override + protected void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) { + // TODO Auto-generated method stub + } + + /** + * Not implemented: unlike {@link #observeAttributeClass}, this call is rejected rather than ignored. + * + * @throws UnsupportedOperationException always + */ + @Override + public void observeAttributeTarget(double attVal, double target) { + throw new UnsupportedOperationException("Not supported yet."); + } +} http://git-wip-us.apache.org/repos/asf/incubator-samoa/blob/787864b6/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NumericAttributeClassObserver.java ---------------------------------------------------------------------- diff --git a/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NumericAttributeClassObserver.java b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NumericAttributeClassObserver.java new file mode 100644 index 0000000..1660d5f --- /dev/null +++ b/samoa-api/src/main/java/com/yahoo/labs/samoa/moa/classifiers/core/attributeclassobservers/NumericAttributeClassObserver.java @@ -0,0 +1,34 @@ +package 
com.yahoo.labs.samoa.moa.classifiers.core.attributeclassobservers; + +/* + * #%L + * SAMOA + * %% + * Copyright (C) 2007 University of Waikato, Hamilton, New Zealand + * %% + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * #L% + */ + +/** + * Interface for observing the class data distribution for a numeric attribute. + * This observer monitors the class distribution of a given attribute. + * Used in naive Bayes and decision trees to monitor data statistics on leaves. + * + * <p>This interface declares no members of its own; everything it offers is + * inherited from {@link AttributeClassObserver}. + * + * @author Richard Kirkby ([email protected]) + * @version $Revision: 7 $ + */ +public interface NumericAttributeClassObserver extends AttributeClassObserver { + + +}
