classifier: AbstractVectorClassifier.java OnlineLearner.java

tdunning Mon, 16 Aug 2010 07:15:17 -0700

Author: tdunning
Date: Mon Aug 16 14:13:55 2010
New Revision: 985948

URL: http://svn.apache.org/viewvc?rev=985948&view=rev
Log:
MAHOUT-228 Interfaces for on-line classifiers


Added:
    
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java

Added: 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java?rev=985948&view=auto
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
 (added)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
 Mon Aug 16 14:13:55 2010
@@ -0,0 +1,156 @@
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Defines the interface for classifiers that take input as a vector.  This is 
implemented
+ * as an abstract class so that it can implement a number of handy convenience 
methods
+ * related to classification of vectors.
+ */
+public abstract class AbstractVectorClassifier {
+  // ------ These are all that are necessary to define a vector classifier.
+
+  /**
+   * Returns the number of categories for the target variable.  A vector 
classifier
+   * will encode it's output using a zero-based 1 of numCategories encoding.
+   * @return The number of categories.
+   */
+  public abstract int numCategories();
+
+  /**
+   * Classify a vector returning a vector of numCategories-1 scores.  It is 
assumed that
+   * the score for the missing category is one minus the sum of the scores 
that are returned.
+   *
+   * Note that the missing score is the 0-th score.
+   * @param instance  A feature vector to be classified.
+   * @return  A vector of probabilities in 1 of n-1 encoding.
+   */
+  public abstract Vector classify(Vector instance);
+
+  /**
+   * Classifies a vector in the special case of a binary classifier where
+   * <code>classify(Vector)</code> would return a vector with only one 
element.  As such,
+   * using this method can void the allocation of a vector.
+   * @param instance   The feature vector to be classified.
+   * @return The score for category 1.
+   *
+   * @see #classify(Vector)
+   */
+  public abstract double classifyScalar(Vector instance);
+
+  // ------- From here on, we have convenience methods that provide an easier 
API to use.
+
+  /**
+   * Returns n probabilities, one for each category.  If you can use an n-1 
coding, and are touchy
+   * about allocation performance, then the classify method is probably better 
to use.  The 0-th
+   * element of the score vector returned by this method is the missing score 
as computed by the
+   * classify method.
+   *
+   * @see #classify(Vector)
+   * @see #classifyFull(Vector r, Vector instance)
+   *
+   * @param instance A vector of features to be classified.
+   * @return A vector of probabilities, one for each category.
+   */
+  public Vector classifyFull(Vector instance) {
+    return classifyFull(new DenseVector(numCategories()), instance);
+  }
+
+  /**
+   * Returns n probabilities, one for each category into a pre-allocated 
vector.  One
+   * vector allocation is still done in the process of multiplying by the 
coefficient
+   * matrix, but that is hard to avoid.  The cost of such an ephemeral 
allocation is
+   * very small in any case compared to the multiplication itself.
+   *
+   * @param r        Where to put the results.
+   * @param instance A vector of features to be classified.
+   * @return A vector of probabilities, one for each category.
+   */
+  public Vector classifyFull(Vector r, Vector instance) {
+    r.viewPart(1, numCategories() - 1).assign(classify(instance));
+    r.setQuick(0, 1 - r.zSum());
+    return r;
+  }
+
+
+  /**
+   * Returns n-1 probabilities, one for each category but the last, for each 
row of a matrix. The
+   * probability of the missing 0-th category is 1 - rowSum(this result).
+   *
+   * @param data The matrix whose rows are vectors to classify
+   * @return A matrix of scores, one row per row of the input matrix, one 
column for each but the
+   *         last category.
+   */
+  public Matrix classify(Matrix data) {
+    Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
+    for (int row = 0; row < data.numRows(); row++) {
+      r.assignRow(row, classify(data.getRow(row)));
+    }
+    return r;
+  }
+
+  /**
+   * Returns n probabilities, one for each category, for each row of a matrix.
+   *
+   * @param data The matrix whose rows are vectors to classify
+   * @return A matrix of scores, one row per row of the input matrix, one 
column for each but the
+   *         last category.
+   */
+  public Matrix classifyFull(Matrix data) {
+    Matrix r = new DenseMatrix(data.numRows(), numCategories());
+    for (int row = 0; row < data.numRows(); row++) {
+      classifyFull(r.viewRow(row), data.getRow(row));
+    }
+    return r;
+  }
+
+  /**
+   * Returns a vector of probabilities of the first category, one for each row 
of a matrix. This
+   * only makes sense if there are exactly two categories, but calling this 
method in that case can
+   * save a number of vector allocations.
+   *
+   * @param data The matrix whose rows are vectors to classify
+   * @return A vector of scores, with one value per row of the input matrix.
+   */
+  public Vector classifyScalar(Matrix data) {
+    if (numCategories() != 2) {
+      throw new IllegalArgumentException("Can only call classifyScalar with 
two categories");
+    }
+
+    Vector r = new DenseVector(data.numRows());
+    for (int row = 0; row < data.numRows(); row++) {
+      r.set(row, classifyScalar(data.getRow(row)));
+    }
+    return r;
+  }
+
+  /**
+   * Returns a measure of how good the classification for a particular example 
actually is.
+   *
+   * @param actual The correct category for the example.
+   * @param data   The vector to be classified.
+   * @return The log likelihood of the correct answer as estimated by the 
current model.  This will
+   *         always be <= 0 and larger (closer to 0) indicates better 
accuracy.  In order to simplify
+   *         code that maintains running averages, we bound this value at -100.
+   */
+  public double logLikelihood(int actual, Vector data) {
+    if (numCategories() == 2) {
+      double p = classifyScalar(data);
+      if (actual > 0) {
+        return Math.max(-100, Math.log(p));
+      } else {
+        return Math.max(-100, Math.log(1 - p));
+      }
+    } else {
+      Vector p = classify(data);
+      if (actual > 0) {
+        return Math.max(-100, Math.log(p.get(actual - 1)));
+      } else {
+        return Math.max(-100, Math.log(1 - p.zSum()));
+      }
+    }
+  }
+}

Added: 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java?rev=985948&view=auto
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java 
(added)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java 
Mon Aug 16 14:13:55 2010
@@ -0,0 +1,54 @@
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * The simplest interface for online learning algorithms.
+ */
+public interface OnlineLearner {
+  /**
+   * Updates the model using a particular target variable value and a feature 
vector.
+   * <p/>
+   * There may an assumption that if multiple passes through the training data 
are necessary, then
+   * the training examples will be presented in the same order.  This is 
because the order of
+   * training examples may be used to assign records to different data splits 
for evaluation by
+   * cross-validation.  Without the order invariance, records might be 
assigned to training and test
+   * splits and error estimates could be seriously affected.
+   * <p/>
+   * If re-ordering is necessary, then using the alternative API which allows 
a tracking key to be
+   * added to the training example can be used.
+   *
+   * @param actual   The value of the target variable.  This value should be 
in the half-open
+   *                 interval [0..n) where n is the number of target 
categories.
+   * @param instance The feature vector for this example.
+   */
+  void train(int actual, Vector instance);
+
+  /**
+   * Updates the model using a particular target variable value and a feature 
vector.
+   * <p/>
+   * There may an assumption that if multiple passes through the training data 
are necessary that
+   * the tracking key for a record will be the same for each pass and that 
there will be a
+   * relatively large number of distinct tracking keys and that the low-order 
bits of the tracking
+   * keys will not correlate with any of the input variables.  This tracking 
key is used to assign
+   * training examples to different test/training splits.
+   * <p/>
+   * Examples of useful tracking keys include id-numbers for the training 
records derived from
+   * a database id for the base table from the which the record is derived, or 
the offset of
+   * the original data record in a data file.
+   *
+   * @param trackingKey The tracking key for this training example.
+   * @param actual   The value of the target variable.  This value should be 
in the half-open
+   *                 interval [0..n) where n is the number of target 
categories.
+   * @param instance The feature vector for this example.
+   */
+  void train(int trackingKey, int actual, Vector instance);
+
+  /**
+   * Prepares the classifier for classification and deallocates any temporary 
data structures.
+   *
+   * An online classifier should be able to accept more training after being 
closed, but
+   * closing the classifier may make classification more efficient.
+   */
+  void close();
+}

svn commit: r985948 - in /mahout/trunk/core/src/main/java/org/apache/mahout/classifier: AbstractVectorClassifier.java OnlineLearner.java

Reply via email to