Author: tdunning
Date: Mon Aug 16 14:13:55 2010
New Revision: 985948
URL: http://svn.apache.org/viewvc?rev=985948&view=rev
Log:
MAHOUT-228 Interfaces for on-line classifiers
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java?rev=985948&view=auto
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
(added)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/AbstractVectorClassifier.java
Mon Aug 16 14:13:55 2010
@@ -0,0 +1,156 @@
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.DenseMatrix;
+import org.apache.mahout.math.DenseVector;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.Vector;
+
+/**
+ * Defines the interface for classifiers that take input as a vector. This is
implemented
+ * as an abstract class so that it can implement a number of handy convenience
methods
+ * related to classification of vectors.
+ */
+public abstract class AbstractVectorClassifier {
+ // ------ These are all that are necessary to define a vector classifier.
+
+ /**
+ * Returns the number of categories for the target variable. A vector
classifier
+ * will encode it's output using a zero-based 1 of numCategories encoding.
+ * @return The number of categories.
+ */
+ public abstract int numCategories();
+
+ /**
+ * Classify a vector returning a vector of numCategories-1 scores. It is
assumed that
+ * the score for the missing category is one minus the sum of the scores
that are returned.
+ *
+ * Note that the missing score is the 0-th score.
+ * @param instance A feature vector to be classified.
+ * @return A vector of probabilities in 1 of n-1 encoding.
+ */
+ public abstract Vector classify(Vector instance);
+
+ /**
+ * Classifies a vector in the special case of a binary classifier where
+ * <code>classify(Vector)</code> would return a vector with only one
element. As such,
+ * using this method can void the allocation of a vector.
+ * @param instance The feature vector to be classified.
+ * @return The score for category 1.
+ *
+ * @see #classify(Vector)
+ */
+ public abstract double classifyScalar(Vector instance);
+
+ // ------- From here on, we have convenience methods that provide an easier
API to use.
+
+ /**
+ * Returns n probabilities, one for each category. If you can use an n-1
coding, and are touchy
+ * about allocation performance, then the classify method is probably better
to use. The 0-th
+ * element of the score vector returned by this method is the missing score
as computed by the
+ * classify method.
+ *
+ * @see #classify(Vector)
+ * @see #classifyFull(Vector r, Vector instance)
+ *
+ * @param instance A vector of features to be classified.
+ * @return A vector of probabilities, one for each category.
+ */
+ public Vector classifyFull(Vector instance) {
+ return classifyFull(new DenseVector(numCategories()), instance);
+ }
+
+ /**
+ * Returns n probabilities, one for each category into a pre-allocated
vector. One
+ * vector allocation is still done in the process of multiplying by the
coefficient
+ * matrix, but that is hard to avoid. The cost of such an ephemeral
allocation is
+ * very small in any case compared to the multiplication itself.
+ *
+ * @param r Where to put the results.
+ * @param instance A vector of features to be classified.
+ * @return A vector of probabilities, one for each category.
+ */
+ public Vector classifyFull(Vector r, Vector instance) {
+ r.viewPart(1, numCategories() - 1).assign(classify(instance));
+ r.setQuick(0, 1 - r.zSum());
+ return r;
+ }
+
+
+ /**
+ * Returns n-1 probabilities, one for each category but the last, for each
row of a matrix. The
+ * probability of the missing 0-th category is 1 - rowSum(this result).
+ *
+ * @param data The matrix whose rows are vectors to classify
+ * @return A matrix of scores, one row per row of the input matrix, one
column for each but the
+ * last category.
+ */
+ public Matrix classify(Matrix data) {
+ Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
+ for (int row = 0; row < data.numRows(); row++) {
+ r.assignRow(row, classify(data.getRow(row)));
+ }
+ return r;
+ }
+
+ /**
+ * Returns n probabilities, one for each category, for each row of a matrix.
+ *
+ * @param data The matrix whose rows are vectors to classify
+ * @return A matrix of scores, one row per row of the input matrix, one
column for each but the
+ * last category.
+ */
+ public Matrix classifyFull(Matrix data) {
+ Matrix r = new DenseMatrix(data.numRows(), numCategories());
+ for (int row = 0; row < data.numRows(); row++) {
+ classifyFull(r.viewRow(row), data.getRow(row));
+ }
+ return r;
+ }
+
+ /**
+ * Returns a vector of probabilities of the first category, one for each row
of a matrix. This
+ * only makes sense if there are exactly two categories, but calling this
method in that case can
+ * save a number of vector allocations.
+ *
+ * @param data The matrix whose rows are vectors to classify
+ * @return A vector of scores, with one value per row of the input matrix.
+ */
+ public Vector classifyScalar(Matrix data) {
+ if (numCategories() != 2) {
+ throw new IllegalArgumentException("Can only call classifyScalar with
two categories");
+ }
+
+ Vector r = new DenseVector(data.numRows());
+ for (int row = 0; row < data.numRows(); row++) {
+ r.set(row, classifyScalar(data.getRow(row)));
+ }
+ return r;
+ }
+
+ /**
+ * Returns a measure of how good the classification for a particular example
actually is.
+ *
+ * @param actual The correct category for the example.
+ * @param data The vector to be classified.
+ * @return The log likelihood of the correct answer as estimated by the
current model. This will
+ * always be <= 0 and larger (closer to 0) indicates better
accuracy. In order to simplify
+ * code that maintains running averages, we bound this value at -100.
+ */
+ public double logLikelihood(int actual, Vector data) {
+ if (numCategories() == 2) {
+ double p = classifyScalar(data);
+ if (actual > 0) {
+ return Math.max(-100, Math.log(p));
+ } else {
+ return Math.max(-100, Math.log(1 - p));
+ }
+ } else {
+ Vector p = classify(data);
+ if (actual > 0) {
+ return Math.max(-100, Math.log(p.get(actual - 1)));
+ } else {
+ return Math.max(-100, Math.log(1 - p.zSum()));
+ }
+ }
+ }
+}
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java?rev=985948&view=auto
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
(added)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/OnlineLearner.java
Mon Aug 16 14:13:55 2010
@@ -0,0 +1,54 @@
+package org.apache.mahout.classifier;
+
+import org.apache.mahout.math.Vector;
+
+/**
+ * The simplest interface for online learning algorithms.
+ */
+public interface OnlineLearner {
+ /**
+ * Updates the model using a particular target variable value and a feature
vector.
+ * <p/>
+ * There may an assumption that if multiple passes through the training data
are necessary, then
+ * the training examples will be presented in the same order. This is
because the order of
+ * training examples may be used to assign records to different data splits
for evaluation by
+ * cross-validation. Without the order invariance, records might be
assigned to training and test
+ * splits and error estimates could be seriously affected.
+ * <p/>
+ * If re-ordering is necessary, then using the alternative API which allows
a tracking key to be
+ * added to the training example can be used.
+ *
+ * @param actual The value of the target variable. This value should be
in the half-open
+ * interval [0..n) where n is the number of target
categories.
+ * @param instance The feature vector for this example.
+ */
+ void train(int actual, Vector instance);
+
+ /**
+ * Updates the model using a particular target variable value and a feature
vector.
+ * <p/>
+ * There may an assumption that if multiple passes through the training data
are necessary that
+ * the tracking key for a record will be the same for each pass and that
there will be a
+ * relatively large number of distinct tracking keys and that the low-order
bits of the tracking
+ * keys will not correlate with any of the input variables. This tracking
key is used to assign
+ * training examples to different test/training splits.
+ * <p/>
+ * Examples of useful tracking keys include id-numbers for the training
records derived from
+ * a database id for the base table from the which the record is derived, or
the offset of
+ * the original data record in a data file.
+ *
+ * @param trackingKey The tracking key for this training example.
+ * @param actual The value of the target variable. This value should be
in the half-open
+ * interval [0..n) where n is the number of target
categories.
+ * @param instance The feature vector for this example.
+ */
+ void train(int trackingKey, int actual, Vector instance);
+
+ /**
+ * Prepares the classifier for classification and deallocates any temporary
data structures.
+ *
+ * An online classifier should be able to accept more training after being
closed, but
+ * closing the classifier may make classification more efficient.
+ */
+ void close();
+}