Author: tdunning
Date: Fri Aug 20 03:23:19 2010
New Revision: 987369
URL: http://svn.apache.org/viewvc?rev=987369&view=rev
Log:
Setting up for interaction effects in vector encoding.
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=987369&r1=987368&r2=987369&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java Fri Aug 20 03:23:19 2010
@@ -110,6 +110,29 @@ public abstract class FeatureVectorEncod
}
/**
+ * Hash four strings and an integer into the range [0..numFeatures-1].
+ *
+ * @param term1 The first string.
+ * @param term2 The second string.
+ * @param term3 The third string
+ * @param term4 And the fourth.
+ * @param probe An integer that modifies the resulting hash.
+ * @param numFeatures The range into which the resulting hash must fit.
+ * @return An integer in the range [0..numFeatures-1] that has good spread for small changes in
+ * term and probe.
+ */
+ protected int hash(String term1, String term2, String term3, String term4, int probe, int numFeatures) {
+ long r = MurmurHash.hash64A(term1.getBytes(Charset.forName("UTF-8")), probe);
+ r = MurmurHash.hash64A(term2.getBytes(Charset.forName("UTF-8")), (int) r) % numFeatures;
+ r = MurmurHash.hash64A(term3.getBytes(Charset.forName("UTF-8")), (int) r) % numFeatures;
+ r = MurmurHash.hash64A(term4.getBytes(Charset.forName("UTF-8")), (int) r) % numFeatures;
+ if (r < 0) {
+ r += numFeatures;
+ }
+ return (int) r;
+ }
+
+ /**
* Converts a value into a form that would help a human understand the internals of how the value
* is being interpreted. For text-like things, this is likely to be a list of the terms found
* with associated weights (if any).
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java?rev=987369&r1=987368&r2=987369&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectors/InteractionValueEncoder.java Fri Aug 20 03:23:19 2010
@@ -21,62 +21,56 @@ import org.apache.mahout.math.Vector;
public class InteractionValueEncoder extends FeatureVectorEncoder {
- protected static final int INTERACTION_VALUE_HASH_SEED_1 = 100;
- protected static final int INTERACTION_VALUE_HASH_SEED_2 = 200;
-
- protected InteractionValueEncoder(String name) {
- super(name, 2);
- }
-
- /**
- * Adds a value to a vector.
- *
- * @param originalForm The original form of the first value as a string.
- * @param data The vector to which the value should be added.
- */
- @Override
- public void addToVector(String originalForm, double w, Vector data) {
- }
-
- /**
- * Adds a value to a vector.
- *
- * @param originalForm1 The original form of the first value as a string.
- * @param originalForm2 The original form of the second value as a string.
- * @param data The vector to which the value should be added.
- */
- public void addInteractionToVector(String originalForm1, String originalForm2, Vector data) {
- int probes = getProbes();
- String name = getName();
- for (int i = 0; i < probes; i++) {
- int h1 = hash1(name, originalForm1, i, data.size());
- int h2 = hash2(name, originalForm1, i, data.size());
- int j = hash1(name, originalForm2, i, data.size());
- int n = (h1 + j*h2) % data.size();
- trace(String.format("%s:%s", originalForm1, originalForm2), n);
- data.set(n, data.get(n) + 1);
- }
- }
-
- /**
- * Converts a value into a form that would help a human understand the internals of how the value
- * is being interpreted. For text-like things, this is likely to be a list of the terms found with
- * associated weights (if any).
- *
- * @param originalForm The original form of the value as a string.
- * @return A string that a human can read.
- */
- @Override
- public String asString(String originalForm) {
- return String.format("%s:%s", getName(), originalForm);
- }
-
- protected int hash1(String term1, String term2, int probe, int numFeatures) {
- return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_1,numFeatures);
- }
-
- protected int hash2(String term1, String term2, int probe, int numFeatures) {
- return hash(term1, term2, probe+INTERACTION_VALUE_HASH_SEED_2,numFeatures);
- }
+ protected static final int INTERACTION_VALUE_HASH_SEED_1 = 100;
+ protected static final int INTERACTION_VALUE_HASH_SEED_2 = 200;
+ private String name1;
+ private String name2;
+
+ protected InteractionValueEncoder(String name1, String name2) {
+ super(name1 + ":" + name2, 2);
+ this.name1 = name1;
+ this.name2 = name2;
+ }
+
+ /**
+ * Adds a value to a vector.
+ *
+ * @param originalForm The original form of the first value as a string.
+ * @param data The vector to which the value should be added.
+ */
+ @Override
+ public void addToVector(String originalForm, double w, Vector data) {
+ throw new UnsupportedOperationException("Must have two arguments to encode interaction");
+ }
+
+ /**
+ * Adds a value to a vector.
+ *
+ * @param originalForm1 The original form of the first value as a string.
+ * @param originalForm2 The original form of the second value as a string.
+ * @param data The vector to which the value should be added.
+ */
+ public void addToVector(String originalForm1, String originalForm2, Vector data) {
+ int probes = getProbes();
+ String name = getName();
+ for (int i = 0; i < probes; i++) {
+ int n = hash(name1, originalForm1, name2, originalForm2, i, data.size());
+ trace(String.format("%s:%s", originalForm1, originalForm2), n);
+ data.set(n, data.get(n) + 1);
+ }
+ }
+
+ /**
+ * Converts a value into a form that would help a human understand the internals of how the
+ * value is being interpreted. For text-like things, this is likely to be a list of the terms
+ * found with associated weights (if any).
+ *
+ * @param originalForm The original form of the value as a string.
+ * @return A string that a human can read.
+ */
+ @Override
+ public String asString(String originalForm) {
+ return String.format("%s:%s", getName(), originalForm);
+ }
}