Author: tdunning
Date: Fri Sep 17 21:32:55 2010
New Revision: 998337
URL: http://svn.apache.org/viewvc?rev=998337&view=rev
Log:
Split text encoding into two phases so that a document can be built up
line by line. Also reverted addToVector so that it performs both phases
(addText followed by flush).
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=998337&r1=998336&r2=998337&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
Fri Sep 17 21:32:55 2010
@@ -21,7 +21,6 @@ import com.google.common.base.Charsets;
import com.google.common.collect.Sets;
import org.apache.mahout.math.Vector;
-import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
@@ -276,8 +275,4 @@ public abstract class FeatureVectorEncod
protected byte[] bytesForString(String x){
return x.getBytes(Charsets.UTF_8);
}
-
- public void flush(double weight, Vector data) {
- // default is to do nothing
- }
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java?rev=998337&r1=998336&r2=998337&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
Fri Sep 17 21:32:55 2010
@@ -54,17 +54,36 @@ public class TextValueEncoder extends Fe
*/
@Override
public void addToVector(byte[] originalForm, double weight, Vector data) {
- for (String word : tokenize(new String(originalForm))) {
+ addText(originalForm);
+ flush(weight, data);
+ }
+
+ /**
+ * Adds text to the internal word counter, but delays converting it to vector
+ * form until flush is called.
+ * @param originalForm The original text encoded as UTF-8
+ */
+ public void addText(byte[] originalForm) {
+ String text = new String(originalForm, Charsets.UTF_8);
+ addText(text);
+ }
+
+ /**
+ * Adds text to the internal word counter, but delays converting it to vector
+ * form until flush is called.
+ * @param text The original text as an already-decoded string
+ */
+ public void addText(String text) {
+ for (String word : tokenize(text)) {
counts.add(word);
}
}
/**
* Adds all of the tokens that we counted up to a vector.
- * @param weight
+ * @param weight The weight
* @param data
*/
- @Override
public void flush(double weight, Vector data) {
for (String word : counts.elementSet()) {
// weight words by log_2(tf) times whatever other weight we are given
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java?rev=998337&r1=998336&r2=998337&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java
Fri Sep 17 21:32:55 2010
@@ -40,7 +40,7 @@ public final class TextValueEncoderTest
// now some fancy weighting
StaticWordValueEncoder w = new StaticWordValueEncoder("text");
- w.setDictionary(ImmutableMap.<byte[],
Double>of("word1".getBytes(Charsets.UTF_8), 3.0,
"word2".getBytes(Charsets.UTF_8), 1.5));
+ w.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2",
1.5));
enc.setWordEncoder(w);
// should set 6 locations to something