Author: tdunning
Date: Fri Sep 17 21:32:55 2010
New Revision: 998337

URL: http://svn.apache.org/viewvc?rev=998337&view=rev
Log:
Split text encoding into two phases so that a document can be built up
line by line.  Also reverted addToVector so that it performs both phases
(addText followed by flush).

Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java?rev=998337&r1=998336&r2=998337&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/FeatureVectorEncoder.java
 Fri Sep 17 21:32:55 2010
@@ -21,7 +21,6 @@ import com.google.common.base.Charsets;
 import com.google.common.collect.Sets;
 import org.apache.mahout.math.Vector;
 
-import java.nio.charset.Charset;
 import java.util.Collections;
 import java.util.Map;
 import java.util.Set;
@@ -276,8 +275,4 @@ public abstract class FeatureVectorEncod
   protected byte[] bytesForString(String x){
     return x.getBytes(Charsets.UTF_8);
   }
-
-  public void flush(double weight, Vector data) {
-    // default is to do nothing
-  }
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java?rev=998337&r1=998336&r2=998337&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java 
(original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectors/TextValueEncoder.java 
Fri Sep 17 21:32:55 2010
@@ -54,17 +54,36 @@ public class TextValueEncoder extends Fe
    */
   @Override
   public void addToVector(byte[] originalForm, double weight, Vector data) {
-    for (String word : tokenize(new String(originalForm))) {
+    addText(originalForm);
+    flush(weight, data);
+  }
+
+  /**
+   * Adds text to the internal word counter, but delays converting it to vector
+   * form until flush is called.
+   * @param originalForm  The original text encoded as UTF-8
+   */
+  public void addText(byte[] originalForm) {
+    String text = new String(originalForm, Charsets.UTF_8);
+    addText(text);
+  }
+
+  /**
+   * Adds text to the internal word counter, but delays converting it to vector
+   * form until flush is called.
+   * @param text  The original text as a String
+   */
+  public void addText(String text) {
+    for (String word : tokenize(text)) {
       counts.add(word);
     }
   }
 
   /**
    * Adds all of the tokens that we counted up to a vector.
-   * @param weight
+   * @param weight  The weight
    * @param data
    */
-  @Override
   public void flush(double weight, Vector data) {
     for (String word : counts.elementSet()) {
       // weight words by log_2(tf) times whatever other weight we are given

Modified: 
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java?rev=998337&r1=998336&r2=998337&view=diff
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java
 (original)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/vectors/TextValueEncoderTest.java
 Fri Sep 17 21:32:55 2010
@@ -40,7 +40,7 @@ public final class TextValueEncoderTest 
 
     // now some fancy weighting
     StaticWordValueEncoder w = new StaticWordValueEncoder("text");
-    w.setDictionary(ImmutableMap.<byte[], Double>of("word1".getBytes(Charsets.UTF_8), 3.0, "word2".getBytes(Charsets.UTF_8), 1.5));
+    w.setDictionary(ImmutableMap.<String, Double>of("word1", 3.0, "word2", 1.5));
     enc.setWordEncoder(w);
 
     // should set 6 locations to something


Reply via email to