Author: colen
Date: Tue May 13 17:04:50 2014
New Revision: 1594287

URL: http://svn.apache.org/r1594287
Log:
OPENNLP-695 Added support for an extra information field to Doccat

Modified:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
 Tue May 13 17:04:50 2014
@@ -20,6 +20,7 @@ package opennlp.tools.doccat;
 
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Map;
 
 import opennlp.tools.util.featuregen.StringPattern;
 
@@ -38,7 +39,7 @@ public class BagOfWordsFeatureGenerator 
   }
 
   @Override
-  public Collection<String> extractFeatures(String[] text) {
+  public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInformation) {
 
     Collection<String> bagOfWords = new ArrayList<String>(text.length);
 

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
 Tue May 13 17:04:50 2014
@@ -36,6 +36,8 @@ public interface DocumentCategorizer {
    */
   public double[] categorize(String text[]);
 
+  public double[] categorize(String text[], Map<String, Object> extraInformation);
+
   public String getBestCategory(double[] outcome);
 
   public int getIndex(String category);
@@ -46,6 +48,8 @@ public interface DocumentCategorizer {
 
   public double[] categorize(String documentText);
 
+  public double[] categorize(String documentText, Map<String, Object> extraInformation);
+
   public String getAllResults(double results[]);
 
   public Map<String, Double> scoreMap(String text);

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
 Tue May 13 17:04:50 2014
@@ -20,6 +20,7 @@ package opennlp.tools.doccat;
 
 import java.util.Collection;
 import java.util.LinkedList;
+import java.util.Map;
 
 /**
  *
@@ -32,13 +33,13 @@ class DocumentCategorizerContextGenerato
     mFeatureGenerators = featureGenerators;
   }
 
-  public String[] getContext(String text[]) {
+  public String[] getContext(String text[], Map<String, Object> extraInformation) {
 
     Collection<String> context = new LinkedList<String>();
 
     for (int i = 0; i < mFeatureGenerators.length; i++) {
       Collection<String> extractedFeatures =
-          mFeatureGenerators[i].extractFeatures(text);
+          mFeatureGenerators[i].extractFeatures(text, extraInformation);
       context.addAll(extractedFeatures);
     }
 

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
 Tue May 13 17:04:50 2014
@@ -60,7 +60,7 @@ public class DocumentCategorizerEvaluato
 
     String document[] = sample.getText();
 
-    double probs[] = categorizer.categorize(document);
+    double probs[] = categorizer.categorize(document, sample.getExtraInformation());
 
     String cat = categorizer.getBestCategory(probs);
 

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
 Tue May 13 17:04:50 2014
@@ -73,7 +73,7 @@ public class DocumentCategorizerEventStr
         isVirgin = false;
 
         return new Event(sample.getCategory(),
-            mContextGenerator.getContext(sample.getText()));
+            mContextGenerator.getContext(sample.getText(), sample.getExtraInformation()));
       }
 
       public void remove() {

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
 Tue May 13 17:04:50 2014
@@ -18,16 +18,17 @@ package opennlp.tools.doccat;
 
 import java.io.IOException;
 import java.io.ObjectStreamException;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
-import java.util.NavigableMap;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.ml.model.TrainUtil;
+import opennlp.tools.tokenize.SimpleTokenizer;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.TrainingParameters;
@@ -74,13 +75,31 @@ public class DocumentCategorizerME imple
             .getFactory().getFeatureGenerators());
   }
 
+  @Override
+  public double[] categorize(String[] text, Map<String, Object> extraInformation) {
+    return model.getMaxentModel().eval(
+        mContextGenerator.getContext(text, extraInformation));
+  }
+
   /**
    * Categorizes the given text.
    *
    * @param text
    */
   public double[] categorize(String text[]) {
-    return model.getMaxentModel().eval(mContextGenerator.getContext(text));
+    return this.categorize(text, Collections.<String, Object>emptyMap());
+  }
+
+  /**
+   * Categorizes the given text. The Tokenizer is obtained from
+   * {@link DoccatFactory#getTokenizer()} and defaults to
+   * {@link SimpleTokenizer}.
+   */
+  @Override
+  public double[] categorize(String documentText,
+      Map<String, Object> extraInformation) {
+    Tokenizer tokenizer = model.getFactory().getTokenizer();
+    return categorize(tokenizer.tokenize(documentText), extraInformation);
   }
 
   /**
@@ -89,8 +108,10 @@ public class DocumentCategorizerME imple
    */
   public double[] categorize(String documentText) {
     Tokenizer tokenizer = model.getFactory().getTokenizer();
-    return categorize(tokenizer.tokenize(documentText));
+    return categorize(tokenizer.tokenize(documentText),
+        Collections.<String, Object> emptyMap());
   }
+
 /**
  * Returns a map in which the key is the category name and the value is the score
  * @param text the input text to classify

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
 Tue May 13 17:04:50 2014
@@ -22,6 +22,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 
 import opennlp.tools.tokenize.WhitespaceTokenizer;
 
@@ -32,12 +33,17 @@ public class DocumentSample {
 
   private final String category;
   private final List<String> text;
+  private final Map<String, Object> extraInformation;
 
   public DocumentSample(String category, String text) {
     this(category, WhitespaceTokenizer.INSTANCE.tokenize(text));
   }
 
   public DocumentSample(String category, String text[]) {
+    this(category, text, null);
+  }
+
+  public DocumentSample(String category, String text[], Map<String, Object> extraInformation) {
     if (category == null) {
       throw new IllegalArgumentException("category must not be null");
     }
@@ -47,6 +53,12 @@ public class DocumentSample {
 
     this.category = category;
     this.text = Collections.unmodifiableList(new ArrayList<String>(Arrays.asList(text)));
+
+    if(extraInformation == null) {
+      this.extraInformation = Collections.emptyMap();
+    } else {
+      this.extraInformation = extraInformation;
+    }
   }
 
   public String getCategory() {
@@ -57,6 +69,10 @@ public class DocumentSample {
     return text.toArray(new String[text.size()]);
   }
 
+  public Map<String, Object> getExtraInformation() {  
+    return extraInformation;
+  }
+
   @Override
   public String toString() {
 
@@ -72,10 +88,10 @@ public class DocumentSample {
       // remove last space
       sampleString.setLength(sampleString.length() - 1);
     }
-
+    
     return sampleString.toString();
   }
-
+  
   @Override
   public boolean equals(Object obj) {
     if (this == obj) {

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
 Tue May 13 17:04:50 2014
@@ -19,10 +19,11 @@
 package opennlp.tools.doccat;
 
 import java.util.Collection;
+import java.util.Map;
 
 /**
  * Interface for generating features for document categorization.
  */
 public interface FeatureGenerator {
-  public Collection<String> extractFeatures(String[] text);
+  public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInformation);
 }

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
 Tue May 13 17:04:50 2014
@@ -20,10 +20,11 @@ package opennlp.tools.doccat;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.Map;
 
 public class NGramFeatureGenerator implements FeatureGenerator {
 
-  public Collection<String> extractFeatures(String[] text) {
+  public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {
 
     List<String> features = new ArrayList<String>();
 


Reply via email to