Author: colen
Date: Tue May 13 17:04:50 2014
New Revision: 1594287
URL: http://svn.apache.org/r1594287
Log:
OPENNLP-695 Added support for an extra information field in Doccat
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
Tue May 13 17:04:50 2014
@@ -20,6 +20,7 @@ package opennlp.tools.doccat;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.Map;
import opennlp.tools.util.featuregen.StringPattern;
@@ -38,7 +39,7 @@ public class BagOfWordsFeatureGenerator
}
@Override
- public Collection<String> extractFeatures(String[] text) {
+ public Collection<String> extractFeatures(String[] text, Map<String, Object>
extraInformation) {
Collection<String> bagOfWords = new ArrayList<String>(text.length);
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
Tue May 13 17:04:50 2014
@@ -36,6 +36,8 @@ public interface DocumentCategorizer {
*/
public double[] categorize(String text[]);
+ public double[] categorize(String text[], Map<String, Object>
extraInformation);
+
public String getBestCategory(double[] outcome);
public int getIndex(String category);
@@ -46,6 +48,8 @@ public interface DocumentCategorizer {
public double[] categorize(String documentText);
+ public double[] categorize(String documentText, Map<String, Object>
extraInformation);
+
public String getAllResults(double results[]);
public Map<String, Double> scoreMap(String text);
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
Tue May 13 17:04:50 2014
@@ -20,6 +20,7 @@ package opennlp.tools.doccat;
import java.util.Collection;
import java.util.LinkedList;
+import java.util.Map;
/**
*
@@ -32,13 +33,13 @@ class DocumentCategorizerContextGenerato
mFeatureGenerators = featureGenerators;
}
- public String[] getContext(String text[]) {
+ public String[] getContext(String text[], Map<String, Object>
extraInformation) {
Collection<String> context = new LinkedList<String>();
for (int i = 0; i < mFeatureGenerators.length; i++) {
Collection<String> extractedFeatures =
- mFeatureGenerators[i].extractFeatures(text);
+ mFeatureGenerators[i].extractFeatures(text, extraInformation);
context.addAll(extractedFeatures);
}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
Tue May 13 17:04:50 2014
@@ -60,7 +60,7 @@ public class DocumentCategorizerEvaluato
String document[] = sample.getText();
- double probs[] = categorizer.categorize(document);
+ double probs[] = categorizer.categorize(document,
sample.getExtraInformation());
String cat = categorizer.getBestCategory(probs);
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
Tue May 13 17:04:50 2014
@@ -73,7 +73,7 @@ public class DocumentCategorizerEventStr
isVirgin = false;
return new Event(sample.getCategory(),
- mContextGenerator.getContext(sample.getText()));
+ mContextGenerator.getContext(sample.getText(),
sample.getExtraInformation()));
}
public void remove() {
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
Tue May 13 17:04:50 2014
@@ -18,16 +18,17 @@ package opennlp.tools.doccat;
import java.io.IOException;
import java.io.ObjectStreamException;
+import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
-import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.TrainUtil;
+import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
@@ -74,13 +75,31 @@ public class DocumentCategorizerME imple
.getFactory().getFeatureGenerators());
}
+ @Override
+ public double[] categorize(String[] text, Map<String, Object>
extraInformation) {
+ return model.getMaxentModel().eval(
+ mContextGenerator.getContext(text, extraInformation));
+ }
+
/**
* Categorizes the given text.
*
* @param text
*/
public double[] categorize(String text[]) {
- return model.getMaxentModel().eval(mContextGenerator.getContext(text));
+ return this.categorize(text, Collections.<String, Object>emptyMap());
+ }
+
+ /**
+ * Categorizes the given text. The Tokenizer is obtained from
+ * {@link DoccatFactory#getTokenizer()} and defaults to
+ * {@link SimpleTokenizer}.
+ */
+ @Override
+ public double[] categorize(String documentText,
+ Map<String, Object> extraInformation) {
+ Tokenizer tokenizer = model.getFactory().getTokenizer();
+ return categorize(tokenizer.tokenize(documentText), extraInformation);
}
/**
@@ -89,8 +108,10 @@ public class DocumentCategorizerME imple
*/
public double[] categorize(String documentText) {
Tokenizer tokenizer = model.getFactory().getTokenizer();
- return categorize(tokenizer.tokenize(documentText));
+ return categorize(tokenizer.tokenize(documentText),
+ Collections.<String, Object> emptyMap());
}
+
/**
* Returns a map in which the key is the category name and the value is the score
* @param text the input text to classify
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
Tue May 13 17:04:50 2014
@@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
+import java.util.Map;
import opennlp.tools.tokenize.WhitespaceTokenizer;
@@ -32,12 +33,17 @@ public class DocumentSample {
private final String category;
private final List<String> text;
+ private final Map<String, Object> extraInformation;
public DocumentSample(String category, String text) {
this(category, WhitespaceTokenizer.INSTANCE.tokenize(text));
}
public DocumentSample(String category, String text[]) {
+ this(category, text, null);
+ }
+
+ public DocumentSample(String category, String text[], Map<String, Object>
extraInformation) {
if (category == null) {
throw new IllegalArgumentException("category must not be null");
}
@@ -47,6 +53,12 @@ public class DocumentSample {
this.category = category;
this.text = Collections.unmodifiableList(new
ArrayList<String>(Arrays.asList(text)));
+
+ if(extraInformation == null) {
+ this.extraInformation = Collections.emptyMap();
+ } else {
+ this.extraInformation = extraInformation;
+ }
}
public String getCategory() {
@@ -57,6 +69,10 @@ public class DocumentSample {
return text.toArray(new String[text.size()]);
}
+ public Map<String, Object> getExtraInformation() {
+ return extraInformation;
+ }
+
@Override
public String toString() {
@@ -72,10 +88,10 @@ public class DocumentSample {
// remove last space
sampleString.setLength(sampleString.length() - 1);
}
-
+
return sampleString.toString();
}
-
+
@Override
public boolean equals(Object obj) {
if (this == obj) {
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
Tue May 13 17:04:50 2014
@@ -19,10 +19,11 @@
package opennlp.tools.doccat;
import java.util.Collection;
+import java.util.Map;
/**
* Interface for generating features for document categorization.
*/
public interface FeatureGenerator {
- public Collection<String> extractFeatures(String[] text);
+ public Collection<String> extractFeatures(String[] text, Map<String, Object>
extraInformation);
}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java?rev=1594287&r1=1594286&r2=1594287&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
Tue May 13 17:04:50 2014
@@ -20,10 +20,11 @@ package opennlp.tools.doccat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
+import java.util.Map;
public class NGramFeatureGenerator implements FeatureGenerator {
- public Collection<String> extractFeatures(String[] text) {
+ public Collection<String> extractFeatures(String[] text, Map<String, Object>
extraInfo) {
List<String> features = new ArrayList<String>();