Author: markg
Date: Tue Apr 29 01:10:24 2014
New Revision: 1590852
URL: http://svn.apache.org/r1590852
Log:
OPENNLP-679
Added two methods, scoreMap and sortedScoreMap, that return a Map and a SortedMap of
category scores. The test calls sortedScoreMap and checks the category under the last key.
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java?rev=1590852&r1=1590851&r2=1590852&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
Tue Apr 29 01:10:24 2014
@@ -18,6 +18,12 @@
package opennlp.tools.doccat;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Set;
+import java.util.SortedMap;
+
/**
* Interface for classes which categorize documents.
*/
@@ -41,5 +47,10 @@ public interface DocumentCategorizer {
public double[] categorize(String documentText);
public String getAllResults(double results[]);
+
+ public Map<String, Double> scoreMap(String text);
+
+ public SortedMap<Double, Set<String>> sortedScoreMap(String text);
+
}
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1590852&r1=1590851&r2=1590852&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
(original)
+++
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
Tue Apr 29 01:10:24 2014
@@ -14,14 +14,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
-
package opennlp.tools.doccat;
import java.io.IOException;
import java.io.ObjectStreamException;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.TrainUtil;
@@ -52,7 +55,7 @@ public class DocumentCategorizerME imple
* @param featureGenerators
*
* @deprecated train a {@link DoccatModel} with a specific
- * {@link DoccatFactory} to customize the {@link
FeatureGenerator}s
+ * {@link DoccatFactory} to customize the {@link FeatureGenerator}s
*/
public DocumentCategorizerME(DoccatModel model, FeatureGenerator...
featureGenerators) {
this.model = model;
@@ -60,14 +63,15 @@ public class DocumentCategorizerME imple
}
/**
- * Initializes the current instance with a doccat model. Default feature
generation is used.
+ * Initializes the current instance with a doccat model. Default feature
+ * generation is used.
*
* @param model
*/
public DocumentCategorizerME(DoccatModel model) {
this.model = model;
this.mContextGenerator = new DocumentCategorizerContextGenerator(this.model
- .getFactory().getFeatureGenerators());
+ .getFactory().getFeatureGenerators());
}
/**
@@ -80,13 +84,53 @@ public class DocumentCategorizerME imple
}
/**
- * Categorizes the given text. The text is tokenized with the
SimpleTokenizer before it
- * is passed to the feature generation.
+ * Categorizes the given text. The text is tokenized with the SimpleTokenizer
+ * before it is passed to the feature generation.
*/
public double[] categorize(String documentText) {
Tokenizer tokenizer = model.getFactory().getTokenizer();
return categorize(tokenizer.tokenize(documentText));
}
+/**
+ * Returns a map in which the key is the category name and the value is the
score
+ * @param text the input text to classify
+ * @return
+ */
+ public Map<String, Double> scoreMap(String text) {
+ Map<String, Double> probDist = new HashMap<String, Double>();
+
+ double[] categorize = categorize(text);
+ int catSize = getNumberOfCategories();
+ for (int i = 0; i < catSize; i++) {
+ String category = getCategory(i);
+ probDist.put(category, categorize[getIndex(category)]);
+ }
+ return probDist;
+
+ }
+/**
+ * Returns a map with the score as a key in ascending order. The value is a Set
of categories with the score.
+ * Many categories can have the same score, hence the Set as value
+ * @param text the input text to classify
+ * @return
+ */
+ public SortedMap<Double, Set<String>> sortedScoreMap(String text) {
+ SortedMap<Double, Set<String>> descendingMap = new TreeMap<Double,
Set<String>>();
+ double[] categorize = categorize(text);
+ int catSize = getNumberOfCategories();
+ for (int i = 0; i < catSize; i++) {
+ String category = getCategory(i);
+ double score = categorize[getIndex(category)];
+ if (descendingMap.containsKey(score)) {
+ descendingMap.get(score).add(category);
+ } else {
+ Set<String> newset = new HashSet<>();
+ newset.add(category);
+ descendingMap.put(score, newset);
+ }
+ }
+ return descendingMap;
+ }
public String getBestCategory(double[] outcome) {
return model.getMaxentModel().getBestOutcome(outcome);
@@ -108,40 +152,40 @@ public class DocumentCategorizerME imple
return model.getMaxentModel().getAllOutcomes(results);
}
- /**
+ /**
* @deprecated Use
- * {@link #train(String, ObjectStream, TrainingParameters,
DoccatFactory)}
- * instead.
+ * {@link #train(String, ObjectStream, TrainingParameters, DoccatFactory)}
+ * instead.
*/
- public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples,
- TrainingParameters mlParams, FeatureGenerator... featureGenerators)
- throws IOException {
+ public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples,
+ TrainingParameters mlParams, FeatureGenerator... featureGenerators)
+ throws IOException {
- if (featureGenerators.length == 0) {
- featureGenerators = new FeatureGenerator[]{defaultFeatureGenerator};
- }
+ if (featureGenerators.length == 0) {
+ featureGenerators = new FeatureGenerator[]{defaultFeatureGenerator};
+ }
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- MaxentModel model = TrainUtil.train(
- new DocumentCategorizerEventStream(samples, featureGenerators),
- mlParams.getSettings(), manifestInfoEntries);
+ MaxentModel model = TrainUtil.train(
+ new DocumentCategorizerEventStream(samples, featureGenerators),
+ mlParams.getSettings(), manifestInfoEntries);
- return new DoccatModel(languageCode, model, manifestInfoEntries);
- }
+ return new DoccatModel(languageCode, model, manifestInfoEntries);
+ }
- public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples,
- TrainingParameters mlParams, DoccatFactory factory)
- throws IOException {
+ public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples,
+ TrainingParameters mlParams, DoccatFactory factory)
+ throws IOException {
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+ Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- MaxentModel model = TrainUtil.train(
- new DocumentCategorizerEventStream(samples,
factory.getFeatureGenerators()),
- mlParams.getSettings(), manifestInfoEntries);
+ MaxentModel model = TrainUtil.train(
+ new DocumentCategorizerEventStream(samples,
factory.getFeatureGenerators()),
+ mlParams.getSettings(), manifestInfoEntries);
- return new DoccatModel(languageCode, model, manifestInfoEntries, factory);
- }
+ return new DoccatModel(languageCode, model, manifestInfoEntries, factory);
+ }
/**
* Trains a doccat model with default feature generation.
@@ -155,8 +199,8 @@ public class DocumentCategorizerME imple
* @throws ObjectStreamException
*
* @deprecated Use
- * {@link #train(String, ObjectStream, TrainingParameters,
DoccatFactory)}
- * instead.
+ * {@link #train(String, ObjectStream, TrainingParameters, DoccatFactory)}
+ * instead.
*/
public static DoccatModel train(String languageCode,
ObjectStream<DocumentSample> samples) throws IOException {
return train(languageCode, samples,
ModelUtil.createDefaultTrainingParameters(), defaultFeatureGenerator);
Modified:
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
URL:
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java?rev=1590852&r1=1590851&r2=1590852&view=diff
==============================================================================
---
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
(original)
+++
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
Tue Apr 29 01:10:24 2014
@@ -14,12 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package opennlp.tools.doccat;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ObjectStreamUtils;
@@ -31,29 +33,38 @@ public class DocumentCategorizerMETest {
@Test
public void testSimpleTraining() throws IOException {
-
+
ObjectStream<DocumentSample> samples =
ObjectStreamUtils.createObjectStream(new DocumentSample[]{
- new DocumentSample("1", new String[]{"a", "b", "c"}),
- new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}),
- new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}),
- new DocumentSample("0", new String[]{"x", "y", "z"}),
- new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}),
- new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})
+ new DocumentSample("1", new String[]{"a", "b", "c"}),
+ new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}),
+ new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}),
+ new DocumentSample("0", new String[]{"x", "y", "z"}),
+ new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}),
+ new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})
});
-
+
TrainingParameters params = new TrainingParameters();
params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
-
+
DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
- params, new BagOfWordsFeatureGenerator());
-
+ params, new BagOfWordsFeatureGenerator());
+
DocumentCategorizer doccat = new DocumentCategorizerME(model);
-
+
double aProbs[] = doccat.categorize("a");
assertEquals("1", doccat.getBestCategory(aProbs));
-
+
double bProbs[] = doccat.categorize("x");
assertEquals("0", doccat.getBestCategory(bProbs));
+
+ //test to make sure sorted map's last key is cat 1 because it has the
highest score.
+ SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a");
+ for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) {
+ assertEquals("1", cat);
+ break;
+ }
+ System.out.println("");
+
}
}