DocumentCategorizerMETest.java

markg Mon, 28 Apr 2014 18:11:22 -0700

Author: markg
Date: Tue Apr 29 01:10:24 2014
New Revision: 1590852

URL: http://svn.apache.org/r1590852
Log:
OPENNLP-679
Added two methods that return Map and SortedMap. Test includes the sortedMap 
call to get the last key.


Modified:
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
    
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
    
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java?rev=1590852&r1=1590851&r2=1590852&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
 Tue Apr 29 01:10:24 2014
@@ -18,6 +18,12 @@
 
 package opennlp.tools.doccat;
 
+import java.util.HashMap;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Set;
+import java.util.SortedMap;
+
 /**
  * Interface for classes which categorize documents.
  */
@@ -41,5 +47,10 @@ public interface DocumentCategorizer {
   public double[] categorize(String documentText);
 
   public String getAllResults(double results[]);
+  
+  public Map<String, Double> scoreMap(String text); 
+
+  public SortedMap<Double, Set<String>> sortedScoreMap(String text);
+  
 }
 

Modified: 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java?rev=1590852&r1=1590851&r2=1590852&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
 Tue Apr 29 01:10:24 2014
@@ -14,14 +14,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-
 package opennlp.tools.doccat;
 
 import java.io.IOException;
 import java.io.ObjectStreamException;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
 
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.ml.model.TrainUtil;
@@ -52,7 +55,7 @@ public class DocumentCategorizerME imple
    * @param featureGenerators
    *
    * @deprecated train a {@link DoccatModel} with a specific
-   *             {@link DoccatFactory} to customize the {@link 
FeatureGenerator}s
+   * {@link DoccatFactory} to customize the {@link FeatureGenerator}s
    */
   public DocumentCategorizerME(DoccatModel model, FeatureGenerator... 
featureGenerators) {
     this.model = model;
@@ -60,14 +63,15 @@ public class DocumentCategorizerME imple
   }
 
   /**
-   * Initializes the current instance with a doccat model. Default feature 
generation is used.
+   * Initializes the current instance with a doccat model. Default feature
+   * generation is used.
    *
    * @param model
    */
   public DocumentCategorizerME(DoccatModel model) {
     this.model = model;
     this.mContextGenerator = new DocumentCategorizerContextGenerator(this.model
-        .getFactory().getFeatureGenerators());
+            .getFactory().getFeatureGenerators());
   }
 
   /**
@@ -80,13 +84,53 @@ public class DocumentCategorizerME imple
   }
 
   /**
-   * Categorizes the given text. The text is tokenized with the 
SimpleTokenizer before it
-   * is passed to the feature generation.
+   * Categorizes the given text. The text is tokenized with the SimpleTokenizer
+   * before it is passed to the feature generation.
    */
   public double[] categorize(String documentText) {
     Tokenizer tokenizer = model.getFactory().getTokenizer();
     return categorize(tokenizer.tokenize(documentText));
   }
+/**
+ * Returns a map in which the key is the category name and the value is the score
+ * @param text the input text to classify
+ * @return 
+ */
+  public Map<String, Double> scoreMap(String text) {
+    Map<String, Double> probDist = new HashMap<String, Double>();
+
+    double[] categorize = categorize(text);
+    int catSize = getNumberOfCategories();
+    for (int i = 0; i < catSize; i++) {
+      String category = getCategory(i);
+      probDist.put(category, categorize[getIndex(category)]);
+    }
+    return probDist;
+
+  }
+/**
+ * Returns a map with the score as a key in ascending order. The value is a Set of categories with the score.
+ * Many categories can have the same score, hence the Set as value
+ * @param text the input text to classify
+ * @return 
+ */
+  public SortedMap<Double, Set<String>> sortedScoreMap(String text) {
+    SortedMap<Double, Set<String>> descendingMap = new TreeMap<Double, 
Set<String>>();
+    double[] categorize = categorize(text);
+    int catSize = getNumberOfCategories();
+    for (int i = 0; i < catSize; i++) {
+      String category = getCategory(i);
+      double score = categorize[getIndex(category)];
+      if (descendingMap.containsKey(score)) {
+        descendingMap.get(score).add(category);
+      } else {
+        Set<String> newset = new HashSet<>();
+        newset.add(category);
+        descendingMap.put(score, newset);
+      }
+    }
+    return descendingMap;
+  }
 
   public String getBestCategory(double[] outcome) {
     return model.getMaxentModel().getBestOutcome(outcome);
@@ -108,40 +152,40 @@ public class DocumentCategorizerME imple
     return model.getMaxentModel().getAllOutcomes(results);
   }
 
-   /**
+  /**
    * @deprecated Use
-   *             {@link #train(String, ObjectStream, TrainingParameters, 
DoccatFactory)}
-   *             instead.
+   * {@link #train(String, ObjectStream, TrainingParameters, DoccatFactory)}
+   * instead.
    */
-   public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples,
-       TrainingParameters mlParams, FeatureGenerator... featureGenerators)
-   throws IOException {
+  public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples,
+          TrainingParameters mlParams, FeatureGenerator... featureGenerators)
+          throws IOException {
 
-     if (featureGenerators.length == 0) {
-       featureGenerators = new FeatureGenerator[]{defaultFeatureGenerator};
-     }
+    if (featureGenerators.length == 0) {
+      featureGenerators = new FeatureGenerator[]{defaultFeatureGenerator};
+    }
 
-     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
 
-     MaxentModel model = TrainUtil.train(
-         new DocumentCategorizerEventStream(samples, featureGenerators),
-         mlParams.getSettings(), manifestInfoEntries);
+    MaxentModel model = TrainUtil.train(
+            new DocumentCategorizerEventStream(samples, featureGenerators),
+            mlParams.getSettings(), manifestInfoEntries);
 
-     return new DoccatModel(languageCode, model, manifestInfoEntries);
-   }
+    return new DoccatModel(languageCode, model, manifestInfoEntries);
+  }
 
-   public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples,
-       TrainingParameters mlParams, DoccatFactory factory)
-   throws IOException {
+  public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples,
+          TrainingParameters mlParams, DoccatFactory factory)
+          throws IOException {
 
-     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
+    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
 
-     MaxentModel model = TrainUtil.train(
-         new DocumentCategorizerEventStream(samples, 
factory.getFeatureGenerators()),
-         mlParams.getSettings(), manifestInfoEntries);
+    MaxentModel model = TrainUtil.train(
+            new DocumentCategorizerEventStream(samples, 
factory.getFeatureGenerators()),
+            mlParams.getSettings(), manifestInfoEntries);
 
-     return new DoccatModel(languageCode, model, manifestInfoEntries, factory);
-   }
+    return new DoccatModel(languageCode, model, manifestInfoEntries, factory);
+  }
 
   /**
    * Trains a doccat model with default feature generation.
@@ -155,8 +199,8 @@ public class DocumentCategorizerME imple
    * @throws ObjectStreamException
    *
    * @deprecated Use
-   *             {@link #train(String, ObjectStream, TrainingParameters, 
DoccatFactory)}
-   *             instead.
+   * {@link #train(String, ObjectStream, TrainingParameters, DoccatFactory)}
+   * instead.
    */
   public static DoccatModel train(String languageCode, 
ObjectStream<DocumentSample> samples) throws IOException {
     return train(languageCode, samples, 
ModelUtil.createDefaultTrainingParameters(), defaultFeatureGenerator);

Modified: 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
URL: 
http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java?rev=1590852&r1=1590851&r2=1590852&view=diff
==============================================================================
--- 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
 (original)
+++ 
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
 Tue Apr 29 01:10:24 2014
@@ -14,12 +14,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package opennlp.tools.doccat;
 
 import static org.junit.Assert.assertEquals;
 
 import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
 
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ObjectStreamUtils;
@@ -31,29 +33,38 @@ public class DocumentCategorizerMETest {
 
   @Test
   public void testSimpleTraining() throws IOException {
-   
+
     ObjectStream<DocumentSample> samples = 
ObjectStreamUtils.createObjectStream(new DocumentSample[]{
-        new DocumentSample("1", new String[]{"a", "b", "c"}),
-        new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}),
-        new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}),
-        new DocumentSample("0", new String[]{"x", "y", "z"}),
-        new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}),
-        new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})
+      new DocumentSample("1", new String[]{"a", "b", "c"}),
+      new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}),
+      new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}),
+      new DocumentSample("0", new String[]{"x", "y", "z"}),
+      new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}),
+      new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})
     });
-    
+
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
     params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
-    
+
     DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
-        params, new BagOfWordsFeatureGenerator());
-    
+            params, new BagOfWordsFeatureGenerator());
+
     DocumentCategorizer doccat = new DocumentCategorizerME(model);
-    
+
     double aProbs[] = doccat.categorize("a");
     assertEquals("1", doccat.getBestCategory(aProbs));
-    
+
     double bProbs[] = doccat.categorize("x");
     assertEquals("0", doccat.getBestCategory(bProbs));
+
+    //test to make sure sorted map's last key is cat 1 because it has the highest score.
+    SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a");
+    for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) {
+      assertEquals("1", cat);
+      break;
+    }
+    System.out.println("");
+
   }
 }

svn commit: r1590852 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/doccat/DocumentCategorizer.java main/java/opennlp/tools/doccat/DocumentCategorizerME.java test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java

Reply via email to