Repository: opennlp Updated Branches: refs/heads/trunk bbc5a34a8 -> a7826d2b8
Replace private text with Leipzig english news See issue OPENNLP-877 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a7826d2b Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a7826d2b Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a7826d2b Branch: refs/heads/trunk Commit: a7826d2b86810ea0e094281c575b99d377021295 Parents: bbc5a34 Author: Joern Kottmann <[email protected]> Authored: Thu Dec 15 21:03:28 2016 +0100 Committer: Joern Kottmann <[email protected]> Committed: Mon Dec 19 10:36:36 2016 +0100 ---------------------------------------------------------------------- .../formats/LeipzigDoccatSampleStream.java | 23 ++- .../tools/eval/SourceForgeModelEval.java | 169 +++++++++++++------ 2 files changed, 136 insertions(+), 56 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/a7826d2b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java index 0af66ae..0ac318a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java @@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets; import opennlp.tools.doccat.DocumentSample; import opennlp.tools.tokenize.SimpleTokenizer; +import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.util.FilterObjectStream; import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.PlainTextByLineStream; @@ -40,6 +41,8 @@ import opennlp.tools.util.PlainTextByLineStream; public class LeipzigDoccatSampleStream extends FilterObjectStream<String, DocumentSample> { + 
private final Tokenizer tokenizer; + private final String language; private final int sentencesPerDocument; @@ -51,12 +54,26 @@ public class LeipzigDoccatSampleStream extends * @param in the InputStream pointing to the contents of the sentences.txt input file * @throws IOException IOException */ - LeipzigDoccatSampleStream(String language, int sentencesPerDocument, - InputStreamFactory in) throws IOException { + public LeipzigDoccatSampleStream(String language, int sentencesPerDocument, Tokenizer tokenizer, + InputStreamFactory in) throws IOException { super(new PlainTextByLineStream(in, StandardCharsets.UTF_8)); System.setOut(new PrintStream(System.out, true, "UTF-8")); this.language = language; this.sentencesPerDocument = sentencesPerDocument; + this.tokenizer = tokenizer; + } + + /** + * Creates a new LeipzigDoccatSampleStream with the specified parameters. + * + * @param language the language of the input sentences + * @param sentencesPerDocument the number of sentences which should be grouped into one {@link DocumentSample} + * @param in the InputStream pointing to the contents of the sentences.txt input file + * @throws IOException IOException + */ + public LeipzigDoccatSampleStream(String language, int sentencesPerDocument, + InputStreamFactory in) throws IOException { + this(language, sentencesPerDocument, SimpleTokenizer.INSTANCE, in); } public DocumentSample read() throws IOException { @@ -68,7 +85,7 @@ public class LeipzigDoccatSampleStream extends String line; while (count < sentencesPerDocument && (line = samples.read()) != null) { - String tokens[] = SimpleTokenizer.INSTANCE.tokenize(line); + String tokens[] = tokenizer.tokenize(line); if (tokens.length == 0) { throw new IOException("Empty lines are not allowed!"); http://git-wip-us.apache.org/repos/asf/opennlp/blob/a7826d2b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java ---------------------------------------------------------------------- diff --git
a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java index f63fcb5..d996afa 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java +++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java @@ -22,6 +22,7 @@ import opennlp.tools.chunker.Chunker; import opennlp.tools.chunker.ChunkerME; import opennlp.tools.chunker.ChunkerModel; import opennlp.tools.cmdline.parser.ParserTool; +import opennlp.tools.doccat.DocumentSample; import opennlp.tools.formats.LeipzigDoccatSampleStream; import opennlp.tools.namefind.NameFinderME; import opennlp.tools.namefind.TokenNameFinder; @@ -55,12 +56,27 @@ import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; /** - * The tests only run if the input text files are available and those - * are derived from the leipzig corpus. + * This test ensures that the existing SourceForge models perform + * as they are expected to. * - * Next step is to replace the input texts with ones that don't have license issues. - * Wikinews is probably a vey good source. In addition also models that - * can be shared are required to give everyone the possibilty to run this. + * To run this test externally, the Leipzig sentences file is needed: + * leipzig/eng_news_2010_300K-sentences.txt; this file can be + * obtained from the Leipzig corpus project.
<br> + * + * And all the SourceForge models:<br> + * - models-sf/en-sent.bin<br> + * - models-sf/en-token.bin<br> + * - models-sf/en-ner-date.bin<br> + * - models-sf/en-ner-location.bin<br> + * - models-sf/en-ner-money.bin<br> + * - models-sf/en-ner-organization.bin<br> + * - models-sf/en-ner-percentage.bin<br> + * - models-sf/en-ner-person.bin<br> + * - models-sf/en-ner-time.bin<br> + * - models-sf/en-chunker.bin<br> + * - models-sf/en-pos-maxent.bin<br> + * - models-sf/en-pos-perceptron.bin<br> + * - models-sf/en-parser-chunking.bin<br> */ public class SourceForgeModelEval { @@ -73,10 +89,28 @@ public class SourceForgeModelEval { } @Test + public void ensureTestDataIsCorrect() throws IOException { + MessageDigest digest = createDigest(); + + try (ObjectStream<String> lines = new PlainTextByLineStream( + new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), + "leipzig/eng_news_2010_300K-sentences.txt")), Charset.forName("UTF-8"))) { + + String line; + while ((line = lines.read()) != null) { + digest.update(line.getBytes("UTF-8")); + } + + Assert.assertEquals(new BigInteger("248567841356936801447294643695012852392"), + new BigInteger(1, digest.digest())); + } + } + + @Test public void evalSentenceModel() throws IOException { SentenceModel model = new SentenceModel( - new File("/home/burn/opennlp-data-dir", "models-sf/en-sent.bin")); + new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin")); MessageDigest digest = createDigest(); @@ -84,13 +118,16 @@ public class SourceForgeModelEval { StringBuilder text = new StringBuilder(); - try (ObjectStream<String> lines = new PlainTextByLineStream( - new MarkableFileInputStreamFactory(new File("/home/burn/opennlp-data-dir", - "leipzig/sentences.txt")), Charset.forName("UTF-8"))) { + try (ObjectStream<DocumentSample> lineBatches = new LeipzigDoccatSampleStream("en", 25, + new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), + "leipzig/eng_news_2010_300K-sentences.txt")))) {
- String line; - while ((line = lines.read()) != null) { - text.append(line).append(" "); + DocumentSample lineBatch ; + while ((lineBatch = lineBatches.read()) != null) { + // TODO: Replace with Java 8 join + for (String token : lineBatch.getText()) { + text.append(token).append(" "); + } } } @@ -100,13 +137,17 @@ public class SourceForgeModelEval { digest.update(sentence.getBytes("UTF-8")); } - Assert.assertEquals(new BigInteger("54058993675314170033586747935067060992"), + Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"), new BigInteger(1, digest.digest())); } @Test public void evalTokenModel() throws IOException { + // the input stream is currently tokenized, we should detokenize it again, + // (or extend to pass in tokenizer, then whitespace tokenizer can be passed) + // and then tokenize it here + TokenizerModel model = new TokenizerModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin")); @@ -114,23 +155,38 @@ public class SourceForgeModelEval { Tokenizer tokenizer = new TokenizerME(model); - try (ObjectStream<String> lines = new PlainTextByLineStream( + try (ObjectStream<DocumentSample> lines = new LeipzigDoccatSampleStream("en", 1, + WhitespaceTokenizer.INSTANCE, new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), - "leipzig/sentences.txt")), Charset.forName("UTF-8"))) { + "leipzig/eng_news_2010_300K-sentences.txt")))) { - String line; + DocumentSample line; while ((line = lines.read()) != null) { - String[] tokens = tokenizer.tokenize(line); + + // TODO: Replace with Java 8 join + StringBuffer text = new StringBuffer(); + for (String token : line.getText()) { + text.append(token).append(' '); + } + + String[] tokens = tokenizer.tokenize(text.toString()); for (String token : tokens) { digest.update(token.getBytes("UTF-8")); } } } - Assert.assertEquals(new BigInteger("309548448163611475251363008574168734058"), + Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"), 
new BigInteger(1, digest.digest())); } + private ObjectStream<DocumentSample> createLineWiseStream() throws IOException { + return new LeipzigDoccatSampleStream("en", 1, + new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), + "leipzig/eng_news_2010_300K-sentences.txt"))); + } + + private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash) throws IOException { @@ -138,13 +194,11 @@ public class SourceForgeModelEval { TokenNameFinder nameFinder = new NameFinderME(model); - try (ObjectStream<String> lines = new PlainTextByLineStream( - new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTok.txt")), - Charset.forName("UTF-8"))) { + try (ObjectStream<DocumentSample> lines = createLineWiseStream()) { - String line; + DocumentSample line; while ((line = lines.read()) != null) { - Span[] names = nameFinder.find(WhitespaceTokenizer.INSTANCE.tokenize(line)); + Span[] names = nameFinder.find(line.getText()); for (Span name : names) { digest.update((name.getType() + name.getStart() + name.getEnd()).getBytes("UTF-8")); } @@ -159,7 +213,7 @@ public class SourceForgeModelEval { TokenNameFinderModel personModel = new TokenNameFinderModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-date.bin")); - evalNameFinder(personModel, new BigInteger("13595680199220579055030594287753821185")); + evalNameFinder(personModel, new BigInteger("116570003910213570906062355532299200317")); } @Test @@ -167,7 +221,7 @@ public class SourceForgeModelEval { TokenNameFinderModel personModel = new TokenNameFinderModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-location.bin")); - evalNameFinder(personModel, new BigInteger("61423868331440897441202803979849564658")); + evalNameFinder(personModel, new BigInteger("44810593886021404716125849669208680993")); } @Test @@ -175,7 +229,7 @@ public class SourceForgeModelEval { TokenNameFinderModel personModel = new TokenNameFinderModel( new 
File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-money.bin")); - evalNameFinder(personModel, new BigInteger("31779803056581858429003932617173745364")); + evalNameFinder(personModel, new BigInteger("65248897509365807977219790824670047287")); } @Test @@ -183,7 +237,7 @@ public class SourceForgeModelEval { TokenNameFinderModel personModel = new TokenNameFinderModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-organization.bin")); - evalNameFinder(personModel, new BigInteger("268615755804346283904103340480818555730")); + evalNameFinder(personModel, new BigInteger("50454559690338630659278005157657197233")); } @Test @@ -191,7 +245,7 @@ public class SourceForgeModelEval { TokenNameFinderModel personModel = new TokenNameFinderModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-percentage.bin")); - evalNameFinder(personModel, new BigInteger("1793019183238911248412519564457497503")); + evalNameFinder(personModel, new BigInteger("320996882594215344113023719117249515343")); } @Test @@ -207,26 +261,25 @@ public class SourceForgeModelEval { TokenNameFinderModel personModel = new TokenNameFinderModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-time.bin")); - evalNameFinder(personModel, new BigInteger("264798318876255738642952635833268231353")); + evalNameFinder(personModel, new BigInteger("282941772380683328816791801782579055940")); } @Test public void evalChunkerModel() throws IOException { - ChunkerModel model = new ChunkerModel( - new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin")); - MessageDigest digest = createDigest(); - Chunker chunker = new ChunkerME(model); + POSTagger tagger = new POSTaggerME(new POSModel( + new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin"))); - try (ObjectStream<String> lines = new PlainTextByLineStream( - new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTokPos.txt")), - Charset.forName("UTF-8"))) { + Chunker chunker = new 
ChunkerME(new ChunkerModel( + new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin"))); - String line; + try (ObjectStream<DocumentSample> lines = createLineWiseStream()) { + + DocumentSample line; while ((line = lines.read()) != null) { - POSSample sentence = POSSample.parse(line); + POSSample sentence = new POSSample(line.getText(), tagger.tag(line.getText())); String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags()); for (String chunk : chunks) { @@ -235,22 +288,24 @@ public class SourceForgeModelEval { } } - Assert.assertEquals(new BigInteger("87766988424222321513554054789708059330"), + Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"), new BigInteger(1, digest.digest())); } private void evalPosModel(POSModel model, BigInteger expectedHash) throws IOException { + + // break the input stream into sentences + // The input stream is tokenized and can be processed here directly + MessageDigest digest = createDigest(); POSTagger tagger = new POSTaggerME(model); - try (ObjectStream<String> lines = new PlainTextByLineStream( - new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(), - "leipzig/simpleTok.txt")), Charset.forName("UTF-8"))) { + try (ObjectStream<DocumentSample> lines = createLineWiseStream()) { - String line; + DocumentSample line; while ((line = lines.read()) != null) { - String[] tags = tagger.tag(WhitespaceTokenizer.INSTANCE.tokenize(line)); + String[] tags = tagger.tag(line.getText()); for (String tag : tags) { digest.update(tag.getBytes("UTF-8")); } @@ -265,7 +320,7 @@ public class SourceForgeModelEval { POSModel maxentModel = new POSModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin")); - evalPosModel(maxentModel, new BigInteger("6912278014292642909634347798602234960")); + evalPosModel(maxentModel, new BigInteger("231995214522232523777090597594904492687")); } @Test @@ -273,28 +328,36 @@ public class SourceForgeModelEval { POSModel perceptronModel = 
new POSModel( new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")); - evalPosModel(perceptronModel, new BigInteger("333081688760132868394207450128996236484")); + evalPosModel(perceptronModel, new BigInteger("209440430718727101220960491543652921728")); } @Test public void evalParserModel() throws IOException { + // break input stream into sentences + // input is tokenized already + ParserModel model = new ParserModel( - new File("/home/burn/opennlp-data-dir", "models-sf/en-parser-chunking.bin")); + new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-parser-chunking.bin")); MessageDigest digest = createDigest(); Parser parser = ParserFactory.create(model); - try (ObjectStream<String> lines = new PlainTextByLineStream( - new MarkableFileInputStreamFactory(new File("/home/burn/opennlp-data-dir", - "leipzig/simpleTok.txt")), Charset.forName("UTF-8"))) { + try (ObjectStream<DocumentSample> lines = createLineWiseStream()) { - String line; + DocumentSample line; while ((line = lines.read()) != null) { - Parse[] parse = ParserTool.parseLine(line, parser, 1); + StringBuilder textLine = new StringBuilder(); + + // TODO: Replace with Java 8 join + for (String token : line.getText()) { + textLine.append(token).append(' '); + } + + Parse[] parse = ParserTool.parseLine(textLine.toString(), parser, 1); if (parse.length > 0) { digest.update(parse[0].toString().getBytes("UTF-8")); } @@ -304,7 +367,7 @@ public class SourceForgeModelEval { } } - Assert.assertEquals(new BigInteger("95566096874728850374427554294889512256"), + Assert.assertEquals(new BigInteger("226003515785585284478071030961407561943"), new BigInteger(1, digest.digest())); } }
