Repository: opennlp
Updated Branches:
  refs/heads/master c2097051c -> b5b6d5c27


OPENNLP-1082: Add EOS to SDEventStream if missing

closes apache/opennlp#234


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/b5b6d5c2
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/b5b6d5c2
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/b5b6d5c2

Branch: refs/heads/master
Commit: b5b6d5c27443e1837b80b089206aad480852cd1c
Parents: c209705
Author: William D C M SILVA <[email protected]>
Authored: Thu Jun 22 00:15:59 2017 -0300
Committer: William D C M SILVA <[email protected]>
Committed: Thu Jun 22 00:15:59 2017 -0300

----------------------------------------------------------------------
 .../opennlp/tools/cmdline/ArgumentParser.java   | 16 +++++++++
 .../SentenceDetectorCrossValidatorTool.java     |  9 ++++-
 .../sentdetect/SentenceDetectorTrainerTool.java |  9 ++++-
 .../cmdline/sentdetect/TrainingParams.java      |  5 +++
 .../tools/sentdetect/SDCrossValidator.java      | 23 +++++++++++--
 .../opennlp/tools/sentdetect/SDEventStream.java | 35 ++++++++++++++++++--
 .../tools/sentdetect/SentenceDetectorME.java    | 19 +++++++----
 .../opennlp/tools/sentdetect/lang/Factory.java  |  6 ++--
 .../tools/sentdetect/SDEventStreamTest.java     | 33 +++++++++++++++---
 9 files changed, 135 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java
index 8243560..e05a682 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java
@@ -127,6 +127,21 @@ public class ArgumentParser {
     }
   }
 
+  /**
+   * Argument factory that turns a command line value into a single
+   * {@link Character}.
+   */
+  private static class CharacterArgumentFactory implements ArgumentFactory {
+
+    /**
+     * Converts {@code argValue} into a {@link Character}.
+     *
+     * @param method not used by this factory
+     * @param argName the argument name, used only to build the error message
+     * @param argValue the raw argument value, may be {@code null}
+     * @return the single character held by {@code argValue}, or {@code null}
+     *         when {@code argValue} is {@code null}
+     * @throws TerminateToolException if {@code argValue} does not consist of
+     *         exactly one {@code char}
+     */
+    public Object parseArgument(Method method, String argName, String argValue) {
+      if (argValue == null) {
+        return null;
+      }
+      if (argValue.length() != 1) {
+        throw new TerminateToolException(1, String.format(INVALID_ARG, argName, argValue) +
+            "Character should have size 1.");
+      }
+      // Character.valueOf replaces the deprecated Character(char) constructor.
+      return Character.valueOf(argValue.charAt(0));
+    }
+  }
+
   private static class ArgumentProxy implements InvocationHandler {
 
     private final Map<String, Object> arguments;
@@ -154,6 +169,7 @@ public class ArgumentParser {
     factories.put(String.class, new StringArgumentFactory());
     factories.put(File.class, new FileArgumentFactory());
     factories.put(Charset.class, new CharsetArgumentFactory());
+    factories.put(Character.class, new CharacterArgumentFactory());
 
     argumentFactories = Collections.unmodifiableMap(factories);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
index 55d1df6..3a28254 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java
@@ -67,11 +67,18 @@ public final class SentenceDetectorCrossValidatorTool
       eos = eosString.toCharArray();
     }
 
+    Character defaultEOS;
+    if (params.getDefaultEosChar() != null) {
+      defaultEOS = params.getDefaultEosChar();
+    } else {
+      defaultEOS = '\n';
+    }
+
     try {
       Dictionary abbreviations = 
SentenceDetectorTrainerTool.loadDict(params.getAbbDict());
       SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
           params.getFactory(), params.getLang(), true, abbreviations, eos);
-      validator = new SDCrossValidator(params.getLang(), mlParams, sdFactory,
+      validator = new SDCrossValidator(params.getLang(), mlParams, sdFactory, 
defaultEOS,
           errorListener);
 
       validator.evaluate(sampleStream, params.getFolds());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
index cdd6916..b63de0f 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java
@@ -84,6 +84,13 @@ public final class SentenceDetectorTrainerTool
       eos = eosString.toCharArray();
     }
 
+    Character defaultEOS;
+    if (params.getDefaultEosChar() != null) {
+      defaultEOS = params.getDefaultEosChar();
+    } else {
+      defaultEOS = '\n';
+    }
+
     SentenceModel model;
 
     try {
@@ -91,7 +98,7 @@ public final class SentenceDetectorTrainerTool
       SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create(
           params.getFactory(), params.getLang(), true, dict, eos);
       model = SentenceDetectorME.train(params.getLang(), sampleStream,
-          sdFactory, mlParams);
+          sdFactory, mlParams, defaultEOS);
     } catch (IOException e) {
       throw createTerminationIOException(e);
     }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
index fbdf4db..5b7289a 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java
@@ -38,6 +38,11 @@ interface TrainingParams extends BasicTrainingParams {
   @OptionalParameter
   String getEosChars();
 
+  @ParameterDescription(valueName = "string", description = "EOS character to 
use if EOS is " +
+      "missing in sample. Default is \\n.")
+  @OptionalParameter
+  Character getDefaultEosChar();
+
   @ParameterDescription(valueName = "factoryName",
       description = "A sub-class of SentenceDetectorFactory where to get 
implementation and resources.")
   @OptionalParameter

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
index 2f6daec..cb43896 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java
@@ -34,6 +34,8 @@ public class SDCrossValidator {
 
   private final TrainingParameters params;
 
+  private final Character defaultEOS;
+
   private FMeasure fmeasure = new FMeasure();
 
   private SentenceDetectorEvaluationMonitor[] listeners;
@@ -41,11 +43,25 @@ public class SDCrossValidator {
   private SentenceDetectorFactory sdFactory;
 
   public SDCrossValidator(String languageCode, TrainingParameters params,
-      SentenceDetectorFactory sdFactory, SentenceDetectorEvaluationMonitor... 
listeners) {
+                          SentenceDetectorFactory sdFactory, Character 
defaultEOS,
+                          SentenceDetectorEvaluationMonitor... listeners) {
     this.languageCode = languageCode;
     this.params = params;
     this.listeners = listeners;
     this.sdFactory = sdFactory;
+    this.defaultEOS = defaultEOS;
+  }
+
+  /**
+   * @deprecated Use
+   *             {@link #SDCrossValidator(String, TrainingParameters,
+   *             SentenceDetectorFactory, Character, 
SentenceDetectorEvaluationMonitor...)}
+   *             and pass in the default EOS character explicitly.
+   */
+  @Deprecated
+  public SDCrossValidator(String languageCode, TrainingParameters params,
+                          SentenceDetectorFactory sdFactory, 
SentenceDetectorEvaluationMonitor... listeners) {
+    this(languageCode, params, sdFactory, '\n', listeners);
   }
 
   /**
@@ -54,6 +70,7 @@ public class SDCrossValidator {
    *             SentenceDetectorFactory, 
SentenceDetectorEvaluationMonitor...)}
    *             and pass in a {@link SentenceDetectorFactory}.
    */
+  @Deprecated
   public SDCrossValidator(String languageCode, TrainingParameters params) {
     this(languageCode, params, new SentenceDetectorFactory(languageCode, true,
         null, null));
@@ -65,6 +82,7 @@ public class SDCrossValidator {
    *             SentenceDetectorEvaluationMonitor...)}
    *             instead and pass in a TrainingParameters object.
    */
+  @Deprecated
   public SDCrossValidator(String languageCode, TrainingParameters params,
       SentenceDetectorEvaluationMonitor... listeners) {
     this(languageCode, params, new SentenceDetectorFactory(languageCode, true,
@@ -76,6 +94,7 @@ public class SDCrossValidator {
    *     SentenceDetectorFactory, SentenceDetectorEvaluationMonitor...)}
    *     instead and pass in a TrainingParameters object.
    */
+  @Deprecated
   public SDCrossValidator(String languageCode) {
     this(languageCode, ModelUtil.createDefaultTrainingParameters());
   }
@@ -103,7 +122,7 @@ public class SDCrossValidator {
       SentenceModel model;
 
       model = SentenceDetectorME.train(languageCode, trainingSampleStream,
-          sdFactory, params);
+          sdFactory, params, defaultEOS);
 
       // do testing
       SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator(

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java
index 6f3aad8..aefb54a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java
@@ -20,6 +20,7 @@ package opennlp.tools.sentdetect;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
+import java.util.List;
 
 import opennlp.tools.ml.model.Event;
 import opennlp.tools.util.AbstractEventStream;
@@ -28,6 +29,7 @@ import opennlp.tools.util.Span;
 
 public class SDEventStream extends AbstractEventStream<SentenceSample> {
 
+  private final Character defaultEOS;
   private SDContextGenerator cg;
   private EndOfSentenceScanner scanner;
 
@@ -37,21 +39,40 @@ public class SDEventStream extends 
AbstractEventStream<SentenceSample> {
    * @param samples
    */
   public SDEventStream(ObjectStream<SentenceSample> samples, 
SDContextGenerator cg,
-      EndOfSentenceScanner scanner) {
+                       EndOfSentenceScanner scanner, Character defaultEOS) {
     super(samples);
 
     this.cg = cg;
     this.scanner = scanner;
+    this.defaultEOS = defaultEOS;
+  }
+
+  /**
+   * Initializes the current instance with NEW LINE as default EOS.
+   *
+   * @param samples
+   */
+  public SDEventStream(ObjectStream<SentenceSample> samples, 
SDContextGenerator cg,
+                       EndOfSentenceScanner scanner) {
+    super(samples);
+
+    this.cg = cg;
+    this.scanner = scanner;
+    this.defaultEOS = '\n';
   }
 
   @Override
   protected Iterator<Event> createEvents(SentenceSample sample) {
 
-    Collection<Event> events = new ArrayList<Event>();
+    Collection<Event> events = new ArrayList();
 
     for (Span sentenceSpan : sample.getSentences()) {
       String sentenceString = 
sentenceSpan.getCoveredText(sample.getDocument()).toString();
 
+      // The last position should be an EOS; if it is not, we add one.
+      sentenceString = addTrailingEosIfMissing(sentenceString);
+
+
       for (Iterator<Integer> it = scanner.getPositions(
           sentenceString).iterator(); it.hasNext();) {
 
@@ -69,4 +90,14 @@ public class SDEventStream extends 
AbstractEventStream<SentenceSample> {
 
     return events.iterator();
   }
+
+  /**
+   * Makes sure a training sentence ends with an end-of-sentence character.
+   *
+   * @param sentenceString the text of one sentence
+   * @return {@code sentenceString} unchanged if the scanner reports an EOS
+   *         position within its last two characters, or if no default EOS
+   *         is configured; otherwise {@code sentenceString} followed by the
+   *         default EOS character
+   */
+  protected String addTrailingEosIfMissing(String sentenceString) {
+    // A null default EOS would be concatenated as the literal text "null".
+    if (defaultEOS == null) {
+      return sentenceString;
+    }
+    // Clamp the start index: a sentence shorter than two characters would
+    // otherwise make substring() throw StringIndexOutOfBoundsException.
+    int tailStart = Math.max(0, sentenceString.length() - 2);
+    List<Integer> positions = scanner.getPositions(sentenceString.substring(tailStart));
+    if (!positions.isEmpty()) {
+      // the sentence already ends with an EOS
+      return sentenceString;
+    }
+    return sentenceString + defaultEOS;
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index b5ad804..c76342e 100644
--- 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -299,7 +299,7 @@ public class SentenceDetectorME implements SentenceDetector 
{
 
   /**
    * @deprecated Use
-   *             {@link #train(String, ObjectStream, SentenceDetectorFactory, 
TrainingParameters)}
+   *             {@link #train(String, ObjectStream, SentenceDetectorFactory, 
TrainingParameters, Character)}
    *             and pass in af {@link SentenceDetectorFactory}.
    */
   public static SentenceModel train(String languageCode,
@@ -307,18 +307,25 @@ public class SentenceDetectorME implements 
SentenceDetector {
       Dictionary abbreviations, TrainingParameters mlParams) throws 
IOException {
     SentenceDetectorFactory sdFactory = new SentenceDetectorFactory(
         languageCode, useTokenEnd, abbreviations, null);
-    return train(languageCode, samples, sdFactory, mlParams);
+    return train(languageCode, samples, sdFactory, mlParams, null);
+  }
+
+  public static SentenceModel train(String languageCode,
+                                    ObjectStream<SentenceSample> samples, 
SentenceDetectorFactory sdFactory,
+                                    TrainingParameters mlParams) throws 
IOException {
+
+    return train(languageCode, samples, sdFactory, mlParams, '\n');
   }
 
   public static SentenceModel train(String languageCode,
       ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory,
-      TrainingParameters mlParams) throws IOException {
+      TrainingParameters mlParams, Character defaultEOS) throws IOException {
 
-    Map<String, String> manifestInfoEntries = new HashMap<>();
+    Map<String, String> manifestInfoEntries = new HashMap();
 
     // TODO: Fix the EventStream to throw exceptions when training goes wrong
     ObjectStream<Event> eventStream = new SDEventStream(samples,
-        sdFactory.getSDContextGenerator(), 
sdFactory.getEndOfSentenceScanner());
+        sdFactory.getSDContextGenerator(), 
sdFactory.getEndOfSentenceScanner(), defaultEOS);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, 
manifestInfoEntries);
 
@@ -329,7 +336,7 @@ public class SentenceDetectorME implements SentenceDetector 
{
 
   /**
    * @deprecated Use
-   *             {@link #train(String, ObjectStream, SentenceDetectorFactory, 
TrainingParameters)}
+   *             {@link #train(String, ObjectStream, SentenceDetectorFactory, 
TrainingParameters, Character)}
    *             and pass in af {@link SentenceDetectorFactory}.
    */
   @Deprecated

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java 
b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
index 4a34229..f4959ea 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java
@@ -29,13 +29,13 @@ import 
opennlp.tools.sentdetect.lang.th.SentenceContextGenerator;
 public class Factory {
 
   public static final char[] ptEosCharacters = new char[] { '.', '?', '!', ';',
-      ':', '(', ')', '«', '»', '\'', '"' };
+      ':', '(', ')', '«', '»', '\'', '"', '\n'};
 
-  public static final char[] defaultEosCharacters = new char[] { '.', '!', '?' 
};
+  public static final char[] defaultEosCharacters = new char[] { '.', '!', 
'?', '\n'};
 
   public static final char[] thEosCharacters = new char[] { ' ','\n' };
 
-  public static final char[] jpEosCharacters = new char[] {'。', '!', 
'?'};
+  public static final char[] jpEosCharacters = new char[] {'。', '!', 
'?', '\n'};
 
   public EndOfSentenceScanner createEndOfSentenceScanner(String languageCode) {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java
----------------------------------------------------------------------
diff --git 
a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java 
b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java
index 138e915..25a8add 100644
--- 
a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java
+++ 
b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java
@@ -42,11 +42,7 @@ public class SDEventStreamTest {
     ObjectStream<SentenceSample> sampleStream =
         ObjectStreamUtils.createObjectStream(sample);
 
-    Factory factory = new Factory();
-
-    ObjectStream<Event> eventStream = new SDEventStream(sampleStream,
-        factory.createSentenceContextGenerator("eng"),
-        factory.createEndOfSentenceScanner("eng"));
+    ObjectStream<Event> eventStream = createSDEventStream(sampleStream,"eng", 
'\n');
 
     Assert.assertEquals(SentenceDetectorME.NO_SPLIT, 
eventStream.read().getOutcome());
     Assert.assertEquals(SentenceDetectorME.SPLIT, 
eventStream.read().getOutcome());
@@ -55,4 +51,31 @@ public class SDEventStreamTest {
 
     Assert.assertNull(eventStream.read());
   }
+
+  /**
+   * Checks that the default EOS character is appended only to a sentence
+   * that does not already end with an EOS character.
+   */
+  @Test
+  public void testInsertDefaultEOS() throws IOException {
+    Span firstSentence = new Span(0, 14);
+    Span secondSentence = new Span(15, 27);
+    SentenceSample sample =
+        new SentenceSample("Test sent. one Test sent. 2", firstSentence, secondSentence);
+
+    SDEventStream stream =
+        createSDEventStream(ObjectStreamUtils.createObjectStream(sample), "eng", '\n');
+
+    String withoutEos = "abc";
+    Assert.assertEquals(withoutEos + "\n", stream.addTrailingEosIfMissing(withoutEos));
+
+    String withEos = "abc.";
+    Assert.assertEquals(withEos, stream.addTrailingEosIfMissing(withEos));
+  }
+
+  /**
+   * Builds an {@link SDEventStream} over the given samples, taking the
+   * context generator and the end-of-sentence scanner for
+   * {@code languageCode} from a new {@link Factory}.
+   */
+  private SDEventStream createSDEventStream(ObjectStream<SentenceSample> sampleStream,
+                                            String languageCode, Character defaultEOS) {
+    Factory langFactory = new Factory();
+    SDContextGenerator contextGenerator =
+        langFactory.createSentenceContextGenerator(languageCode);
+    EndOfSentenceScanner eosScanner = langFactory.createEndOfSentenceScanner(languageCode);
+    return new SDEventStream(sampleStream, contextGenerator, eosScanner, defaultEOS);
+  }
 }

Reply via email to