Repository: opennlp Updated Branches: refs/heads/master c2097051c -> b5b6d5c27
OPENNLP-1082: Add EOS to SDEventStream if missing closes apache/opennlp#234 Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/b5b6d5c2 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/b5b6d5c2 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/b5b6d5c2 Branch: refs/heads/master Commit: b5b6d5c27443e1837b80b089206aad480852cd1c Parents: c209705 Author: William D C M SILVA <[email protected]> Authored: Thu Jun 22 00:15:59 2017 -0300 Committer: William D C M SILVA <[email protected]> Committed: Thu Jun 22 00:15:59 2017 -0300 ---------------------------------------------------------------------- .../opennlp/tools/cmdline/ArgumentParser.java | 16 +++++++++ .../SentenceDetectorCrossValidatorTool.java | 9 ++++- .../sentdetect/SentenceDetectorTrainerTool.java | 9 ++++- .../cmdline/sentdetect/TrainingParams.java | 5 +++ .../tools/sentdetect/SDCrossValidator.java | 23 +++++++++++-- .../opennlp/tools/sentdetect/SDEventStream.java | 35 ++++++++++++++++++-- .../tools/sentdetect/SentenceDetectorME.java | 19 +++++++---- .../opennlp/tools/sentdetect/lang/Factory.java | 6 ++-- .../tools/sentdetect/SDEventStreamTest.java | 33 +++++++++++++++--- 9 files changed, 135 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java index 8243560..e05a682 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java @@ -127,6 +127,21 @@ public class ArgumentParser { } } + private static class CharacterArgumentFactory 
implements ArgumentFactory { + + public Object parseArgument(Method method, String argName, String argValue) { + if (argValue != null) { + char[] chars = argValue.toCharArray(); + if (chars.length != 1) { + throw new TerminateToolException(1, String.format(INVALID_ARG, argName, argValue) + + "Character should have size 1."); + } + return new Character(chars[0]); + } + return null; + } + } + private static class ArgumentProxy implements InvocationHandler { private final Map<String, Object> arguments; @@ -154,6 +169,7 @@ public class ArgumentParser { factories.put(String.class, new StringArgumentFactory()); factories.put(File.class, new FileArgumentFactory()); factories.put(Charset.class, new CharsetArgumentFactory()); + factories.put(Character.class, new CharacterArgumentFactory()); argumentFactories = Collections.unmodifiableMap(factories); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java index 55d1df6..3a28254 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java @@ -67,11 +67,18 @@ public final class SentenceDetectorCrossValidatorTool eos = eosString.toCharArray(); } + Character defaultEOS; + if (params.getDefaultEosChar() != null) { + defaultEOS = params.getDefaultEosChar(); + } else { + defaultEOS = '\n'; + } + try { Dictionary abbreviations = SentenceDetectorTrainerTool.loadDict(params.getAbbDict()); SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create( params.getFactory(), 
params.getLang(), true, abbreviations, eos); - validator = new SDCrossValidator(params.getLang(), mlParams, sdFactory, + validator = new SDCrossValidator(params.getLang(), mlParams, sdFactory, defaultEOS, errorListener); validator.evaluate(sampleStream, params.getFolds()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java index cdd6916..b63de0f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java @@ -84,6 +84,13 @@ public final class SentenceDetectorTrainerTool eos = eosString.toCharArray(); } + Character defaultEOS; + if (params.getDefaultEosChar() != null) { + defaultEOS = params.getDefaultEosChar(); + } else { + defaultEOS = '\n'; + } + SentenceModel model; try { @@ -91,7 +98,7 @@ public final class SentenceDetectorTrainerTool SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create( params.getFactory(), params.getLang(), true, dict, eos); model = SentenceDetectorME.train(params.getLang(), sampleStream, - sdFactory, mlParams); + sdFactory, mlParams, defaultEOS); } catch (IOException e) { throw createTerminationIOException(e); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java index 
fbdf4db..5b7289a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java @@ -38,6 +38,11 @@ interface TrainingParams extends BasicTrainingParams { @OptionalParameter String getEosChars(); + @ParameterDescription(valueName = "string", description = "EOS character to use if EOS is " + + "missing in sample. Default is \\n.") + @OptionalParameter + Character getDefaultEosChar(); + @ParameterDescription(valueName = "factoryName", description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.") @OptionalParameter http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java index 2f6daec..cb43896 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java @@ -34,6 +34,8 @@ public class SDCrossValidator { private final TrainingParameters params; + private final Character defaultEOS; + private FMeasure fmeasure = new FMeasure(); private SentenceDetectorEvaluationMonitor[] listeners; @@ -41,11 +43,25 @@ public class SDCrossValidator { private SentenceDetectorFactory sdFactory; public SDCrossValidator(String languageCode, TrainingParameters params, - SentenceDetectorFactory sdFactory, SentenceDetectorEvaluationMonitor... listeners) { + SentenceDetectorFactory sdFactory, Character defaultEOS, + SentenceDetectorEvaluationMonitor... 
listeners) { this.languageCode = languageCode; this.params = params; this.listeners = listeners; this.sdFactory = sdFactory; + this.defaultEOS = defaultEOS; + } + + /** + * @deprecated Use + * {@link #SDCrossValidator(String, TrainingParameters, + * SentenceDetectorFactory, Character, SentenceDetectorEvaluationMonitor...)} + * and pass in a {@link SentenceDetectorFactory}. + */ + @Deprecated + public SDCrossValidator(String languageCode, TrainingParameters params, + SentenceDetectorFactory sdFactory, SentenceDetectorEvaluationMonitor... listeners) { + this(languageCode, params, sdFactory, '\n', listeners); } /** @@ -54,6 +70,7 @@ public class SDCrossValidator { * SentenceDetectorFactory, SentenceDetectorEvaluationMonitor...)} * and pass in a {@link SentenceDetectorFactory}. */ + @Deprecated public SDCrossValidator(String languageCode, TrainingParameters params) { this(languageCode, params, new SentenceDetectorFactory(languageCode, true, null, null)); @@ -65,6 +82,7 @@ public class SDCrossValidator { * SentenceDetectorEvaluationMonitor...)} * instead and pass in a TrainingParameters object. */ + @Deprecated public SDCrossValidator(String languageCode, TrainingParameters params, SentenceDetectorEvaluationMonitor... listeners) { this(languageCode, params, new SentenceDetectorFactory(languageCode, true, @@ -76,6 +94,7 @@ public class SDCrossValidator { * SentenceDetectorFactory, SentenceDetectorEvaluationMonitor...)} * instead and pass in a TrainingParameters object. 
*/ + @Deprecated public SDCrossValidator(String languageCode) { this(languageCode, ModelUtil.createDefaultTrainingParameters()); } @@ -103,7 +122,7 @@ public class SDCrossValidator { SentenceModel model; model = SentenceDetectorME.train(languageCode, trainingSampleStream, - sdFactory, params); + sdFactory, params, defaultEOS); // do testing SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator( http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java index 6f3aad8..aefb54a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java @@ -20,6 +20,7 @@ package opennlp.tools.sentdetect; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; +import java.util.List; import opennlp.tools.ml.model.Event; import opennlp.tools.util.AbstractEventStream; @@ -28,6 +29,7 @@ import opennlp.tools.util.Span; public class SDEventStream extends AbstractEventStream<SentenceSample> { + private final Character defaultEOS; private SDContextGenerator cg; private EndOfSentenceScanner scanner; @@ -37,21 +39,40 @@ public class SDEventStream extends AbstractEventStream<SentenceSample> { * @param samples */ public SDEventStream(ObjectStream<SentenceSample> samples, SDContextGenerator cg, - EndOfSentenceScanner scanner) { + EndOfSentenceScanner scanner, Character defaultEOS) { super(samples); this.cg = cg; this.scanner = scanner; + this.defaultEOS = defaultEOS; + } + + /** + * Initializes the current instance with NEW LINE as default EOS. 
+ * + * @param samples + */ + public SDEventStream(ObjectStream<SentenceSample> samples, SDContextGenerator cg, + EndOfSentenceScanner scanner) { + super(samples); + + this.cg = cg; + this.scanner = scanner; + this.defaultEOS = '\n'; } @Override protected Iterator<Event> createEvents(SentenceSample sample) { - Collection<Event> events = new ArrayList<Event>(); + Collection<Event> events = new ArrayList(); for (Span sentenceSpan : sample.getSentences()) { String sentenceString = sentenceSpan.getCoveredText(sample.getDocument()).toString(); + // last position should be a EOS, if not we add it. + sentenceString = addTrailingEosIfMissing(sentenceString); + + for (Iterator<Integer> it = scanner.getPositions( sentenceString).iterator(); it.hasNext();) { @@ -69,4 +90,14 @@ public class SDEventStream extends AbstractEventStream<SentenceSample> { return events.iterator(); } + + protected String addTrailingEosIfMissing(String sentenceString) { + List<Integer> positions = scanner.getPositions( + sentenceString.substring(sentenceString.length() - 2)); + if (positions.size() > 0) { + // trailing is a EOS + return sentenceString; + } + return sentenceString + defaultEOS; + } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java index b5ad804..c76342e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -299,7 +299,7 @@ public class SentenceDetectorME implements SentenceDetector { /** * @deprecated Use - * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters)} + * {@link #train(String, ObjectStream, 
SentenceDetectorFactory, TrainingParameters, Character)} * and pass in af {@link SentenceDetectorFactory}. */ public static SentenceModel train(String languageCode, @@ -307,18 +307,25 @@ public class SentenceDetectorME implements SentenceDetector { Dictionary abbreviations, TrainingParameters mlParams) throws IOException { SentenceDetectorFactory sdFactory = new SentenceDetectorFactory( languageCode, useTokenEnd, abbreviations, null); - return train(languageCode, samples, sdFactory, mlParams); + return train(languageCode, samples, sdFactory, mlParams, null); + } + + public static SentenceModel train(String languageCode, + ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory, + TrainingParameters mlParams) throws IOException { + + return train(languageCode, samples, sdFactory, mlParams, '\n'); } public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory, - TrainingParameters mlParams) throws IOException { + TrainingParameters mlParams, Character defaultEOS) throws IOException { - Map<String, String> manifestInfoEntries = new HashMap<>(); + Map<String, String> manifestInfoEntries = new HashMap(); // TODO: Fix the EventStream to throw exceptions when training goes wrong ObjectStream<Event> eventStream = new SDEventStream(samples, - sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner()); + sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner(), defaultEOS); EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries); @@ -329,7 +336,7 @@ public class SentenceDetectorME implements SentenceDetector { /** * @deprecated Use - * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters)} + * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters, Character)} * and pass in af {@link SentenceDetectorFactory}. 
*/ @Deprecated http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java index 4a34229..f4959ea 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java @@ -29,13 +29,13 @@ import opennlp.tools.sentdetect.lang.th.SentenceContextGenerator; public class Factory { public static final char[] ptEosCharacters = new char[] { '.', '?', '!', ';', - ':', '(', ')', '«', '»', '\'', '"' }; + ':', '(', ')', '«', '»', '\'', '"', '\n'}; - public static final char[] defaultEosCharacters = new char[] { '.', '!', '?' }; + public static final char[] defaultEosCharacters = new char[] { '.', '!', '?', '\n'}; public static final char[] thEosCharacters = new char[] { ' ','\n' }; - public static final char[] jpEosCharacters = new char[] {'。', '！', '？'}; + public static final char[] jpEosCharacters = new char[] {'。', '！', '？', '\n'}; public EndOfSentenceScanner createEndOfSentenceScanner(String languageCode) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/b5b6d5c2/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java index 138e915..25a8add 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java @@ -42,11 +42,7 @@ public class SDEventStreamTest { ObjectStream<SentenceSample> sampleStream = 
ObjectStreamUtils.createObjectStream(sample); - Factory factory = new Factory(); - - ObjectStream<Event> eventStream = new SDEventStream(sampleStream, - factory.createSentenceContextGenerator("eng"), - factory.createEndOfSentenceScanner("eng")); + ObjectStream<Event> eventStream = createSDEventStream(sampleStream,"eng", '\n'); Assert.assertEquals(SentenceDetectorME.NO_SPLIT, eventStream.read().getOutcome()); Assert.assertEquals(SentenceDetectorME.SPLIT, eventStream.read().getOutcome()); @@ -55,4 +51,31 @@ public class SDEventStreamTest { Assert.assertNull(eventStream.read()); } + + @Test + public void testInsertDefaultEOS() throws IOException { + + String document = "Test sent. one Test sent. 2"; + SentenceSample sample = new SentenceSample(document, + new Span(0, 14), new Span(15, 27)); + + ObjectStream<SentenceSample> sampleStream = + ObjectStreamUtils.createObjectStream(sample); + + + SDEventStream eventStream = createSDEventStream(sampleStream,"eng", '\n'); + + String sent = "abc"; + Assert.assertEquals(sent + "\n", eventStream.addTrailingEosIfMissing(sent)); + sent = "abc."; + Assert.assertEquals(sent, eventStream.addTrailingEosIfMissing(sent)); + } + + private SDEventStream createSDEventStream(ObjectStream<SentenceSample> sampleStream, + String languageCode, Character defaultEOS) { + Factory factory = new Factory(); + return new SDEventStream(sampleStream, + factory.createSentenceContextGenerator(languageCode), + factory.createEndOfSentenceScanner(languageCode), defaultEOS); + } }
