Repository: opennlp Updated Branches: refs/heads/master 2284819f3 -> 05a916ef1
Revert "OPENNLP-1082: Add EOS to SDEventStream if missing" This reverts commit b5b6d5c27443e1837b80b089206aad480852cd1c. Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/05a916ef Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/05a916ef Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/05a916ef Branch: refs/heads/master Commit: 05a916ef1726e4487e5aefaec1c170a3ee763895 Parents: 2284819 Author: Jörn Kottmann <[email protected]> Authored: Mon Jul 3 16:27:32 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Mon Jul 3 16:44:32 2017 +0200 ---------------------------------------------------------------------- .../opennlp/tools/cmdline/ArgumentParser.java | 18 ++-------- .../SentenceDetectorCrossValidatorTool.java | 9 +---- .../sentdetect/SentenceDetectorTrainerTool.java | 9 +---- .../cmdline/sentdetect/TrainingParams.java | 5 --- .../tools/sentdetect/SDCrossValidator.java | 23 ++----------- .../opennlp/tools/sentdetect/SDEventStream.java | 35 ++------------------ .../tools/sentdetect/SentenceDetectorME.java | 19 ++++------- .../opennlp/tools/sentdetect/lang/Factory.java | 6 ++-- .../tools/sentdetect/SDEventStreamTest.java | 33 +++--------------- 9 files changed, 22 insertions(+), 135 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java index 4d028cd..8243560 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/ArgumentParser.java @@ -127,21 +127,6 @@ public class ArgumentParser { } } - private static 
class CharacterArgumentFactory implements ArgumentFactory { - - public Object parseArgument(Method method, String argName, String argValue) { - if (argValue != null) { - char[] chars = argValue.toCharArray(); - if (chars.length != 1) { - throw new TerminateToolException(1, String.format(INVALID_ARG, argName, argValue) + - "Character should have size 1."); - } - return Character.valueOf(chars[0]); - } - return null; - } - } - private static class ArgumentProxy implements InvocationHandler { private final Map<String, Object> arguments; @@ -169,7 +154,6 @@ public class ArgumentParser { factories.put(String.class, new StringArgumentFactory()); factories.put(File.class, new FileArgumentFactory()); factories.put(Charset.class, new CharsetArgumentFactory()); - factories.put(Character.class, new CharacterArgumentFactory()); argumentFactories = Collections.unmodifiableMap(factories); } @@ -236,6 +220,7 @@ public class ArgumentParser { * @param argProxyInterface interface with parameter descriptions * @return the help message usage string */ + @SuppressWarnings({"unchecked"}) public static <T> String createUsage(Class<T> argProxyInterface) { return createUsage(new Class[]{argProxyInterface}); } @@ -399,6 +384,7 @@ public class ArgumentParser { * @param argProxyInterface interface with parameters description * @return true, if arguments are valid */ + @SuppressWarnings({"unchecked"}) public static <T> boolean validateArguments(String[] args, Class<T> argProxyInterface) { return validateArguments(args, new Class[]{argProxyInterface}); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java 
b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java index 3a28254..55d1df6 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorCrossValidatorTool.java @@ -67,18 +67,11 @@ public final class SentenceDetectorCrossValidatorTool eos = eosString.toCharArray(); } - Character defaultEOS; - if (params.getDefaultEosChar() != null) { - defaultEOS = params.getDefaultEosChar(); - } else { - defaultEOS = '\n'; - } - try { Dictionary abbreviations = SentenceDetectorTrainerTool.loadDict(params.getAbbDict()); SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create( params.getFactory(), params.getLang(), true, abbreviations, eos); - validator = new SDCrossValidator(params.getLang(), mlParams, sdFactory, defaultEOS, + validator = new SDCrossValidator(params.getLang(), mlParams, sdFactory, errorListener); validator.evaluate(sampleStream, params.getFolds()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java index b63de0f..cdd6916 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/SentenceDetectorTrainerTool.java @@ -84,13 +84,6 @@ public final class SentenceDetectorTrainerTool eos = eosString.toCharArray(); } - Character defaultEOS; - if (params.getDefaultEosChar() != null) { - defaultEOS = params.getDefaultEosChar(); - } else { - defaultEOS = '\n'; - } - SentenceModel model; try { @@ 
-98,7 +91,7 @@ public final class SentenceDetectorTrainerTool SentenceDetectorFactory sdFactory = SentenceDetectorFactory.create( params.getFactory(), params.getLang(), true, dict, eos); model = SentenceDetectorME.train(params.getLang(), sampleStream, - sdFactory, mlParams, defaultEOS); + sdFactory, mlParams); } catch (IOException e) { throw createTerminationIOException(e); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java index 5b7289a..fbdf4db 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/sentdetect/TrainingParams.java @@ -38,11 +38,6 @@ interface TrainingParams extends BasicTrainingParams { @OptionalParameter String getEosChars(); - @ParameterDescription(valueName = "string", description = "EOS character to use if EOS is " + - "missing in sample. 
Default is \\n.") - @OptionalParameter - Character getDefaultEosChar(); - @ParameterDescription(valueName = "factoryName", description = "A sub-class of SentenceDetectorFactory where to get implementation and resources.") @OptionalParameter http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java index cb43896..2f6daec 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDCrossValidator.java @@ -34,8 +34,6 @@ public class SDCrossValidator { private final TrainingParameters params; - private final Character defaultEOS; - private FMeasure fmeasure = new FMeasure(); private SentenceDetectorEvaluationMonitor[] listeners; @@ -43,25 +41,11 @@ public class SDCrossValidator { private SentenceDetectorFactory sdFactory; public SDCrossValidator(String languageCode, TrainingParameters params, - SentenceDetectorFactory sdFactory, Character defaultEOS, - SentenceDetectorEvaluationMonitor... listeners) { + SentenceDetectorFactory sdFactory, SentenceDetectorEvaluationMonitor... listeners) { this.languageCode = languageCode; this.params = params; this.listeners = listeners; this.sdFactory = sdFactory; - this.defaultEOS = defaultEOS; - } - - /** - * @deprecated Use - * {@link #SDCrossValidator(String, TrainingParameters, - * SentenceDetectorFactory, Character, SentenceDetectorEvaluationMonitor...)} - * and pass in a {@link SentenceDetectorFactory}. - */ - @Deprecated - public SDCrossValidator(String languageCode, TrainingParameters params, - SentenceDetectorFactory sdFactory, SentenceDetectorEvaluationMonitor... 
listeners) { - this(languageCode, params, sdFactory, '\n', listeners); } /** @@ -70,7 +54,6 @@ public class SDCrossValidator { * SentenceDetectorFactory, SentenceDetectorEvaluationMonitor...)} * and pass in a {@link SentenceDetectorFactory}. */ - @Deprecated public SDCrossValidator(String languageCode, TrainingParameters params) { this(languageCode, params, new SentenceDetectorFactory(languageCode, true, null, null)); @@ -82,7 +65,6 @@ public class SDCrossValidator { * SentenceDetectorEvaluationMonitor...)} * instead and pass in a TrainingParameters object. */ - @Deprecated public SDCrossValidator(String languageCode, TrainingParameters params, SentenceDetectorEvaluationMonitor... listeners) { this(languageCode, params, new SentenceDetectorFactory(languageCode, true, @@ -94,7 +76,6 @@ public class SDCrossValidator { * SentenceDetectorFactory, SentenceDetectorEvaluationMonitor...)} * instead and pass in a TrainingParameters object. */ - @Deprecated public SDCrossValidator(String languageCode) { this(languageCode, ModelUtil.createDefaultTrainingParameters()); } @@ -122,7 +103,7 @@ public class SDCrossValidator { SentenceModel model; model = SentenceDetectorME.train(languageCode, trainingSampleStream, - sdFactory, params, defaultEOS); + sdFactory, params); // do testing SentenceDetectorEvaluator evaluator = new SentenceDetectorEvaluator( http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java index aefb54a..a656143 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SDEventStream.java @@ -20,7 +20,6 @@ package opennlp.tools.sentdetect; import java.util.ArrayList; import 
java.util.Collection; import java.util.Iterator; -import java.util.List; import opennlp.tools.ml.model.Event; import opennlp.tools.util.AbstractEventStream; @@ -29,7 +28,6 @@ import opennlp.tools.util.Span; public class SDEventStream extends AbstractEventStream<SentenceSample> { - private final Character defaultEOS; private SDContextGenerator cg; private EndOfSentenceScanner scanner; @@ -39,40 +37,21 @@ public class SDEventStream extends AbstractEventStream<SentenceSample> { * @param samples */ public SDEventStream(ObjectStream<SentenceSample> samples, SDContextGenerator cg, - EndOfSentenceScanner scanner, Character defaultEOS) { + EndOfSentenceScanner scanner) { super(samples); this.cg = cg; this.scanner = scanner; - this.defaultEOS = defaultEOS; - } - - /** - * Initializes the current instance with NEW LINE as default EOS. - * - * @param samples - */ - public SDEventStream(ObjectStream<SentenceSample> samples, SDContextGenerator cg, - EndOfSentenceScanner scanner) { - super(samples); - - this.cg = cg; - this.scanner = scanner; - this.defaultEOS = '\n'; } @Override protected Iterator<Event> createEvents(SentenceSample sample) { - Collection<Event> events = new ArrayList(); + Collection<Event> events = new ArrayList<>(); for (Span sentenceSpan : sample.getSentences()) { String sentenceString = sentenceSpan.getCoveredText(sample.getDocument()).toString(); - // last position should be a EOS, if not we add it. 
- sentenceString = addTrailingEosIfMissing(sentenceString); - - for (Iterator<Integer> it = scanner.getPositions( sentenceString).iterator(); it.hasNext();) { @@ -90,14 +69,4 @@ public class SDEventStream extends AbstractEventStream<SentenceSample> { return events.iterator(); } - - protected String addTrailingEosIfMissing(String sentenceString) { - List<Integer> positions = scanner.getPositions( - sentenceString.substring(sentenceString.length() - 2)); - if (positions.size() > 0) { - // trailing is a EOS - return sentenceString; - } - return sentenceString + defaultEOS; - } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java index c76342e..b5ad804 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -299,7 +299,7 @@ public class SentenceDetectorME implements SentenceDetector { /** * @deprecated Use - * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters, Character)} + * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters)} * and pass in af {@link SentenceDetectorFactory}. 
*/ public static SentenceModel train(String languageCode, @@ -307,25 +307,18 @@ public class SentenceDetectorME implements SentenceDetector { Dictionary abbreviations, TrainingParameters mlParams) throws IOException { SentenceDetectorFactory sdFactory = new SentenceDetectorFactory( languageCode, useTokenEnd, abbreviations, null); - return train(languageCode, samples, sdFactory, mlParams, null); - } - - public static SentenceModel train(String languageCode, - ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory, - TrainingParameters mlParams) throws IOException { - - return train(languageCode, samples, sdFactory, mlParams, '\n'); + return train(languageCode, samples, sdFactory, mlParams); } public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory, - TrainingParameters mlParams, Character defaultEOS) throws IOException { + TrainingParameters mlParams) throws IOException { - Map<String, String> manifestInfoEntries = new HashMap(); + Map<String, String> manifestInfoEntries = new HashMap<>(); // TODO: Fix the EventStream to throw exceptions when training goes wrong ObjectStream<Event> eventStream = new SDEventStream(samples, - sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner(), defaultEOS); + sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner()); EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries); @@ -336,7 +329,7 @@ public class SentenceDetectorME implements SentenceDetector { /** * @deprecated Use - * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters, Character)} + * {@link #train(String, ObjectStream, SentenceDetectorFactory, TrainingParameters)} * and pass in af {@link SentenceDetectorFactory}. 
 */ @Deprecated http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java index f4959ea..4a34229 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/lang/Factory.java @@ -29,13 +29,13 @@ import opennlp.tools.sentdetect.lang.th.SentenceContextGenerator; public class Factory { public static final char[] ptEosCharacters = new char[] { '.', '?', '!', ';', - ':', '(', ')', '«', '»', '\'', '"', '\n'}; + ':', '(', ')', '«', '»', '\'', '"' }; - public static final char[] defaultEosCharacters = new char[] { '.', '!', '?', '\n'}; + public static final char[] defaultEosCharacters = new char[] { '.', '!', '?' }; public static final char[] thEosCharacters = new char[] { ' ','\n' }; - public static final char[] jpEosCharacters = new char[] {'。', '!', '?', '\n'}; + public static final char[] jpEosCharacters = new char[] {'。', '!', '?'}; public EndOfSentenceScanner createEndOfSentenceScanner(String languageCode) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/05a916ef/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java index 25a8add..138e915 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SDEventStreamTest.java @@ -42,7 +42,11 @@ public class SDEventStreamTest { ObjectStream<SentenceSample> sampleStream = 
ObjectStreamUtils.createObjectStream(sample); - ObjectStream<Event> eventStream = createSDEventStream(sampleStream,"eng", '\n'); + Factory factory = new Factory(); + + ObjectStream<Event> eventStream = new SDEventStream(sampleStream, + factory.createSentenceContextGenerator("eng"), + factory.createEndOfSentenceScanner("eng")); Assert.assertEquals(SentenceDetectorME.NO_SPLIT, eventStream.read().getOutcome()); Assert.assertEquals(SentenceDetectorME.SPLIT, eventStream.read().getOutcome()); @@ -51,31 +55,4 @@ public class SDEventStreamTest { Assert.assertNull(eventStream.read()); } - - @Test - public void testInsertDefaultEOS() throws IOException { - - String document = "Test sent. one Test sent. 2"; - SentenceSample sample = new SentenceSample(document, - new Span(0, 14), new Span(15, 27)); - - ObjectStream<SentenceSample> sampleStream = - ObjectStreamUtils.createObjectStream(sample); - - - SDEventStream eventStream = createSDEventStream(sampleStream,"eng", '\n'); - - String sent = "abc"; - Assert.assertEquals(sent + "\n", eventStream.addTrailingEosIfMissing(sent)); - sent = "abc."; - Assert.assertEquals(sent, eventStream.addTrailingEosIfMissing(sent)); - } - - private SDEventStream createSDEventStream(ObjectStream<SentenceSample> sampleStream, - String languageCode, Character defaultEOS) { - Factory factory = new Factory(); - return new SDEventStream(sampleStream, - factory.createSentenceContextGenerator(languageCode), - factory.createEndOfSentenceScanner(languageCode), defaultEOS); - } }
