OPENNLP-1076: Add validation of spans to SentenceSample
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d378c065 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d378c065 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d378c065 Branch: refs/heads/LangDetect Commit: d378c0656ff2374a867abe0383aa841275a47d8d Parents: 226612f Author: Jörn Kottmann <[email protected]> Authored: Wed May 24 12:10:37 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Wed May 24 12:10:37 2017 +0200 ---------------------------------------------------------------------- .../main/java/opennlp/tools/sentdetect/SentenceSample.java | 9 +++++++++ .../java/opennlp/tools/sentdetect/SentenceSampleTest.java | 7 ++++++- 2 files changed, 15 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/d378c065/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java index dbbd193..7891cfd 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java @@ -45,6 +45,15 @@ public class SentenceSample { public SentenceSample(CharSequence document, Span... sentences) { this.document = document.toString(); this.sentences = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(sentences))); + + // validate that all spans are inside the document text + for (Span sentence : sentences) { + if (sentence.getEnd() > document.length()) { + throw new IllegalArgumentException( + String.format("Sentence span is outside of document text [len %d] and span %s", + document.length(), sentence)); + } + } } public SentenceSample(Detokenizer detokenizer, String[][] sentences) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/d378c065/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java index 163cb73..2ec0978 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java @@ -29,7 +29,6 @@ public class SentenceSampleTest { @Test public void testRetrievingContent() { - SentenceSample sample = new SentenceSample("1. 2.", new Span(0, 2), new Span(3, 5)); @@ -38,6 +37,12 @@ public class SentenceSampleTest { Assert.assertEquals(new Span(3, 5), sample.getSentences()[1]); } + @Test(expected = IllegalArgumentException.class) + public void testInvalidSpansFailFast() { + SentenceSample sample = new SentenceSample("1. 2.", + new Span(0, 2), new Span(5, 7)); + } + @Test public void testEquals() { Assert.assertFalse(createGoldSample() == createGoldSample());
