This is an automated email from the ASF dual-hosted git repository.
thygesen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 9959b3c OPENNLP-1132: Fail with exception if not enough lines in
leipzig parser
9959b3c is described below
commit 9959b3c5a777845aa19040a5a761814e483a6483
Author: thygesen <[email protected]>
AuthorDate: Thu Sep 14 15:53:58 2017 +0200
OPENNLP-1132: Fail with exception if not enough lines in leipzig parser
---
.../leipzig/LeipzigLanguageSampleStream.java | 10 ++++++--
.../leipzig/LeipzigLanguageSampleStreamTest.java | 27 +++++++++++++++-------
2 files changed, 27 insertions(+), 10 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
index eff2935..28b2f64 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -36,6 +36,7 @@ import java.util.stream.IntStream;
import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
@@ -56,14 +57,19 @@ public class LeipzigLanguageSampleStream implements
ObjectStream<LanguageSample>
// The file name contains the number of lines, but to make this more
stable
// the file is once scanned for the count even tough this is slower
int totalLineCount = (int) Files.lines(sentencesFile.toPath()).count();
+ int requiredLines = sentencesPerSample * numberOfSamples;
+
+ if (totalLineCount < requiredLines)
+ throw new InvalidFormatException(
+ String.format("%s does not contain enough lines (%d lines < %d
required lines).",
+ sentencesFile.getPath(), totalLineCount,
requiredLines));
List<Integer> indexes = IntStream.range(0, totalLineCount)
.boxed().collect(Collectors.toList());
Collections.shuffle(indexes, random);
- Set<Integer> selectedLines = new HashSet<>(
- indexes.subList(0, sentencesPerSample * numberOfSamples));
+ Set<Integer> selectedLines = new HashSet<>(indexes.subList(0,
requiredLines));
List<String> sentences = new ArrayList<>();
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
index b03291f..b6efab4 100644
---
a/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
+++
b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
@@ -23,29 +23,28 @@ import java.io.IOException;
import org.junit.Assert;
import org.junit.Test;
-import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InvalidFormatException;
/**
* Tests for the {@link LeipzigLanguageSampleStream} class.
*/
public class LeipzigLanguageSampleStreamTest {
+ private static String testDataPath = LeipzigLanguageSampleStreamTest.class
+
.getClassLoader().getResource("opennlp/tools/formats/leipzig/samples").getPath();
+
@Test
public void testReadSentenceFiles() {
- String testDataPath = LeipzigLanguageSampleStreamTest.class
-
.getClassLoader().getResource("opennlp/tools/formats/leipzig/samples").getPath();
+
int samplesPerLanguage = 2;
int sentencesPerSample = 1;
try {
-
LeipzigLanguageSampleStream stream = new LeipzigLanguageSampleStream(new
File(testDataPath),
sentencesPerSample, samplesPerLanguage);
int count = 0;
- LanguageSample sample = null;
- while ((sample = stream.read()) != null) {
+ while (stream.read() != null)
count++;
- System.out.println(sample.getContext());
- }
+
Assert.assertEquals(4, count);
} catch (IOException e) {
@@ -53,4 +52,16 @@ public class LeipzigLanguageSampleStreamTest {
}
}
+ @Test(expected = InvalidFormatException.class)
+ public void testNotEnoughSentences() throws IOException {
+ int samplesPerLanguage = 2;
+ int sentencesPerSample = 2;
+
+ LeipzigLanguageSampleStream stream =
+ new LeipzigLanguageSampleStream(new File(testDataPath),
+ sentencesPerSample, samplesPerLanguage);
+ while (stream.read() != null);
+
+ }
+
}
--
To stop receiving notification emails like this one, please contact
"[email protected]" <[email protected]>.