This is an automated email from the ASF dual-hosted git repository.

thygesen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 9959b3c  OPENNLP-1132: Fail with exception if not enough lines in leipzig parser
9959b3c is described below

commit 9959b3c5a777845aa19040a5a761814e483a6483
Author: thygesen <[email protected]>
AuthorDate: Thu Sep 14 15:53:58 2017 +0200

    OPENNLP-1132: Fail with exception if not enough lines in leipzig parser
---
 .../leipzig/LeipzigLanguageSampleStream.java       | 10 ++++++--
 .../leipzig/LeipzigLanguageSampleStreamTest.java   | 27 +++++++++++++++-------
 2 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
index eff2935..28b2f64 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -36,6 +36,7 @@ import java.util.stream.IntStream;
 
 import opennlp.tools.langdetect.Language;
 import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.MarkableFileInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
@@ -56,14 +57,19 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
       // The file name contains the number of lines, but to make this more stable
       // the file is once scanned for the count even tough this is slower
       int totalLineCount = (int) Files.lines(sentencesFile.toPath()).count();
+      int requiredLines = sentencesPerSample * numberOfSamples;
+
+      if (totalLineCount < requiredLines)
+        throw new InvalidFormatException(
+                String.format("%s does not contain enough lines (%d lines < %d required lines).",
+                        sentencesFile.getPath(), totalLineCount, requiredLines));
 
       List<Integer> indexes = IntStream.range(0, totalLineCount)
           .boxed().collect(Collectors.toList());
 
       Collections.shuffle(indexes, random);
 
-      Set<Integer> selectedLines = new HashSet<>(
-          indexes.subList(0, sentencesPerSample * numberOfSamples));
+      Set<Integer> selectedLines = new HashSet<>(indexes.subList(0, requiredLines));
 
       List<String> sentences = new ArrayList<>();
 
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
index b03291f..b6efab4 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
@@ -23,29 +23,28 @@ import java.io.IOException;
 import org.junit.Assert;
 import org.junit.Test;
 
-import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InvalidFormatException;
 
 /**
  * Tests for the {@link LeipzigLanguageSampleStream} class.
  */
 public class LeipzigLanguageSampleStreamTest {
 
+  private static String testDataPath = LeipzigLanguageSampleStreamTest.class
+          .getClassLoader().getResource("opennlp/tools/formats/leipzig/samples").getPath();
+
   @Test
   public void testReadSentenceFiles() {
-    String testDataPath = LeipzigLanguageSampleStreamTest.class
-            .getClassLoader().getResource("opennlp/tools/formats/leipzig/samples").getPath();
+
     int samplesPerLanguage = 2;
     int sentencesPerSample = 1;
     try {
-
       LeipzigLanguageSampleStream stream = new LeipzigLanguageSampleStream(new File(testDataPath),
               sentencesPerSample, samplesPerLanguage);
       int count = 0;
-      LanguageSample sample = null;
-      while ((sample = stream.read()) != null) {
+      while (stream.read() != null)
         count++;
-        System.out.println(sample.getContext());
-      }
+
       Assert.assertEquals(4, count);
 
     } catch (IOException e) {
@@ -53,4 +52,16 @@ public class LeipzigLanguageSampleStreamTest {
     }
   }
 
+  @Test(expected = InvalidFormatException.class)
+  public void testNotEnoughSentences() throws IOException {
+    int samplesPerLanguage = 2;
+    int sentencesPerSample = 2;
+
+    LeipzigLanguageSampleStream stream =
+            new LeipzigLanguageSampleStream(new File(testDataPath),
+              sentencesPerSample, samplesPerLanguage);
+    while (stream.read() != null);
+
+  }
+
 }

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to