This is an automated email from the ASF dual-hosted git repository.

thygesen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 5bd8230  OPENNLP-1131: Ignore hidden files in Leipzig format parser
5bd8230 is described below

commit 5bd82306a8e44e2b9ff098f29ebf4c69c195b9ad
Author: thygesen <[email protected]>
AuthorDate: Thu Sep 14 12:46:33 2017 +0200

    OPENNLP-1131: Ignore hidden files in Leipzig format parser
---
 .../leipzig/LeipzigLanguageSampleStream.java       | 13 +++--
 .../leipzig/LeipzigLanguageSampleStreamTest.java   | 56 ++++++++++++++++++++++
 .../opennlp/tools/formats/leipzig/samples/.hidden  |  1 +
 .../tools/formats/leipzig/samples/123-skipped.txt  |  1 +
 .../formats/leipzig/samples/dan-sentences.txt      |  3 ++
 .../leipzig/samples/dontread/xxx-sentences.txt     |  3 ++
 .../formats/leipzig/samples/eng-sentences.txt      |  2 +
 7 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
index 9374a20..eff2935 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -18,6 +18,7 @@
 package opennlp.tools.formats.leipzig;
 
 import java.io.File;
+import java.io.FileFilter;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
@@ -125,9 +126,15 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
   public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
                                      final int samplesPerLanguage) throws IOException {
     this.sentencesPerSample = sentencesPerSample;
-    // TODO: Use a FileFilter to make this more reliable in case there are
-    //       files which should be ignored or are shorter than 3 chars for the lang detect substring
-    sentencesFiles = leipzigFolder.listFiles();
+
+    sentencesFiles = leipzigFolder.listFiles(new FileFilter() {
+      @Override
+      public boolean accept(File pathname) {
+        return !pathname.isHidden() && pathname.isFile()
+                && pathname.getName().length() >= 3
+                && pathname.getName().substring(0,3).matches("[a-z]+");
+      }
+    });
     Arrays.sort(sentencesFiles);
 
     Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
new file mode 100644
index 0000000..b03291f
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Tests for the {@link LeipzigLanguageSampleStream} class.
+ */
+public class LeipzigLanguageSampleStreamTest {
+
+  @Test
+  public void testReadSentenceFiles() {
+    String testDataPath = LeipzigLanguageSampleStreamTest.class
+            .getClassLoader().getResource("opennlp/tools/formats/leipzig/samples").getPath();
+    int samplesPerLanguage = 2;
+    int sentencesPerSample = 1;
+    try {
+
+      LeipzigLanguageSampleStream stream = new LeipzigLanguageSampleStream(new File(testDataPath),
+              sentencesPerSample, samplesPerLanguage);
+      int count = 0;
+      LanguageSample sample = null;
+      while ((sample = stream.read()) != null) {
+        count++;
+        System.out.println(sample.getContext());
+      }
+      Assert.assertEquals(4, count);
+
+    } catch (IOException e) {
+      Assert.fail();
+    }
+  }
+
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/.hidden b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/.hidden
new file mode 100644
index 0000000..736877f
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/.hidden
@@ -0,0 +1 @@
+Nothing in here
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/123-skipped.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/123-skipped.txt
new file mode 100644
index 0000000..570ad53
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/123-skipped.txt
@@ -0,0 +1 @@
+skip this file
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dan-sentences.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dan-sentences.txt
new file mode 100644
index 0000000..6f01de8
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dan-sentences.txt
@@ -0,0 +1,3 @@
+1      Der var engang en mand.
+2      Der boede i en spand.
+3      Spanden var af ler.
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dontread/xxx-sentences.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dontread/xxx-sentences.txt
new file mode 100644
index 0000000..4a5a581
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dontread/xxx-sentences.txt
@@ -0,0 +1,3 @@
+1      This sentence should not be read.
+2      The same goes for this sentence.
+3      If we got this far then something went wrong!
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/eng-sentences.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/eng-sentences.txt
new file mode 100644
index 0000000..7102384
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/eng-sentences.txt
@@ -0,0 +1,2 @@
+1      This is a sentence.
+2      This is another sentences.
\ No newline at end of file

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to