This is an automated email from the ASF dual-hosted git repository.
thygesen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 5bd8230 OPENNLP-1131: Ignore hidden files in Leipzig format parser
5bd8230 is described below
commit 5bd82306a8e44e2b9ff098f29ebf4c69c195b9ad
Author: thygesen <[email protected]>
AuthorDate: Thu Sep 14 12:46:33 2017 +0200
OPENNLP-1131: Ignore hidden files in Leipzig format parser
---
.../leipzig/LeipzigLanguageSampleStream.java | 13 +++--
.../leipzig/LeipzigLanguageSampleStreamTest.java | 56 ++++++++++++++++++++++
.../opennlp/tools/formats/leipzig/samples/.hidden | 1 +
.../tools/formats/leipzig/samples/123-skipped.txt | 1 +
.../formats/leipzig/samples/dan-sentences.txt | 3 ++
.../leipzig/samples/dontread/xxx-sentences.txt | 3 ++
.../formats/leipzig/samples/eng-sentences.txt | 2 +
7 files changed, 76 insertions(+), 3 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
index 9374a20..eff2935 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -18,6 +18,7 @@
package opennlp.tools.formats.leipzig;
import java.io.File;
+import java.io.FileFilter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@@ -125,9 +126,15 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
final int samplesPerLanguage) throws IOException {
this.sentencesPerSample = sentencesPerSample;
- // TODO: Use a FileFilter to make this more reliable in case there are
- // files which should be ignored or are shorter than 3 chars for the lang detect substring
- sentencesFiles = leipzigFolder.listFiles();
+
+ sentencesFiles = leipzigFolder.listFiles(new FileFilter() {
+ @Override
+ public boolean accept(File pathname) {
+ return !pathname.isHidden() && pathname.isFile()
+ && pathname.getName().length() >= 3
+ && pathname.getName().substring(0,3).matches("[a-z]+");
+ }
+ });
Arrays.sort(sentencesFiles);
Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
new file mode 100644
index 0000000..b03291f
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Tests for the {@link LeipzigLanguageSampleStream} class.
+ */
+public class LeipzigLanguageSampleStreamTest {
+
+ @Test
+ public void testReadSentenceFiles() {
+ String testDataPath = LeipzigLanguageSampleStreamTest.class
+ .getClassLoader().getResource("opennlp/tools/formats/leipzig/samples").getPath();
+ int samplesPerLanguage = 2;
+ int sentencesPerSample = 1;
+ try {
+
+ LeipzigLanguageSampleStream stream = new LeipzigLanguageSampleStream(new File(testDataPath),
+ sentencesPerSample, samplesPerLanguage);
+ int count = 0;
+ LanguageSample sample = null;
+ while ((sample = stream.read()) != null) {
+ count++;
+ System.out.println(sample.getContext());
+ }
+ Assert.assertEquals(4, count);
+
+ } catch (IOException e) {
+ Assert.fail();
+ }
+ }
+
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/.hidden b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/.hidden
new file mode 100644
index 0000000..736877f
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/.hidden
@@ -0,0 +1 @@
+Nothing in here
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/123-skipped.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/123-skipped.txt
new file mode 100644
index 0000000..570ad53
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/123-skipped.txt
@@ -0,0 +1 @@
+skip this file
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dan-sentences.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dan-sentences.txt
new file mode 100644
index 0000000..6f01de8
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dan-sentences.txt
@@ -0,0 +1,3 @@
+1 Der var engang en mand.
+2 Der boede i en spand.
+3 Spanden var af ler.
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dontread/xxx-sentences.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dontread/xxx-sentences.txt
new file mode 100644
index 0000000..4a5a581
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/dontread/xxx-sentences.txt
@@ -0,0 +1,3 @@
+1 This sentence should not be read.
+2 The same goes for this sentence.
+3 If we got this far then something went wrong!
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/eng-sentences.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/eng-sentences.txt
new file mode 100644
index 0000000..7102384
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/leipzig/samples/eng-sentences.txt
@@ -0,0 +1,2 @@
+1 This is a sentence.
+2 This is another sentences.
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].