This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new c9440e68 OPENNLP-1702: BratDocumentStream should process files in
bratCorpusDir deterministically - fixes this by sorting all candidate files in
the directory lexicographically - extracts constants where applicable
c9440e68 is described below
commit c9440e68e098654a9f4315c816d0f0dd0cbef7b5
Author: Martin Wiesner <[email protected]>
AuthorDate: Fri Jan 24 15:41:46 2025 +0100
OPENNLP-1702: BratDocumentStream should process files in bratCorpusDir
deterministically
- fixes this by sorting all candidate files in the directory lexicographically
- extracts constants where applicable
---
.../tools/formats/brat/BratDocumentStream.java | 29 +++++++++-------------
1 file changed, 12 insertions(+), 17 deletions(-)
diff --git
a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
index 5abaf0ea..36d6d287 100644
---
a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
+++
b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
@@ -23,6 +23,7 @@ import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
@@ -32,6 +33,9 @@ import opennlp.tools.util.ObjectStream;
public class BratDocumentStream implements ObjectStream<BratDocument> {
+ private static final String SUFFIX_ANN = ".ann";
+ private static final String SUFFIX_TXT = ".txt";
+
private final AnnotationConfiguration config;
private List<String> documentIds = new LinkedList<>();
private Iterator<String> documentIdIterator;
@@ -45,7 +49,7 @@ public class BratDocumentStream implements
ObjectStream<BratDocument> {
* to find training data files.
* @param fileFilter a custom file filter to filter out certain files or
null to accept all files
*
- * @throws IOException if reading from the brat directory fails in anyway
+ * @throws IOException if reading from the brat directory fails in any way.
*/
public BratDocumentStream(AnnotationConfiguration config, File
bratCorpusDirectory,
boolean searchRecursive, FileFilter fileFilter) throws IOException {
@@ -54,24 +58,20 @@ public class BratDocumentStream implements
ObjectStream<BratDocument> {
throw new IOException("Input corpus directory must be a directory " +
"according to File.isDirectory()!");
}
-
this.config = config;
Stack<File> directoryStack = new Stack<>();
directoryStack.add(bratCorpusDirectory);
-
while (!directoryStack.isEmpty()) {
- for (File file : directoryStack.pop().listFiles(fileFilter)) {
-
+ final File[] files = directoryStack.pop().listFiles(fileFilter);
+ Arrays.sort(files);
+ for (File file : files) {
if (file.isFile()) {
String annFilePath = file.getAbsolutePath();
- if (annFilePath.endsWith(".ann")) {
-
+ if (annFilePath.endsWith(SUFFIX_ANN)) {
// cutoff last 4 chars ...
String documentId = annFilePath.substring(0, annFilePath.length()
- 4);
-
- File txtFile = new File(documentId + ".txt");
-
+ File txtFile = new File(documentId + SUFFIX_TXT);
if (txtFile.exists() && txtFile.isFile()) {
documentIds.add(documentId);
}
@@ -82,24 +82,19 @@ public class BratDocumentStream implements
ObjectStream<BratDocument> {
}
}
}
-
reset();
}
@Override
public BratDocument read() throws IOException {
-
BratDocument doc = null;
-
if (documentIdIterator.hasNext()) {
String id = documentIdIterator.next();
-
- try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id
+ ".txt"));
- InputStream annIn = new BufferedInputStream(new FileInputStream(id +
".ann"))) {
+ try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id
+ SUFFIX_TXT));
+ InputStream annIn = new BufferedInputStream(new FileInputStream(id
+ SUFFIX_ANN))) {
doc = BratDocument.parseDocument(config, id, txtIn, annIn);
}
}
-
return doc;
}