This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new c9440e68 OPENNLP-1702: BratDocumentStream should process files in bratCorpusDir deterministically - fix by sorting all candidate files from dir lexicographically - extracts constants where applicable
c9440e68 is described below

commit c9440e68e098654a9f4315c816d0f0dd0cbef7b5
Author: Martin Wiesner <[email protected]>
AuthorDate: Fri Jan 24 15:41:46 2025 +0100

    OPENNLP-1702: BratDocumentStream should process files in bratCorpusDir deterministically
    - fix by sorting all candidate files from dir lexicographically
    - extracts constants where applicable
---
 .../tools/formats/brat/BratDocumentStream.java     | 29 +++++++++-------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
index 5abaf0ea..36d6d287 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
@@ -23,6 +23,7 @@ import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
@@ -32,6 +33,9 @@ import opennlp.tools.util.ObjectStream;
 
 public class BratDocumentStream implements ObjectStream<BratDocument> {
 
+  private static final String SUFFIX_ANN = ".ann";
+  private static final String SUFFIX_TXT = ".txt";
+
   private final AnnotationConfiguration config;
   private List<String> documentIds = new LinkedList<>();
   private Iterator<String> documentIdIterator;
@@ -45,7 +49,7 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
    *     to find training data files.
    * @param fileFilter  a custom file filter to filter out certain files or null to accept all files
    *
-   * @throws IOException if reading from the brat directory fails in anyway
+   * @throws IOException if reading from the brat directory fails in any way.
    */
   public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirectory,
       boolean searchRecursive, FileFilter fileFilter) throws IOException {
@@ -54,24 +58,20 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
       throw new IOException("Input corpus directory must be a directory " +
           "according to File.isDirectory()!");
     }
-
     this.config = config;
 
     Stack<File> directoryStack = new Stack<>();
     directoryStack.add(bratCorpusDirectory);
-
     while (!directoryStack.isEmpty()) {
-      for (File file : directoryStack.pop().listFiles(fileFilter)) {
-
+      final File[] files = directoryStack.pop().listFiles(fileFilter);
+      Arrays.sort(files);
+      for (File file : files) {
         if (file.isFile()) {
           String annFilePath = file.getAbsolutePath();
-          if (annFilePath.endsWith(".ann")) {
-
+          if (annFilePath.endsWith(SUFFIX_ANN)) {
             // cutoff last 4 chars ...
             String documentId = annFilePath.substring(0, annFilePath.length() - 4);
-
-            File txtFile = new File(documentId + ".txt");
-
+            File txtFile = new File(documentId + SUFFIX_TXT);
             if (txtFile.exists() && txtFile.isFile()) {
               documentIds.add(documentId);
             }
@@ -82,24 +82,19 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
         }
       }
     }
-
     reset();
   }
 
   @Override
   public BratDocument read() throws IOException {
-
     BratDocument doc = null;
-
     if (documentIdIterator.hasNext()) {
       String id = documentIdIterator.next();
-
-      try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + ".txt"));
-          InputStream annIn = new BufferedInputStream(new FileInputStream(id + ".ann"))) {
+      try (InputStream txtIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_TXT));
+           InputStream annIn = new BufferedInputStream(new FileInputStream(id + SUFFIX_ANN))) {
         doc = BratDocument.parseDocument(config, id, txtIn, annIn);
       }
     }
-
     return doc;
   }
 

Reply via email to