Author: joern
Date: Thu Sep 17 12:53:24 2015
New Revision: 1703610

URL: http://svn.apache.org/r1703610
Log:
OPENNLP-819 Now reads multiple files from a directory and extracts the language 
from the file name

Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java?rev=1703610&r1=1703609&r2=1703610&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java Thu Sep 17 12:53:24 2015
@@ -17,23 +17,32 @@
 
 package opennlp.tools.formats;
 
+import java.io.File;
+import java.io.FilenameFilter;
 import java.io.IOException;
 
 import opennlp.tools.cmdline.ArgumentParser;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.StreamFactoryRegistry;
 import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
 import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.cmdline.params.EncodingParameter;
 import opennlp.tools.cmdline.params.LanguageParams;
 import opennlp.tools.doccat.DocumentSample;
 import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ObjectStreamUtils;
 
 /**
  * <b>Note:</b> Do not use this class, internal use only!
  */
-public class LeipzigDocumentSampleStreamFactory extends LanguageSampleStreamFactory<DocumentSample> {
+public class LeipzigDocumentSampleStreamFactory
+    extends AbstractSampleStreamFactory<DocumentSample> {
 
-  interface Parameters extends BasicFormatParams, LanguageParams {
+  interface Parameters extends EncodingParameter {
+    @ParameterDescription(valueName = "sentencesDir",
+        description = "dir with Leipig sentences to be used")
+    File getSentencesDir();
   }
 
   public static void registerFactory() {
@@ -48,13 +57,28 @@ public class LeipzigDocumentSampleStream
   public ObjectStream<DocumentSample> create(String[] args) {
 
     Parameters params = ArgumentParser.parse(args, Parameters.class);
-    language = params.getLang();
-
-    try {
-      return new LeipzigDoccatSampleStream(params.getLang(), 20,
-          CmdLineUtil.openInFile(params.getData()));
-    } catch (IOException e) {
-      throw new TerminateToolException(-1, "IO error while opening sample data: " + e.getMessage(), e);
+    File sentencesFileDir = params.getSentencesDir();
+    
+    File sentencesFiles[] = sentencesFileDir.listFiles(new FilenameFilter() {
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.contains("sentences") && name.endsWith(".txt");
+      }
+    });
+    
+    @SuppressWarnings("unchecked")
+    ObjectStream<DocumentSample> sampleStreams[] = 
+        new ObjectStream[sentencesFiles.length];
+
+    for (int i = 0; i < sentencesFiles.length; i++) {
+      try {
+        sampleStreams[i] = new LeipzigDoccatSampleStream(sentencesFiles[i].getName().substring(0, 3), 20,
+            CmdLineUtil.openInFile(sentencesFiles[i]));
+      } catch (IOException e) {
+        throw new TerminateToolException(-1, "IO error while opening sample data: " + e.getMessage(), e);
+      }
     }
+    
+    return ObjectStreamUtils.createObjectStream(sampleStreams);
   }
 }


Reply via email to