Revision: 20207
          http://sourceforge.net/p/gate/code/20207
Author:   johann_p
Date:     2017-04-18 15:04:37 +0000 (Tue, 18 Apr 2017)
Log Message:
-----------
Make the -i option accept either a directory or a file.
If the path given for the -i option is a directory, the behaviour
is unchanged: the documents within that directory and all
subdirectories are processed. If the path is not a directory, it
is assumed to be a file listing the relative paths of the
documents to process; these paths are interpreted relative to
the directory containing the list file.
The same relative paths are used when creating the output
documents below the output directory, if one is specified.

Modified Paths:
--------------
    gcp/trunk/doc/gcp-guide.pdf
    gcp/trunk/doc/install-and-run.tex
    gcp/trunk/gcp-direct.sh
    gcp/trunk/src/gate/cloud/batch/BatchRunner.java

Modified: gcp/trunk/doc/gcp-guide.pdf
===================================================================
(Binary files differ)

Modified: gcp/trunk/doc/install-and-run.tex
===================================================================
--- gcp/trunk/doc/install-and-run.tex   2017-04-11 15:11:32 UTC (rev 20206)
+++ gcp/trunk/doc/install-and-run.tex   2017-04-18 15:04:37 UTC (rev 20207)
@@ -140,11 +140,16 @@
   (GATE XML format) or ``finf'' (FastInfoset format).  To use FastInfoset the
   GATE \verb!Format_FastInfoset! plugin must be loaded by the saved
   application.
-\item[-i] the directory in which to look for the input files.  All files in
+\item[-i] the directory in which to look for the input files or a file that contains
+  relative path names to the input files. If this points to a directory, all files in
   this directory and any subdirectories will be processed (except for standard
   backup and temporary file name patterns and source control metadata -- see
   \url{http://ant.apache.org/manual/dirtasks.html#defaultexcludes} for
-  details).
+  details). If this points to a file, the content of the file is expected to be
+  one relative file path per line, using UTF-8 encoding. The file paths are 
+  interpreted to be relative to the directory that contains the list file.
+  If processed documents are written, then this will also be their relative 
+  path to the output directory. 
 \item[-o] (optional) the directory in which to place the output files.  Each input file
   will generate an output file with the same name in the output directory.
   If this option is missing, and the option \texttt{-b} is missing as well,

Modified: gcp/trunk/gcp-direct.sh
===================================================================
--- gcp/trunk/gcp-direct.sh     2017-04-11 15:11:32 UTC (rev 20206)
+++ gcp/trunk/gcp-direct.sh     2017-04-18 15:04:37 UTC (rev 20207)
@@ -81,5 +81,6 @@
 fi
 shift
 done
-
+echo JVM parameters used ${jvmparams[@]}
+echo GCP parameters used ${gcpparams[@]}
 "$JAVA_HOME/bin/java" -Dgcp.home="${SCRIPTDIR}" 
-Djava.protocol.handler.pkgs=gate.cloud.util.protocols -cp "${GCP_CLASSPATH}" 
"${jvmparams[@]}" gate.cloud.batch.BatchRunner "${gcpparams[@]}"

Modified: gcp/trunk/src/gate/cloud/batch/BatchRunner.java
===================================================================
--- gcp/trunk/src/gate/cloud/batch/BatchRunner.java     2017-04-11 15:11:32 UTC 
(rev 20206)
+++ gcp/trunk/src/gate/cloud/batch/BatchRunner.java     2017-04-18 15:04:37 UTC 
(rev 20207)
@@ -56,6 +56,8 @@
 import org.apache.log4j.Logger;
 
 import com.sun.jna.Platform;
+import static gate.cloud.io.IOConstants.PARAM_BATCH_FILE_LOCATION;
+import gate.cloud.io.ListDocumentEnumerator;
 import static gate.cloud.io.IOConstants.PARAM_COMPRESSION;
 import static gate.cloud.io.IOConstants.PARAM_DOCUMENT_ROOT;
 import static gate.cloud.io.IOConstants.PARAM_ENCODING;
@@ -64,6 +66,7 @@
 import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_GZIP;
 import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_NONE;
 import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_SNAPPY;
+import static gate.cloud.io.ListDocumentEnumerator.PARAM_FILE_NAME;
 import gate.cloud.io.file.JSONOutputHandler;
 import static 
gate.cloud.io.file.JSONOutputHandler.PARAM_ANNOTATION_TYPE_PROPERTY;
 import static gate.cloud.io.file.JSONOutputHandler.PARAM_GROUP_ENTITIES_BY;
@@ -520,7 +523,7 @@
     // TODO: may be useful to be able to override the default user config and
     // session files here?
     options.addOption("b","batchFile",true,"Batch file (required, replaces -i, 
-o, -x, -r, -I)");
-    options.addOption("i","inputDirectory",true,"Input directory (required, 
unless -b given)");
+    options.addOption("i","inputDirectoryOrFile",true,"Input directory or file 
listing document IDs (required, unless -b given)");
     options.addOption("f","outputFormat",true,"Output format, optional, one of 
'xml'|'gatexml', 'finf', 'ser', 'json', default is 'finf'");
     options.addOption("o","outputDirectory",true,"Output directory (not output 
if missing)");
     options.addOption("x","executePipeline",true,"Pipeline/application file to 
execute (required, unless -b given)");
@@ -710,10 +713,24 @@
           } else {
             aBatch.setBatchId("GcpBatchId");
           }
-          // set the input Handler
+          // set the input Handler, depending on the value of the option "i":
+          // If this points to a directory, we process all matching files in that
+          // directory, if it points to a file we process all files listed in
+          // that file by interpreting each line as a file path relative to 
+          // the directory where the specified file is located in.
+          String fileOrDir = line.getOptionValue('i');
+          File fileOrDirFile = new File(fileOrDir);
+          if(!fileOrDirFile.exists()) {
+            throw new RuntimeException("ERROR file or directory does not 
exist: "+fileOrDirFile.getAbsolutePath());
+          }
           String inputHandlerClassName = "gate.cloud.io.file.FileInputHandler";
           Map<String,String> configData = new HashMap<String, String>();
-          configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));
+          if(fileOrDirFile.isDirectory()) {
+            configData.put(PARAM_DOCUMENT_ROOT, fileOrDir);
+          } else {
+            // if we have a file, use the parent directory
+            configData.put(PARAM_DOCUMENT_ROOT, fileOrDirFile.getParent());
+          }
           if(line.hasOption("ci")) {
             configData.put(PARAM_COMPRESSION,VALUE_COMPRESSION_GZIP);          
  
           } else if(line.hasOption("si"))  {
@@ -781,18 +798,33 @@
             outHandler.init();
             // log.info("Have output handler: "+outHandler);            
             outHandlers.add(outHandler);
-          } // if option -o is given
+          } else { // if option -o is given
+            log.info("WARNING: no option -o, processed documents are 
discarded!");
+          }
           aBatch.setOutputHandlers(outHandlers);
-          String enumeratorClassName = 
"gate.cloud.io.file.FileDocumentEnumerator";
+          String enumeratorClassName = null;
+          configData = new HashMap<String, String>();
+          if(fileOrDirFile.isDirectory()) {
+            log.info("Enumerating all file IDs in directory: 
"+fileOrDirFile.getAbsolutePath());
+            enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator";
+            configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));     
      
+          } else {
+            log.info("Reading file IDs from file: 
"+fileOrDirFile.getAbsolutePath());
+            enumeratorClassName = "gate.cloud.io.ListDocumentEnumerator";
+            configData.put(PARAM_BATCH_FILE_LOCATION,new 
File(".").getAbsolutePath());
+            configData.put(PARAM_FILE_NAME, fileOrDir);
+            configData.put(PARAM_ENCODING,"UTF-8");
+          }
           Class<? extends DocumentEnumerator> enumeratorClass =
                 Class.forName(enumeratorClassName, true, Gate.getClassLoader())
                         .asSubclass(DocumentEnumerator.class);
-          configData = new HashMap<String, String>();
-          configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));       
   
-          List<DocumentID> docIds = new LinkedList<DocumentID>();
           DocumentEnumerator enumerator = enumeratorClass.newInstance();
           enumerator.config(configData);
           enumerator.init();
+          // TODO: this should really not be done like this! 
+          // Instead of reading the docIds in all at once, they should 
+          // get streamed to the workers on demand, if at all possible?
+          List<DocumentID> docIds = new LinkedList<DocumentID>();
           while(enumerator.hasNext()) {
             DocumentID id = enumerator.next();
             // log.info("Adding document: "+id);

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to