Revision: 20207 http://sourceforge.net/p/gate/code/20207 Author: johann_p Date: 2017-04-18 15:04:37 +0000 (Tue, 18 Apr 2017) Log Message: ----------- Make the -i option accept either a directory or a file. If the path given for the -i option is a directory, behavior is unchanged: the documents within that directory and all of its subdirectories are processed. Otherwise, the path is assumed to be a list file containing the relative paths of the documents to process, where each path is interpreted relative to the directory containing the list file. If an output directory is specified, the same relative paths are used to create the output documents beneath it.
Modified Paths: -------------- gcp/trunk/doc/gcp-guide.pdf gcp/trunk/doc/install-and-run.tex gcp/trunk/gcp-direct.sh gcp/trunk/src/gate/cloud/batch/BatchRunner.java Modified: gcp/trunk/doc/gcp-guide.pdf =================================================================== (Binary files differ) Modified: gcp/trunk/doc/install-and-run.tex =================================================================== --- gcp/trunk/doc/install-and-run.tex 2017-04-11 15:11:32 UTC (rev 20206) +++ gcp/trunk/doc/install-and-run.tex 2017-04-18 15:04:37 UTC (rev 20207) @@ -140,11 +140,16 @@ (GATE XML format) or ``finf'' (FastInfoset format). To use FastInfoset the GATE \verb!Format_FastInfoset! plugin must be loaded by the saved application. -\item[-i] the directory in which to look for the input files. All files in +\item[-i] the directory in which to look for the input files or a file that contains + relative path names to the input files. If this points to a directory, all files in this directory and any subdirectories will be processed (except for standard backup and temporary file name patterns and source control metadata -- see \url{http://ant.apache.org/manual/dirtasks.html#defaultexcludes} for - details). + details). If this points to a file, the content of the file is expected to be + one relative file path per line, using UTF-8 encoding. The file paths are + interpreted to be relative to the directory that contains the list file. + If processed documents are written, then this will also be their relative + path to the output directory. \item[-o] (optional) the directory in which to place the output files. Each input file will generate an output file with the same name in the output directory. 
If this option is missing, and the option \texttt{-b} is missing as well, Modified: gcp/trunk/gcp-direct.sh =================================================================== --- gcp/trunk/gcp-direct.sh 2017-04-11 15:11:32 UTC (rev 20206) +++ gcp/trunk/gcp-direct.sh 2017-04-18 15:04:37 UTC (rev 20207) @@ -81,5 +81,6 @@ fi shift done - +echo JVM parameters used ${jvmparams[@]} +echo GCP parameters used ${gcpparams[@]} "$JAVA_HOME/bin/java" -Dgcp.home="${SCRIPTDIR}" -Djava.protocol.handler.pkgs=gate.cloud.util.protocols -cp "${GCP_CLASSPATH}" "${jvmparams[@]}" gate.cloud.batch.BatchRunner "${gcpparams[@]}" Modified: gcp/trunk/src/gate/cloud/batch/BatchRunner.java =================================================================== --- gcp/trunk/src/gate/cloud/batch/BatchRunner.java 2017-04-11 15:11:32 UTC (rev 20206) +++ gcp/trunk/src/gate/cloud/batch/BatchRunner.java 2017-04-18 15:04:37 UTC (rev 20207) @@ -56,6 +56,8 @@ import org.apache.log4j.Logger; import com.sun.jna.Platform; +import static gate.cloud.io.IOConstants.PARAM_BATCH_FILE_LOCATION; +import gate.cloud.io.ListDocumentEnumerator; import static gate.cloud.io.IOConstants.PARAM_COMPRESSION; import static gate.cloud.io.IOConstants.PARAM_DOCUMENT_ROOT; import static gate.cloud.io.IOConstants.PARAM_ENCODING; @@ -64,6 +66,7 @@ import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_GZIP; import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_NONE; import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_SNAPPY; +import static gate.cloud.io.ListDocumentEnumerator.PARAM_FILE_NAME; import gate.cloud.io.file.JSONOutputHandler; import static gate.cloud.io.file.JSONOutputHandler.PARAM_ANNOTATION_TYPE_PROPERTY; import static gate.cloud.io.file.JSONOutputHandler.PARAM_GROUP_ENTITIES_BY; @@ -520,7 +523,7 @@ // TODO: may be useful to be able to override the default user config and // session files here? 
options.addOption("b","batchFile",true,"Batch file (required, replaces -i, -o, -x, -r, -I)"); - options.addOption("i","inputDirectory",true,"Input directory (required, unless -b given)"); + options.addOption("i","inputDirectoryOrFile",true,"Input directory or file listing document IDs (required, unless -b given)"); options.addOption("f","outputFormat",true,"Output format, optional, one of 'xml'|'gatexml', 'finf', 'ser', 'json', default is 'finf'"); options.addOption("o","outputDirectory",true,"Output directory (not output if missing)"); options.addOption("x","executePipeline",true,"Pipeline/application file to execute (required, unless -b given)"); @@ -710,10 +713,24 @@ } else { aBatch.setBatchId("GcpBatchId"); } - // set the input Handler + // set the input Handler, depending on the value of the option "i": + // If this points to a directory, we process all matching files in that + // directory, if it points to a file we process all files listed in + // that file by interpreting each line as a file path relative to + // the directory where the specified file is located in. 
+ String fileOrDir = line.getOptionValue('i'); + File fileOrDirFile = new File(fileOrDir); + if(!fileOrDirFile.exists()) { + throw new RuntimeException("ERROR file or directory does not exist: "+fileOrDirFile.getAbsolutePath()); + } String inputHandlerClassName = "gate.cloud.io.file.FileInputHandler"; Map<String,String> configData = new HashMap<String, String>(); - configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i')); + if(fileOrDirFile.isDirectory()) { + configData.put(PARAM_DOCUMENT_ROOT, fileOrDir); + } else { + // if we have a file, use the parent directory + configData.put(PARAM_DOCUMENT_ROOT, fileOrDirFile.getParent()); + } if(line.hasOption("ci")) { configData.put(PARAM_COMPRESSION,VALUE_COMPRESSION_GZIP); } else if(line.hasOption("si")) { @@ -781,18 +798,33 @@ outHandler.init(); // log.info("Have output handler: "+outHandler); outHandlers.add(outHandler); - } // if option -o is given + } else { // if option -o is given + log.info("WARNING: no option -o, processed documents are discarded!"); + } aBatch.setOutputHandlers(outHandlers); - String enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator"; + String enumeratorClassName = null; + configData = new HashMap<String, String>(); + if(fileOrDirFile.isDirectory()) { + log.info("Enumerating all file IDs in directory: "+fileOrDirFile.getAbsolutePath()); + enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator"; + configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i')); + } else { + log.info("Reading file IDs from file: "+fileOrDirFile.getAbsolutePath()); + enumeratorClassName = "gate.cloud.io.ListDocumentEnumerator"; + configData.put(PARAM_BATCH_FILE_LOCATION,new File(".").getAbsolutePath()); + configData.put(PARAM_FILE_NAME, fileOrDir); + configData.put(PARAM_ENCODING,"UTF-8"); + } Class<? 
extends DocumentEnumerator> enumeratorClass = Class.forName(enumeratorClassName, true, Gate.getClassLoader()) .asSubclass(DocumentEnumerator.class); - configData = new HashMap<String, String>(); - configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i')); - List<DocumentID> docIds = new LinkedList<DocumentID>(); DocumentEnumerator enumerator = enumeratorClass.newInstance(); enumerator.config(configData); enumerator.init(); + // TODO: this should really not be done like this! + // Instead of reading the docIds in all at once, they should + // get streamed to the workers on demand, if at all possible? + List<DocumentID> docIds = new LinkedList<DocumentID>(); while(enumerator.hasNext()) { DocumentID id = enumerator.next(); // log.info("Adding document: "+id); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs