Revision: 20240
          http://sourceforge.net/p/gate/code/20240
Author:   ian_roberts
Date:     2017-06-26 17:06:35 +0000 (Mon, 26 Jun 2017)
Log Message:
-----------
Merged changes from trunk to make v2.8.1, depending on gate-core 8.4.1

Modified Paths: -------------- gcp/branches/2.8/build/ivy.xml gcp/branches/2.8/build.xml gcp/branches/2.8/doc/gcp-guide.pdf gcp/branches/2.8/doc/gcp-guide.tex gcp/branches/2.8/doc/install-and-run.tex gcp/branches/2.8/doc/introduction.tex gcp/branches/2.8/gcp-direct.sh gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java Property Changed: ---------------- gcp/branches/2.8/ Index: gcp/branches/2.8 =================================================================== --- gcp/branches/2.8 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8 2017-06-26 17:06:35 UTC (rev 20240) Property changes on: gcp/branches/2.8 ___________________________________________________________________ Modified: svn:mergeinfo ## -1,3 +1,4 ## /gcp/branches/2.2:15411 /gcp/branches/2.4:18004 -/gcp/branches/2.7:19937 \ No newline at end of property +/gcp/branches/2.7:19937 +/gcp/trunk:20207 \ No newline at end of property Modified: gcp/branches/2.8/build/ivy.xml =================================================================== --- gcp/branches/2.8/build/ivy.xml 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8/build/ivy.xml 2017-06-26 17:06:35 UTC (rev 20240) @@ -3,7 +3,7 @@ <info organisation="uk.ac.gate" module="gcp" - revision="2.8"> + revision="2.8.1"> <description homepage="http://gate.ac.uk/gcp" /> </info> @@ -13,7 +13,7 @@ <dependencies defaultconf="*->master(default),runtime(default)" > <!-- GATE --> - <dependency name="gate-core" org="uk.ac.gate" rev="8.4"> + <dependency name="gate-core" org="uk.ac.gate" rev="8.4.1"> <artifact name="gate-core" type="jar" /> </dependency> Modified: gcp/branches/2.8/build.xml =================================================================== --- gcp/branches/2.8/build.xml 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8/build.xml 2017-06-26 17:06:35 UTC (rev 20240) @@ -7,7 +7,7 @@ of a GATE application in parallel threads. 
</description> <property name="build.sysclasspath" value="ignore" /> - <property name="version" value="2.8" /> + <property name="version" value="2.8.1" /> <property name="src.dir" location="src" /> <property name="classes.dir" location="classes" /> Modified: gcp/branches/2.8/doc/gcp-guide.pdf =================================================================== (Binary files differ) Modified: gcp/branches/2.8/doc/gcp-guide.tex =================================================================== --- gcp/branches/2.8/doc/gcp-guide.tex 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8/doc/gcp-guide.tex 2017-06-26 17:06:35 UTC (rev 20240) @@ -4,7 +4,7 @@ \begin{document} \title{{\Huge {\textsc{The GATECloud Paralleliser (GCP)}}}\\ Large-scale multi-threaded processing with GATE Embedded \\ -{\small version 2.8} +{\small version 2.8.1} } \author{Ian Roberts, Valentin Tablan\\GATE Team} Modified: gcp/branches/2.8/doc/install-and-run.tex =================================================================== --- gcp/branches/2.8/doc/install-and-run.tex 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8/doc/install-and-run.tex 2017-06-26 17:06:35 UTC (rev 20240) @@ -140,11 +140,16 @@ (GATE XML format) or ``finf'' (FastInfoset format). To use FastInfoset the GATE \verb!Format_FastInfoset! plugin must be loaded by the saved application. -\item[-i] the directory in which to look for the input files. All files in +\item[-i] the directory in which to look for the input files or a file that contains + relative path names to the input files. If this points to a directory, all files in this directory and any subdirectories will be processed (except for standard backup and temporary file name patterns and source control metadata -- see \url{http://ant.apache.org/manual/dirtasks.html#defaultexcludes} for - details). + details). If this points to a file, the content of the file is expected to be + one relative file path per line, using UTF-8 encoding. 
The file paths are + interpreted to be relative to the directory that contains the list file. + If processed documents are written, then this will also be their relative + path to the output directory. \item[-o] (optional) the directory in which to place the output files. Each input file will generate an output file with the same name in the output directory. If this option is missing, and the option \texttt{-b} is missing as well, Modified: gcp/branches/2.8/doc/introduction.tex =================================================================== --- gcp/branches/2.8/doc/introduction.tex 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8/doc/introduction.tex 2017-06-26 17:06:35 UTC (rev 20240) @@ -138,6 +138,12 @@ %% option -o for gcp-direct is not required any more, if missing, documents %% are not saved. +\subsection{2.8.1 (June 2017)} + +GCP now depends on GATE Embedded 8.4.1. Also the \verb!-i! option to +\verb!gcp-direct.sh! can now be a \emph{file} which lists the documents to +process, instead of just a \emph{directory} of documents. + \subsection{2.8 (February 2017)} GCP now depends on GATE Embedded 8.4. 
Modified: gcp/branches/2.8/gcp-direct.sh =================================================================== --- gcp/branches/2.8/gcp-direct.sh 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8/gcp-direct.sh 2017-06-26 17:06:35 UTC (rev 20240) @@ -81,5 +81,6 @@ fi shift done - +echo JVM parameters used ${jvmparams[@]} +echo GCP parameters used ${gcpparams[@]} "$JAVA_HOME/bin/java" -Dgcp.home="${SCRIPTDIR}" -Djava.protocol.handler.pkgs=gate.cloud.util.protocols -cp "${GCP_CLASSPATH}" "${jvmparams[@]}" gate.cloud.batch.BatchRunner "${gcpparams[@]}" Modified: gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java =================================================================== --- gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java 2017-06-23 16:32:55 UTC (rev 20239) +++ gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java 2017-06-26 17:06:35 UTC (rev 20240) @@ -56,6 +56,8 @@ import org.apache.log4j.Logger; import com.sun.jna.Platform; +import static gate.cloud.io.IOConstants.PARAM_BATCH_FILE_LOCATION; +import gate.cloud.io.ListDocumentEnumerator; import static gate.cloud.io.IOConstants.PARAM_COMPRESSION; import static gate.cloud.io.IOConstants.PARAM_DOCUMENT_ROOT; import static gate.cloud.io.IOConstants.PARAM_ENCODING; @@ -64,6 +66,7 @@ import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_GZIP; import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_NONE; import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_SNAPPY; +import static gate.cloud.io.ListDocumentEnumerator.PARAM_FILE_NAME; import gate.cloud.io.file.JSONOutputHandler; import static gate.cloud.io.file.JSONOutputHandler.PARAM_ANNOTATION_TYPE_PROPERTY; import static gate.cloud.io.file.JSONOutputHandler.PARAM_GROUP_ENTITIES_BY; @@ -520,7 +523,7 @@ // TODO: may be useful to be able to override the default user config and // session files here? 
options.addOption("b","batchFile",true,"Batch file (required, replaces -i, -o, -x, -r, -I)"); - options.addOption("i","inputDirectory",true,"Input directory (required, unless -b given)"); + options.addOption("i","inputDirectoryOrFile",true,"Input directory or file listing document IDs (required, unless -b given)"); options.addOption("f","outputFormat",true,"Output format, optional, one of 'xml'|'gatexml', 'finf', 'ser', 'json', default is 'finf'"); options.addOption("o","outputDirectory",true,"Output directory (not output if missing)"); options.addOption("x","executePipeline",true,"Pipeline/application file to execute (required, unless -b given)"); @@ -710,10 +713,24 @@ } else { aBatch.setBatchId("GcpBatchId"); } - // set the input Handler + // set the input Handler, depending on the value of the option "i": + // If this points to a directory, we process all matching files in that + // directory, if it points to a file we process all files listed in + // that file by interpreting each line as a file path relative to + // the directory where the specified file is located in. 
+ String fileOrDir = line.getOptionValue('i'); + File fileOrDirFile = new File(fileOrDir); + if(!fileOrDirFile.exists()) { + throw new RuntimeException("ERROR file or directory does not exist: "+fileOrDirFile.getAbsolutePath()); + } String inputHandlerClassName = "gate.cloud.io.file.FileInputHandler"; Map<String,String> configData = new HashMap<String, String>(); - configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i')); + if(fileOrDirFile.isDirectory()) { + configData.put(PARAM_DOCUMENT_ROOT, fileOrDir); + } else { + // if we have a file, use the parent directory + configData.put(PARAM_DOCUMENT_ROOT, fileOrDirFile.getParent()); + } if(line.hasOption("ci")) { configData.put(PARAM_COMPRESSION,VALUE_COMPRESSION_GZIP); } else if(line.hasOption("si")) { @@ -781,18 +798,33 @@ outHandler.init(); // log.info("Have output handler: "+outHandler); outHandlers.add(outHandler); - } // if option -o is given + } else { // if option -o is given + log.info("WARNING: no option -o, processed documents are discarded!"); + } aBatch.setOutputHandlers(outHandlers); - String enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator"; + String enumeratorClassName = null; + configData = new HashMap<String, String>(); + if(fileOrDirFile.isDirectory()) { + log.info("Enumerating all file IDs in directory: "+fileOrDirFile.getAbsolutePath()); + enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator"; + configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i')); + } else { + log.info("Reading file IDs from file: "+fileOrDirFile.getAbsolutePath()); + enumeratorClassName = "gate.cloud.io.ListDocumentEnumerator"; + configData.put(PARAM_BATCH_FILE_LOCATION,new File(".").getAbsolutePath()); + configData.put(PARAM_FILE_NAME, fileOrDir); + configData.put(PARAM_ENCODING,"UTF-8"); + } Class<? 
extends DocumentEnumerator> enumeratorClass = Class.forName(enumeratorClassName, true, Gate.getClassLoader()) .asSubclass(DocumentEnumerator.class); - configData = new HashMap<String, String>(); - configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i')); - List<DocumentID> docIds = new LinkedList<DocumentID>(); DocumentEnumerator enumerator = enumeratorClass.newInstance(); enumerator.config(configData); enumerator.init(); + // TODO: this should really not be done like this! + // Instead of reading the docIds in all at once, they should + // get streamed to the workers on demand, if at all possible? + List<DocumentID> docIds = new LinkedList<DocumentID>(); while(enumerator.hasNext()) { DocumentID id = enumerator.next(); // log.info("Adding document: "+id); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. ------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot _______________________________________________ GATE-cvs mailing list GATE-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/gate-cvs