Revision: 20240
          http://sourceforge.net/p/gate/code/20240
Author:   ian_roberts
Date:     2017-06-26 17:06:35 +0000 (Mon, 26 Jun 2017)
Log Message:
-----------
Merged changes from trunk to make v2.8.1, depending on gate-core 8.4.1

Modified Paths:
--------------
    gcp/branches/2.8/build/ivy.xml
    gcp/branches/2.8/build.xml
    gcp/branches/2.8/doc/gcp-guide.pdf
    gcp/branches/2.8/doc/gcp-guide.tex
    gcp/branches/2.8/doc/install-and-run.tex
    gcp/branches/2.8/doc/introduction.tex
    gcp/branches/2.8/gcp-direct.sh
    gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java

Property Changed:
----------------
    gcp/branches/2.8/

Index: gcp/branches/2.8
===================================================================
--- gcp/branches/2.8    2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8    2017-06-26 17:06:35 UTC (rev 20240)

Property changes on: gcp/branches/2.8
___________________________________________________________________
Modified: svn:mergeinfo
## -1,3 +1,4 ##
 /gcp/branches/2.2:15411
 /gcp/branches/2.4:18004
-/gcp/branches/2.7:19937
\ No newline at end of property
+/gcp/branches/2.7:19937
+/gcp/trunk:20207
\ No newline at end of property
Modified: gcp/branches/2.8/build/ivy.xml
===================================================================
--- gcp/branches/2.8/build/ivy.xml      2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8/build/ivy.xml      2017-06-26 17:06:35 UTC (rev 20240)
@@ -3,7 +3,7 @@
   <info
     organisation="uk.ac.gate"
     module="gcp"
-    revision="2.8">
+    revision="2.8.1">
     <description homepage="http://gate.ac.uk/gcp" />
   </info>
 
@@ -13,7 +13,7 @@
 
   <dependencies defaultconf="*->master(default),runtime(default)" >
     <!-- GATE -->
-    <dependency name="gate-core" org="uk.ac.gate" rev="8.4">
+    <dependency name="gate-core" org="uk.ac.gate" rev="8.4.1">
       <artifact name="gate-core" type="jar" />
     </dependency>
   

Modified: gcp/branches/2.8/build.xml
===================================================================
--- gcp/branches/2.8/build.xml  2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8/build.xml  2017-06-26 17:06:35 UTC (rev 20240)
@@ -7,7 +7,7 @@
       of a GATE application in parallel threads.
   </description>
   <property name="build.sysclasspath" value="ignore" />
-  <property name="version" value="2.8" />
+  <property name="version" value="2.8.1" />
   
   <property name="src.dir" location="src" />
   <property name="classes.dir" location="classes" />

Modified: gcp/branches/2.8/doc/gcp-guide.pdf
===================================================================
(Binary files differ)

Modified: gcp/branches/2.8/doc/gcp-guide.tex
===================================================================
--- gcp/branches/2.8/doc/gcp-guide.tex  2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8/doc/gcp-guide.tex  2017-06-26 17:06:35 UTC (rev 20240)
@@ -4,7 +4,7 @@
 \begin{document}
 \title{{\Huge {\textsc{The GATECloud Paralleliser (GCP)}}}\\
 Large-scale multi-threaded processing with GATE Embedded \\
-{\small version 2.8}
+{\small version 2.8.1}
 }
 \author{Ian Roberts, Valentin Tablan\\GATE Team}
 

Modified: gcp/branches/2.8/doc/install-and-run.tex
===================================================================
--- gcp/branches/2.8/doc/install-and-run.tex    2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8/doc/install-and-run.tex    2017-06-26 17:06:35 UTC (rev 20240)
@@ -140,11 +140,16 @@
   (GATE XML format) or ``finf'' (FastInfoset format).  To use FastInfoset the
   GATE \verb!Format_FastInfoset! plugin must be loaded by the saved
   application.
-\item[-i] the directory in which to look for the input files.  All files in
+\item[-i] the directory in which to look for the input files or a file that contains
+  relative path names to the input files. If this points to a directory, all files in
   this directory and any subdirectories will be processed (except for standard
   backup and temporary file name patterns and source control metadata -- see
   \url{http://ant.apache.org/manual/dirtasks.html#defaultexcludes} for
-  details).
+  details). If this points to a file, the content of the file is expected to be 
+  one relative file path per line, using UTF-8 encoding. The file paths are 
+  interpreted to be relative to the directory that contains the list file.
+  If processed documents are written, then this will also be their relative 
+  path to the output directory. 
 \item[-o] (optional) the directory in which to place the output files.  Each input file
   will generate an output file with the same name in the output directory.
   If this option is missing, and the option \texttt{-b} is missing as well,

Modified: gcp/branches/2.8/doc/introduction.tex
===================================================================
--- gcp/branches/2.8/doc/introduction.tex       2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8/doc/introduction.tex       2017-06-26 17:06:35 UTC (rev 20240)
@@ -138,6 +138,12 @@
 %% option -o for gcp-direct is not required any more, if missing, documents
 %% are not saved.
 
+\subsection{2.8.1 (June 2017)}
+
+GCP now depends on GATE Embedded 8.4.1.  Also the \verb!-i! option to
+\verb!gcp-direct.sh! can now be a \emph{file} which lists the documents to
+process, instead of just a \emph{directory} of documents.
+
 \subsection{2.8 (February 2017)}
 
 GCP now depends on GATE Embedded 8.4.

Modified: gcp/branches/2.8/gcp-direct.sh
===================================================================
--- gcp/branches/2.8/gcp-direct.sh      2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8/gcp-direct.sh      2017-06-26 17:06:35 UTC (rev 20240)
@@ -81,5 +81,6 @@
 fi
 shift
 done
-
+echo JVM parameters used ${jvmparams[@]}
+echo GCP parameters used ${gcpparams[@]}
 "$JAVA_HOME/bin/java" -Dgcp.home="${SCRIPTDIR}" -Djava.protocol.handler.pkgs=gate.cloud.util.protocols -cp "${GCP_CLASSPATH}" "${jvmparams[@]}" gate.cloud.batch.BatchRunner "${gcpparams[@]}"

Modified: gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java
===================================================================
--- gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java      2017-06-23 16:32:55 UTC (rev 20239)
+++ gcp/branches/2.8/src/gate/cloud/batch/BatchRunner.java      2017-06-26 17:06:35 UTC (rev 20240)
@@ -56,6 +56,8 @@
 import org.apache.log4j.Logger;
 
 import com.sun.jna.Platform;
+import static gate.cloud.io.IOConstants.PARAM_BATCH_FILE_LOCATION;
+import gate.cloud.io.ListDocumentEnumerator;
 import static gate.cloud.io.IOConstants.PARAM_COMPRESSION;
 import static gate.cloud.io.IOConstants.PARAM_DOCUMENT_ROOT;
 import static gate.cloud.io.IOConstants.PARAM_ENCODING;
@@ -64,6 +66,7 @@
 import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_GZIP;
 import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_NONE;
 import static gate.cloud.io.IOConstants.VALUE_COMPRESSION_SNAPPY;
+import static gate.cloud.io.ListDocumentEnumerator.PARAM_FILE_NAME;
 import gate.cloud.io.file.JSONOutputHandler;
 import static gate.cloud.io.file.JSONOutputHandler.PARAM_ANNOTATION_TYPE_PROPERTY;
 import static gate.cloud.io.file.JSONOutputHandler.PARAM_GROUP_ENTITIES_BY;
@@ -520,7 +523,7 @@
     // TODO: may be useful to be able to override the default user config and
     // session files here?
     options.addOption("b","batchFile",true,"Batch file (required, replaces -i, -o, -x, -r, -I)");
-    options.addOption("i","inputDirectory",true,"Input directory (required, unless -b given)");
+    options.addOption("i","inputDirectoryOrFile",true,"Input directory or file listing document IDs (required, unless -b given)");
     options.addOption("f","outputFormat",true,"Output format, optional, one of 'xml'|'gatexml', 'finf', 'ser', 'json', default is 'finf'");
     options.addOption("o","outputDirectory",true,"Output directory (not output if missing)");
     options.addOption("x","executePipeline",true,"Pipeline/application file to execute (required, unless -b given)");
@@ -710,10 +713,24 @@
           } else {
             aBatch.setBatchId("GcpBatchId");
           }
-          // set the input Handler
+          // set the input Handler, depending on the value of the option "i":
+          // If this points to a directory, we process all matching files in that
+          // directory, if it points to a file we process all files listed in
+          // that file by interpreting each line as a file path relative to 
+          // the directory where the specified file is located in.
+          String fileOrDir = line.getOptionValue('i');
+          File fileOrDirFile = new File(fileOrDir);
+          if(!fileOrDirFile.exists()) {
+            throw new RuntimeException("ERROR file or directory does not exist: "+fileOrDirFile.getAbsolutePath());
+          }
           String inputHandlerClassName = "gate.cloud.io.file.FileInputHandler";
           Map<String,String> configData = new HashMap<String, String>();
-          configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));
+          if(fileOrDirFile.isDirectory()) {
+            configData.put(PARAM_DOCUMENT_ROOT, fileOrDir);
+          } else {
+            // if we have a file, use the parent directory
+            configData.put(PARAM_DOCUMENT_ROOT, fileOrDirFile.getParent());
+          }
           if(line.hasOption("ci")) {
             configData.put(PARAM_COMPRESSION,VALUE_COMPRESSION_GZIP);          
  
           } else if(line.hasOption("si"))  {
@@ -781,18 +798,33 @@
             outHandler.init();
             // log.info("Have output handler: "+outHandler);            
             outHandlers.add(outHandler);
-          } // if option -o is given
+          } else { // if option -o is given
+            log.info("WARNING: no option -o, processed documents are discarded!");
+          }
           aBatch.setOutputHandlers(outHandlers);
-          String enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator";
+          String enumeratorClassName = null;
+          configData = new HashMap<String, String>();
+          if(fileOrDirFile.isDirectory()) {
+            log.info("Enumerating all file IDs in directory: "+fileOrDirFile.getAbsolutePath());
+            enumeratorClassName = "gate.cloud.io.file.FileDocumentEnumerator";
+            configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));     
      
+          } else {
+            log.info("Reading file IDs from file: "+fileOrDirFile.getAbsolutePath());
+            enumeratorClassName = "gate.cloud.io.ListDocumentEnumerator";
+            configData.put(PARAM_BATCH_FILE_LOCATION,new File(".").getAbsolutePath());
+            configData.put(PARAM_FILE_NAME, fileOrDir);
+            configData.put(PARAM_ENCODING,"UTF-8");
+          }
           Class<? extends DocumentEnumerator> enumeratorClass =
                 Class.forName(enumeratorClassName, true, Gate.getClassLoader())
                         .asSubclass(DocumentEnumerator.class);
-          configData = new HashMap<String, String>();
-          configData.put(PARAM_DOCUMENT_ROOT, line.getOptionValue('i'));       
   
-          List<DocumentID> docIds = new LinkedList<DocumentID>();
           DocumentEnumerator enumerator = enumeratorClass.newInstance();
           enumerator.config(configData);
           enumerator.init();
+          // TODO: this should really not be done like this! 
+          // Instead of reading the docIds in all at once, they should 
+          // get streamed to the workers on demand, if at all possible?
+          List<DocumentID> docIds = new LinkedList<DocumentID>();
           while(enumerator.hasNext()) {
             DocumentID id = enumerator.next();
             // log.info("Adding document: "+id);

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
GATE-cvs mailing list
GATE-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to