Revision: 17649
          http://sourceforge.net/p/gate/code/17649
Author:   markagreenwood
Date:     2014-03-13 15:04:17 +0000 (Thu, 13 Mar 2014)
Log Message:
-----------
Updated the Tika parsers to the latest version (1.0 -> 1.5). This includes 
moving away from some deprecated constants, plus a minor change in the XML 
handler to work around what appears to be an odd issue in the RTF parser that 
allows us to hit the endElement method without having hit startElement. If we 
guard against that, the doc opens and is readable, so ours is clearly not to 
wonder why.

Modified Paths:
--------------
    gate/trunk/build/deploy/maven/gate-core.pom.template
    gate/trunk/ivy.xml
    gate/trunk/src/main/gate/SimpleAnnotation.java
    gate/trunk/src/main/gate/SimpleDocument.java
    gate/trunk/src/main/gate/corpora/TikaFormat.java
    gate/trunk/src/main/gate/xml/XmlDocumentHandler.java

Modified: gate/trunk/build/deploy/maven/gate-core.pom.template
===================================================================
--- gate/trunk/build/deploy/maven/gate-core.pom.template        2014-03-13 
11:16:47 UTC (rev 17648)
+++ gate/trunk/build/deploy/maven/gate-core.pom.template        2014-03-13 
15:04:17 UTC (rev 17649)
@@ -93,7 +93,7 @@
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-parsers</artifactId>
-            <version>1.0</version>
+            <version>1.5</version>
             <scope>runtime</scope>
 
             <exclusions>
@@ -120,7 +120,18 @@
                     <groupId>asm</groupId>
                     <artifactId>asm</artifactId>
                 </exclusion>
+                <exclusion>
+                    <groupId>org.ow2.asm</groupId>
+                    <artifactId>asm-debug-all</artifactId>
+                </exclusion>
 
+                <!-- we aren't using any of the GUI aspects of tika so we 
don't need to be
+                     able to do syntax highlighting -->
+                <exclusion>
+                    <groupId>com.uwyn</groupId>
+                    <artifactId>jhighlight</artifactId>
+                </exclusion>
+
                 <!-- we exclude boilerpipe as it isn't required for Tika given 
the way we
                      currently configure it, and it contains a copy of 
NekoHTML which causes
                      HTML parsing in GATE to change, and the tests to break -->
@@ -129,6 +140,34 @@
                     <artifactId>boilerpipe</artifactId>
                 </exclusion>
 
+                <!-- we don't handle audio files so we don't need the 
following -->
+                <exclusion>
+                   <groupId>org.gagravarr</groupId>
+                    <artifactId>vorbis-java-tika</artifactId>
+                </exclusion>
+                <exclusion>
+                   <groupId>org.gagravarr</groupId>
+                    <artifactId>vorbis-java-core</artifactId>
+                </exclusion>
+                <exclusion>
+                   <groupId>com.googlecode.mp4parser</groupId>
+                    <artifactId>isoparser</artifactId>
+                </exclusion>
+
+                <!-- no idea what these are used for internally to tika but we 
don't seem to need them -->
+                <exclusion>
+                   <groupId>com.googlecode.juniversalchardet</groupId>
+                    <artifactId>juniversalchardet</artifactId>
+                </exclusion>
+                <exclusion>
+                   <groupId>org.tukaani</groupId>
+                    <artifactId>xz</artifactId>
+                </exclusion>
+                <exclusion>
+                   <groupId>com.adobe.xmp</groupId>
+                    <artifactId>xmpcore</artifactId>
+                </exclusion>
+
                 <!-- exclude stuff that is part of Java 6 core -->
                 <exclusion>
                     <groupId>org.apache.geronimo.specs</groupId>

Modified: gate/trunk/ivy.xml
===================================================================
--- gate/trunk/ivy.xml  2014-03-13 11:16:47 UTC (rev 17648)
+++ gate/trunk/ivy.xml  2014-03-13 15:04:17 UTC (rev 17649)
@@ -44,7 +44,7 @@
     <dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.14" />
 
     <!-- we use Tika to handle most document formats other than XML, HTML and 
TXT -->
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.0" 
conf="runtime->master,runtime" >
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.5" 
conf="runtime->master,runtime" >
       <!-- we don't currently use the RSS/ATOM aspects of Tika so we don't 
need ROME -->
       <exclude org="rome" module="rome" />
 
@@ -56,11 +56,26 @@
 
       <!-- we don't use Tika to parse Java classes so we don't need ASM -->
       <exclude org="asm" module="asm" />
+      <exclude org="org.ow2.asm" module="asm-debug-all" />
+      
+      <!-- we aren't using any of the GUI aspects of tika so we don't need to 
be
+           able to do syntax highlighting -->
+      <exclude org="com.uwyn" module="jhighlight" />
 
       <!-- we exclude boilerpipe as it isn't required for Tika given the way we
            currently configure it, and it contains a copy of NekoHTML which 
causes
            HTML parsing in GATE to change, and the tests to break -->
       <exclude org="de.l3s.boilerpipe" module="boilerpipe" />
+      
+      <!-- we don't handle audio files so we don't need the following -->
+      <exclude org="org.gagravarr" module="vorbis-java-tika" />
+      <exclude org="org.gagravarr" module="vorbis-java-core" />
+      <exclude org="com.googlecode.mp4parser" module="isoparser" />
+      
+      <!-- no idea what these are used for internally to tika but we don't 
seem to need them -->
+      <exclude org="com.googlecode.juniversalchardet" 
module="juniversalchardet" />
+      <exclude org="org.tukaani" module="xz" />
+      <exclude org="com.adobe.xmp" module="xmpcore" />
     </dependency>
 
     <!-- We use Woodstox instead of the default Sun StAX implementation as it 
seems to behave better.

Modified: gate/trunk/src/main/gate/SimpleAnnotation.java
===================================================================
--- gate/trunk/src/main/gate/SimpleAnnotation.java      2014-03-13 11:16:47 UTC 
(rev 17648)
+++ gate/trunk/src/main/gate/SimpleAnnotation.java      2014-03-13 15:04:17 UTC 
(rev 17649)
@@ -29,7 +29,7 @@
   * SimpleAnnotation was introduced to simplify the API of annotations
   */
 public interface SimpleAnnotation
-extends FeatureBearer, IdBearer, Comparable, Serializable {
+extends FeatureBearer, IdBearer, Comparable<Object>, Serializable {
 
   /** The type of the annotation (corresponds to TIPSTER "name"). */
   public String getType();

Modified: gate/trunk/src/main/gate/SimpleDocument.java
===================================================================
--- gate/trunk/src/main/gate/SimpleDocument.java        2014-03-13 11:16:47 UTC 
(rev 17648)
+++ gate/trunk/src/main/gate/SimpleDocument.java        2014-03-13 15:04:17 UTC 
(rev 17649)
@@ -22,7 +22,7 @@
 
 /** Represents the commonalities between all sorts of documents.
  */
-public interface SimpleDocument extends LanguageResource, Comparable {
+public interface SimpleDocument extends LanguageResource, Comparable<Object> {
 
   /**
    * The parameter name for the document URL

Modified: gate/trunk/src/main/gate/corpora/TikaFormat.java
===================================================================
--- gate/trunk/src/main/gate/corpora/TikaFormat.java    2014-03-13 11:16:47 UTC 
(rev 17648)
+++ gate/trunk/src/main/gate/corpora/TikaFormat.java    2014-03-13 15:04:17 UTC 
(rev 17649)
@@ -21,6 +21,9 @@
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.xml.sax.SAXException;
@@ -124,17 +127,17 @@
 
   private void setDocumentFeatures(Metadata metadata, Document doc) {
     FeatureMap fmap = doc.getFeatures();
-    setTikaFeature(metadata, Metadata.TITLE, fmap);
-    setTikaFeature(metadata, Metadata.AUTHOR, fmap);
-    setTikaFeature(metadata, Metadata.COMMENTS, fmap);
-    setTikaFeature(metadata, Metadata.CREATOR, fmap);
+    setTikaFeature(metadata, TikaCoreProperties.TITLE, fmap);
+    setTikaFeature(metadata, Office.AUTHOR, fmap);
+    setTikaFeature(metadata, TikaCoreProperties.COMMENTS, fmap);
+    setTikaFeature(metadata, TikaCoreProperties.CREATOR, fmap);
     if (fmap.get("AUTHORS") == null && fmap.get("AUTHOR") != null)
-      fmap.put("AUTHORS", fmap.get(Metadata.AUTHOR));
+      fmap.put("AUTHORS", fmap.get(Office.AUTHOR));
     fmap.put("MimeType", metadata.get(Metadata.CONTENT_TYPE));
   }
 
-  private void setTikaFeature(Metadata metadata, String key, FeatureMap fmap) {
-    String value = metadata.get(key);
+  private void setTikaFeature(Metadata metadata, Property property, FeatureMap 
fmap) {
+    String value = metadata.get(property);
     if (value == null) {
       return;
     }
@@ -143,7 +146,7 @@
     if (value.length() == 0) {
       return;
     }
-    key = key.toUpperCase();
+    String key = property.getName().toUpperCase();
     if (fmap.containsKey(key)) {
       fmap.put("TIKA_" + key, value);
     }

Modified: gate/trunk/src/main/gate/xml/XmlDocumentHandler.java
===================================================================
--- gate/trunk/src/main/gate/xml/XmlDocumentHandler.java        2014-03-13 
11:16:47 UTC (rev 17648)
+++ gate/trunk/src/main/gate/xml/XmlDocumentHandler.java        2014-03-13 
15:04:17 UTC (rev 17649)
@@ -368,23 +368,23 @@
     // obj is for internal use
     CustomObject obj = null;
 
-    // if the stack is not empty, we extract the custom object and delete it
-    if (!stack.isEmpty()) {
+    // if the stack is not empty, we extract the custom object and
+    // delete it from the stack
+    if(!stack.isEmpty()) {
       obj = stack.pop();
-    }// End if
 
-    // Before adding it to the colector, we need to check if is an
-    // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
-    if (obj.getStart().equals(obj.getEnd())) {
-      // The element had an end tag and its start was equal to its end. Hence
-      // it is anEmptyAndSpan one.
-      obj.getFM().put("isEmptyAndSpan", "true");
-    }// End iff
+      // Before adding it to the colector, we need to check if is an
+      // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
+      if(obj.getStart().equals(obj.getEnd())) {
+        // The element had an end tag and its start was equal to its
+        // end. Hence it is anEmptyAndSpan one.
+        obj.getFM().put("isEmptyAndSpan", "true");
+      }// End iff
 
-    // Put the object into colector
-    // Later, when the document ends we will use colector to create all the
-    // annotations
-    colector.add(obj);
+      // Put the object into colector. Later, when the document ends
+      // we will use colector to create all the annotations
+      colector.add(obj);
+    }// End if
 
     // if element is found on Element2String map, then add the string to the
     // end of the document content

This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.


------------------------------------------------------------------------------
Learn Graph Databases - Download FREE O'Reilly Book
"Graph Databases" is the definitive new guide to graph databases and their
applications. Written by three acclaimed leaders in the field,
this first edition is now available. Download your free book today!
http://p.sf.net/sfu/13534_NeoTech
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs

Reply via email to