Revision: 17649
http://sourceforge.net/p/gate/code/17649
Author: markagreenwood
Date: 2014-03-13 15:04:17 +0000 (Thu, 13 Mar 2014)
Log Message:
-----------
Updated the Tika parsers to the latest version. This includes moving away from
some deprecated constants, plus a minor change in the XML handler to deal with
what appears to be an odd issue in the RTF parser that allows us to hit the
endElement method without having hit startElement. If we deal with that then
the doc opens and is readable, so ours is clearly not to wonder why.
Modified Paths:
--------------
gate/trunk/build/deploy/maven/gate-core.pom.template
gate/trunk/ivy.xml
gate/trunk/src/main/gate/SimpleAnnotation.java
gate/trunk/src/main/gate/SimpleDocument.java
gate/trunk/src/main/gate/corpora/TikaFormat.java
gate/trunk/src/main/gate/xml/XmlDocumentHandler.java
Modified: gate/trunk/build/deploy/maven/gate-core.pom.template
===================================================================
--- gate/trunk/build/deploy/maven/gate-core.pom.template 2014-03-13
11:16:47 UTC (rev 17648)
+++ gate/trunk/build/deploy/maven/gate-core.pom.template 2014-03-13
15:04:17 UTC (rev 17649)
@@ -93,7 +93,7 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
- <version>1.0</version>
+ <version>1.5</version>
<scope>runtime</scope>
<exclusions>
@@ -120,7 +120,18 @@
<groupId>asm</groupId>
<artifactId>asm</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.ow2.asm</groupId>
+ <artifactId>asm-debug-all</artifactId>
+ </exclusion>
+ <!-- we aren't using any of the GUI aspects of tika so we
don't need to be
+ able to do syntax highlighting -->
+ <exclusion>
+ <groupId>com.uwyn</groupId>
+ <artifactId>jhighlight</artifactId>
+ </exclusion>
+
<!-- we exclude boilerpipe as it isn't required for Tika given
the way we
currently configure it, and it contains a copy of
NekoHTML which causes
HTML parsing in GATE to change, and the tests to break -->
@@ -129,6 +140,34 @@
<artifactId>boilerpipe</artifactId>
</exclusion>
+ <!-- we don't handle audio files so we don't need the
following -->
+ <exclusion>
+ <groupId>org.gagravarr</groupId>
+ <artifactId>vorbis-java-tika</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.gagravarr</groupId>
+ <artifactId>vorbis-java-core</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.googlecode.mp4parser</groupId>
+ <artifactId>isoparser</artifactId>
+ </exclusion>
+
+ <!-- no idea what these are used for internally to tika but we
don't seem to need them -->
+ <exclusion>
+ <groupId>com.googlecode.juniversalchardet</groupId>
+ <artifactId>juniversalchardet</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>org.tukaani</groupId>
+ <artifactId>xz</artifactId>
+ </exclusion>
+ <exclusion>
+ <groupId>com.adobe.xmp</groupId>
+ <artifactId>xmpcore</artifactId>
+ </exclusion>
+
<!-- exclude stuff that is part of Java 6 core -->
<exclusion>
<groupId>org.apache.geronimo.specs</groupId>
Modified: gate/trunk/ivy.xml
===================================================================
--- gate/trunk/ivy.xml 2014-03-13 11:16:47 UTC (rev 17648)
+++ gate/trunk/ivy.xml 2014-03-13 15:04:17 UTC (rev 17649)
@@ -44,7 +44,7 @@
<dependency org="net.sourceforge.nekohtml" name="nekohtml" rev="1.9.14" />
<!-- we use Tika to handle most document formats other than XML, HTML and
TXT -->
- <dependency org="org.apache.tika" name="tika-parsers" rev="1.0"
conf="runtime->master,runtime" >
+ <dependency org="org.apache.tika" name="tika-parsers" rev="1.5"
conf="runtime->master,runtime" >
<!-- we don't currently use the RSS/ATOM aspects of Tika so we don't
need ROME -->
<exclude org="rome" module="rome" />
@@ -56,11 +56,26 @@
<!-- we don't use Tika to parse Java classes so we don't need ASM -->
<exclude org="asm" module="asm" />
+ <exclude org="org.ow2.asm" module="asm-debug-all" />
+
+ <!-- we aren't using any of the GUI aspects of tika so we don't need to
be
+ able to do syntax highlighting -->
+ <exclude org="com.uwyn" module="jhighlight" />
<!-- we exclude boilerpipe as it isn't required for Tika given the way we
currently configure it, and it contains a copy of NekoHTML which
causes
HTML parsing in GATE to change, and the tests to break -->
<exclude org="de.l3s.boilerpipe" module="boilerpipe" />
+
+ <!-- we don't handle audio files so we don't need the following -->
+ <exclude org="org.gagravarr" module="vorbis-java-tika" />
+ <exclude org="org.gagravarr" module="vorbis-java-core" />
+ <exclude org="com.googlecode.mp4parser" module="isoparser" />
+
+ <!-- no idea what these are used for internally to tika but we don't
seem to need them -->
+ <exclude org="com.googlecode.juniversalchardet"
module="juniversalchardet" />
+ <exclude org="org.tukaani" module="xz" />
+ <exclude org="com.adobe.xmp" module="xmpcore" />
</dependency>
<!-- We use Woodstox instead of the default Sun StAX implementation as it
seems to behave better.
Modified: gate/trunk/src/main/gate/SimpleAnnotation.java
===================================================================
--- gate/trunk/src/main/gate/SimpleAnnotation.java 2014-03-13 11:16:47 UTC
(rev 17648)
+++ gate/trunk/src/main/gate/SimpleAnnotation.java 2014-03-13 15:04:17 UTC
(rev 17649)
@@ -29,7 +29,7 @@
* SimpleAnnotation was introduced to simplify the API of annotations
*/
public interface SimpleAnnotation
-extends FeatureBearer, IdBearer, Comparable, Serializable {
+extends FeatureBearer, IdBearer, Comparable<Object>, Serializable {
/** The type of the annotation (corresponds to TIPSTER "name"). */
public String getType();
Modified: gate/trunk/src/main/gate/SimpleDocument.java
===================================================================
--- gate/trunk/src/main/gate/SimpleDocument.java 2014-03-13 11:16:47 UTC
(rev 17648)
+++ gate/trunk/src/main/gate/SimpleDocument.java 2014-03-13 15:04:17 UTC
(rev 17649)
@@ -22,7 +22,7 @@
/** Represents the commonalities between all sorts of documents.
*/
-public interface SimpleDocument extends LanguageResource, Comparable {
+public interface SimpleDocument extends LanguageResource, Comparable<Object> {
/**
* The parameter name for the document URL
Modified: gate/trunk/src/main/gate/corpora/TikaFormat.java
===================================================================
--- gate/trunk/src/main/gate/corpora/TikaFormat.java 2014-03-13 11:16:47 UTC
(rev 17648)
+++ gate/trunk/src/main/gate/corpora/TikaFormat.java 2014-03-13 15:04:17 UTC
(rev 17649)
@@ -21,6 +21,9 @@
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.SAXException;
@@ -124,17 +127,17 @@
private void setDocumentFeatures(Metadata metadata, Document doc) {
FeatureMap fmap = doc.getFeatures();
- setTikaFeature(metadata, Metadata.TITLE, fmap);
- setTikaFeature(metadata, Metadata.AUTHOR, fmap);
- setTikaFeature(metadata, Metadata.COMMENTS, fmap);
- setTikaFeature(metadata, Metadata.CREATOR, fmap);
+ setTikaFeature(metadata, TikaCoreProperties.TITLE, fmap);
+ setTikaFeature(metadata, Office.AUTHOR, fmap);
+ setTikaFeature(metadata, TikaCoreProperties.COMMENTS, fmap);
+ setTikaFeature(metadata, TikaCoreProperties.CREATOR, fmap);
if (fmap.get("AUTHORS") == null && fmap.get("AUTHOR") != null)
- fmap.put("AUTHORS", fmap.get(Metadata.AUTHOR));
+ fmap.put("AUTHORS", fmap.get(Office.AUTHOR));
fmap.put("MimeType", metadata.get(Metadata.CONTENT_TYPE));
}
- private void setTikaFeature(Metadata metadata, String key, FeatureMap fmap) {
- String value = metadata.get(key);
+ private void setTikaFeature(Metadata metadata, Property property, FeatureMap
fmap) {
+ String value = metadata.get(property);
if (value == null) {
return;
}
@@ -143,7 +146,7 @@
if (value.length() == 0) {
return;
}
- key = key.toUpperCase();
+ String key = property.getName().toUpperCase();
if (fmap.containsKey(key)) {
fmap.put("TIKA_" + key, value);
}
Modified: gate/trunk/src/main/gate/xml/XmlDocumentHandler.java
===================================================================
--- gate/trunk/src/main/gate/xml/XmlDocumentHandler.java 2014-03-13
11:16:47 UTC (rev 17648)
+++ gate/trunk/src/main/gate/xml/XmlDocumentHandler.java 2014-03-13
15:04:17 UTC (rev 17649)
@@ -368,23 +368,23 @@
// obj is for internal use
CustomObject obj = null;
- // if the stack is not empty, we extract the custom object and delete it
- if (!stack.isEmpty()) {
+ // if the stack is not empty, we extract the custom object and
+ // delete it from the stack
+ if(!stack.isEmpty()) {
obj = stack.pop();
- }// End if
- // Before adding it to the colector, we need to check if is an
- // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
- if (obj.getStart().equals(obj.getEnd())) {
- // The element had an end tag and its start was equal to its end. Hence
- // it is anEmptyAndSpan one.
- obj.getFM().put("isEmptyAndSpan", "true");
- }// End iff
+ // Before adding it to the colector, we need to check if is an
+ // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
+ if(obj.getStart().equals(obj.getEnd())) {
+ // The element had an end tag and its start was equal to its
+ // end. Hence it is anEmptyAndSpan one.
+ obj.getFM().put("isEmptyAndSpan", "true");
+ }// End iff
- // Put the object into colector
- // Later, when the document ends we will use colector to create all the
- // annotations
- colector.add(obj);
+ // Put the object into colector. Later, when the document ends
+ // we will use colector to create all the annotations
+ colector.add(obj);
+ }// End if
// if element is found on Element2String map, then add the string to the
// end of the document content
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
------------------------------------------------------------------------------
Learn Graph Databases - Download FREE O'Reilly Book
"Graph Databases" is the definitive new guide to graph databases and their
applications. Written by three acclaimed leaders in the field,
this first edition is now available. Download your free book today!
http://p.sf.net/sfu/13534_NeoTech
_______________________________________________
GATE-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/gate-cvs