This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 847e19d NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6 (#717)
847e19d is described below
commit 847e19d984503d333fd8fdd430fe347dd370dc4c
Author: Lewis John McGibbney <[email protected]>
AuthorDate: Sat Jan 15 15:24:21 2022 -0800
NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6 (#717)
* NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6
---
ivy/ivy.xml | 2 +-
src/plugin/any23/ivy.xml | 2 +-
src/plugin/any23/plugin.xml | 272 ++++++++++-----------
.../apache/nutch/any23/Any23IndexingFilter.java | 2 +-
.../org/apache/nutch/any23/Any23ParseFilter.java | 35 +--
src/plugin/build-plugin.xml | 3 +-
src/plugin/language-identifier/ivy.xml | 2 +-
src/plugin/language-identifier/plugin.xml | 6 +-
src/plugin/parse-tika/howto_upgrade_tika.txt | 5 +-
src/plugin/parse-tika/ivy.xml | 2 +-
src/plugin/parse-tika/plugin.xml | 70 +++---
.../org/apache/nutch/parse/tika/TestRTFParser.java | 4 +-
12 files changed, 192 insertions(+), 213 deletions(-)
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 8d154bf..34e298f 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -63,7 +63,7 @@
<dependency org="org.apache.hadoop"
name="hadoop-mapreduce-client-jobclient" rev="3.1.3" conf="*->default" />
<!-- End of Hadoop Dependencies -->
- <dependency org="org.apache.tika" name="tika-core" rev="2.1.0"
/>
+ <dependency org="org.apache.tika" name="tika-core" rev="2.2.1"
/>
<dependency org="xml-apis" name="xml-apis" rev="1.4.01" /><!--
force this version as it is required by Tika -->
<dependency org="xerces" name="xercesImpl" rev="2.12.1" />
diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml
index a5a0077..7220a25 100644
--- a/src/plugin/any23/ivy.xml
+++ b/src/plugin/any23/ivy.xml
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="org.apache.any23" name="apache-any23-core" rev="2.5"
conf="*->default">
+ <dependency org="org.apache.any23" name="apache-any23-core" rev="2.6"
conf="*->default">
<exclude org="org.apache.commons" name="commons-lang" />
<exclude org="org.apache.commons" name="commons-compress" />
<exclude org="org.slf4j" name="slf4j-log4j12" />
diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml
index cc941b2..40a42c7 100644
--- a/src/plugin/any23/plugin.xml
+++ b/src/plugin/any23/plugin.xml
@@ -26,194 +26,168 @@
<export name="*"/>
</library>
<!-- Begin Any23 dependencies -->
- <library name="FastInfoset-1.2.16.jar"/>
- <library name="HikariCP-java7-2.4.13.jar"/>
<library name="SparseBitSet-1.2.jar"/>
- <library name="apache-any23-api-2.5.jar"/>
- <library name="apache-any23-core-2.5.jar"/>
- <library name="apache-any23-csvutils-2.5.jar"/>
- <library name="apache-any23-encoding-2.5.jar"/>
- <library name="apache-any23-mime-2.5.jar"/>
- <library name="apache-mime4j-core-0.8.3.jar"/>
- <library name="apache-mime4j-dom-0.8.3.jar"/>
- <library name="asm-7.3.1.jar"/>
- <library name="bcmail-jdk15on-1.64.jar"/>
- <library name="bcpkix-jdk15on-1.64.jar"/>
- <library name="bcprov-jdk15on-1.64.jar"/>
- <library name="biweekly-0.6.3.jar"/>
+ <library name="apache-any23-api-2.6.jar"/>
+ <library name="apache-any23-core-2.6.jar"/>
+ <library name="apache-any23-csvutils-2.6.jar"/>
+ <library name="apache-any23-encoding-2.6.jar"/>
+ <library name="apache-any23-mime-2.6.jar"/>
+ <library name="apache-mime4j-core-0.8.4.jar"/>
+ <library name="apache-mime4j-dom-0.8.4.jar"/>
+ <library name="asm-9.2.jar"/>
+ <library name="bcmail-jdk15on-1.70.jar"/>
+ <library name="bcpkix-jdk15on-1.70.jar"/>
+ <library name="bcutil-jdk15on-1.70.jar"/>
+ <library name="biweekly-0.6.6.jar"/>
<library name="boilerpipe-1.1.0.jar"/>
- <library name="bzip2-0.9.1.jar"/>
- <library name="c3p0-0.9.5.5.jar"/>
- <library name="caffeine-2.6.1.jar"/>
- <library name="cdm-4.5.5.jar"/>
- <library name="checker-qual-2.10.0.jar"/>
- <library name="commons-codec-1.14.jar"/>
+ <library name="caffeine-2.8.1.jar"/>
+ <library name="checker-qual-3.1.0.jar"/>
+ <library name="commons-codec-1.15.jar"/>
<library name="commons-collections4-4.4.jar"/>
- <library name="commons-csv-1.8.jar"/>
+ <library name="commons-csv-1.9.0.jar"/>
<library name="commons-exec-1.3.jar"/>
- <library name="commons-io-2.6.jar"/>
- <library name="commons-lang3-3.10.jar"/>
+ <library name="commons-io-2.11.0.jar"/>
+ <library name="commons-lang3-3.12.0.jar"/>
<library name="commons-logging-1.2.jar"/>
<library name="commons-math3-3.6.1.jar"/>
<library name="commons-rdf-api-0.5.0.jar"/>
+ <library name="commons-text-1.9.jar"/>
<library name="curvesapi-1.06.jar"/>
- <library name="cxf-core-3.3.5.jar"/>
- <library name="cxf-rt-frontend-jaxrs-3.3.5.jar"/>
- <library name="cxf-rt-rs-client-3.3.5.jar"/>
- <library name="cxf-rt-security-3.3.5.jar"/>
- <library name="cxf-rt-transports-http-3.3.5.jar"/>
+ <library name="dd-plist-1.23.jar"/>
<library name="dec-0.1.2.jar"/>
- <library name="ehcache-core-2.6.2.jar"/>
<library name="error_prone_annotations-2.3.4.jar"/>
<library name="f8-1.1.jar"/>
<library name="failureaccess-1.0.1.jar"/>
- <library name="fluent-hc-4.5.10.jar"/>
- <library name="fontbox-2.0.19.jar"/>
- <library name="geoapi-3.0.1.jar"/>
- <library name="geronimo-jta_1.1_spec-1.1.1.jar"/>
- <library name="geronimo-ws-metadata_2.0_spec-1.1.3.jar"/>
- <library name="grib-4.5.5.jar"/>
- <library name="gson-2.8.6.jar"/>
- <library name="guava-28.2-jre.jar"/>
+ <library name="fluent-hc-4.5.13.jar"/>
+ <library name="fontbox-2.0.25.jar"/>
+ <library name="guava-30.1.1-jre.jar"/>
<library name="hppcrt-0.7.5.jar"/>
- <library name="httpclient-4.5.12.jar"/>
- <library name="httpclient-cache-4.5.12.jar"/>
- <library name="httpclient-osgi-4.5.12.jar"/>
- <library name="httpcore-4.4.13.jar"/>
- <library name="httpcore-nio-4.4.12.jar"/>
- <library name="httpcore-osgi-4.4.13.jar"/>
- <library name="httpmime-4.5.12.jar"/>
- <library name="httpservices-4.5.5.jar"/>
- <library name="isoparser-1.1.22.jar"/>
- <library name="istack-commons-runtime-3.0.8.jar"/>
- <library name="j2objc-annotations-1.3.jar"/>
- <library name="jackcess-3.0.1.jar"/>
- <library name="jackcess-encrypt-3.0.0.jar"/>
- <library name="jackson-annotations-2.12.2.jar"/>
- <library name="jackson-core-2.12.2.jar"/>
- <library name="jackson-databind-2.12.2.jar"/>
- <library name="jacorb-omgapi-3.9.jar"/>
+ <library name="httpclient-4.5.13.jar"/>
+ <library name="httpclient-cache-4.5.13.jar"/>
+ <library name="httpclient-osgi-4.5.13.jar"/>
+ <library name="httpcore-4.4.15.jar"/>
+ <library name="httpcore-nio-4.4.14.jar"/>
+ <library name="httpcore-osgi-4.4.14.jar"/>
+ <library name="httpmime-4.5.13.jar"/>
+ <library name="istack-commons-runtime-3.0.12.jar"/>
+ <library name="jackcess-4.0.1.jar"/>
+ <library name="jackcess-encrypt-4.0.1.jar"/>
+ <library name="jackson-annotations-2.11.4.jar"/>
+ <library name="jackson-core-2.12.1.jar"/>
+ <library name="jackson-databind-2.11.4.jar"/>
<library name="jai-imageio-core-1.4.0.jar"/>
- <library name="jakarta.activation-1.2.1.jar"/>
- <library name="jakarta.activation-api-1.2.1.jar"/>
- <library name="jakarta.ws.rs-api-2.1.5.jar"/>
- <library name="jakarta.xml.bind-api-2.3.2.jar"/>
+ <library name="jakarta.activation-1.2.2.jar"/>
+ <library name="jakarta.xml.bind-api-2.3.3.jar"/>
<library name="java-libpst-0.9.3.jar"/>
- <library name="javax.activation-1.2.0.jar"/>
<library name="javax.activation-api-1.2.0.jar"/>
- <library name="javax.annotation-api-1.3.2.jar"/>
<library name="javax.inject-1.jar"/>
- <library name="javax.xml.soap-api-1.4.0.jar"/>
<library name="jaxb-api-2.3.1.jar"/>
- <library name="jaxb-runtime-2.3.2.jar"/>
- <library name="jaxws-api-2.3.1.jar"/>
+ <library name="jaxb-runtime-2.3.5.jar"/>
<library name="jbig2-imageio-3.0.3.jar"/>
- <library name="jboss-rmi-api_1.0_spec-1.0.6.Final.jar"/>
- <library name="jcip-annotations-1.0.jar"/>
- <library name="jcl-over-slf4j-1.7.30.jar"/>
- <library name="jcommander-1.78.jar"/>
- <library name="jdom2-2.0.6.jar"/>
+ <library name="jcl-over-slf4j-1.7.32.jar"/>
+ <library name="jcommander-1.81.jar"/>
+ <library name="jdom2-2.0.6.1.jar"/>
<library name="jempbox-1.8.16.jar"/>
<library name="jhighlight-1.0.3.jar"/>
<library name="jmatio-1.5.jar"/>
- <library name="jna-5.5.0.jar"/>
- <library name="joda-time-2.2.jar"/>
- <library name="json-simple-1.1.1.jar"/>
- <library name="jsonld-java-0.13.2.jar"/>
- <library name="jsoup-1.13.1.jar"/>
+ <library name="jsonld-java-0.13.4.jar"/>
+ <library name="jsoup-1.14.3.jar"/>
<library name="jsr305-3.0.2.jar"/>
- <library name="jul-to-slf4j-1.7.30.jar"/>
<library name="juniversalchardet-1.0.3.jar"/>
- <library name="junrar-4.0.0.jar"/>
+ <library name="junrar-7.4.0.jar"/>
<library
name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/>
+ <library name="log4j-core-2.17.1.jar"/>
+ <library name="log4j-slf4j-impl-2.17.1.jar"/>
<library name="mapdb-1.0.8.jar"/>
- <library name="mchange-commons-java-0.2.19.jar"/>
- <library name="metadata-extractor-2.13.0.jar"/>
- <library name="mimepull-1.9.7.jar"/>
- <library name="netcdf4-4.5.5.jar"/>
- <library name="openjson-1.0.11.jar"/>
- <library name="opennlp-tools-1.9.2.jar"/>
- <library name="owlapi-api-5.1.13.jar"/>
- <library name="owlapi-apibinding-5.1.13.jar"/>
- <library name="owlapi-impl-5.1.13.jar"/>
- <library name="owlapi-oboformat-5.1.13.jar"/>
- <library name="owlapi-parsers-5.1.13.jar"/>
- <library name="owlapi-rio-5.1.13.jar"/>
- <library name="owlapi-tools-5.1.13.jar"/>
- <library name="parso-2.0.11.jar"/>
- <library name="pdfbox-2.0.19.jar"/>
- <library name="pdfbox-tools-2.0.19.jar"/>
+ <library name="metadata-extractor-2.16.0.jar"/>
+ <library name="owlapi-api-5.1.19.jar"/>
+ <library name="owlapi-apibinding-5.1.19.jar"/>
+ <library name="owlapi-impl-5.1.19.jar"/>
+ <library name="owlapi-oboformat-5.1.19.jar"/>
+ <library name="owlapi-parsers-5.1.19.jar"/>
+ <library name="owlapi-rio-5.1.19.jar"/>
+ <library name="owlapi-tools-5.1.19.jar"/>
+ <library name="parso-2.0.14.jar"/>
+ <library name="pdfbox-2.0.25.jar"/>
+ <library name="pdfbox-debugger-2.0.25.jar"/>
+ <library name="pdfbox-tools-2.0.25.jar"/>
<library name="poi-4.1.2.jar"/>
<library name="poi-ooxml-4.1.2.jar"/>
<library name="poi-ooxml-schemas-4.1.2.jar"/>
<library name="poi-scratchpad-4.1.2.jar"/>
- <library name="preflight-2.0.19.jar"/>
- <library name="protobuf-java-3.11.4.jar"/>
- <library name="quartz-2.3.2.jar"/>
- <library name="rdf4j-http-client-3.1.2.jar"/>
- <library name="rdf4j-http-protocol-3.1.2.jar"/>
- <library name="rdf4j-model-3.1.2.jar"/>
- <library name="rdf4j-query-3.1.2.jar"/>
- <library name="rdf4j-queryalgebra-evaluation-3.1.2.jar"/>
- <library name="rdf4j-queryalgebra-model-3.1.2.jar"/>
- <library name="rdf4j-queryparser-api-3.1.2.jar"/>
- <library name="rdf4j-queryparser-sparql-3.1.2.jar"/>
- <library name="rdf4j-queryresultio-api-3.1.2.jar"/>
- <library name="rdf4j-queryresultio-binary-3.1.2.jar"/>
- <library name="rdf4j-queryresultio-sparqlxml-3.1.2.jar"/>
- <library name="rdf4j-repository-api-3.1.2.jar"/>
- <library name="rdf4j-repository-sail-3.1.2.jar"/>
- <library name="rdf4j-repository-sparql-3.1.2.jar"/>
- <library name="rdf4j-rio-api-3.1.2.jar"/>
- <library name="rdf4j-rio-binary-3.1.2.jar"/>
- <library name="rdf4j-rio-datatypes-3.1.2.jar"/>
- <library name="rdf4j-rio-jsonld-3.1.2.jar"/>
- <library name="rdf4j-rio-languages-3.1.2.jar"/>
- <library name="rdf4j-rio-n3-3.1.2.jar"/>
- <library name="rdf4j-rio-nquads-3.1.2.jar"/>
- <library name="rdf4j-rio-ntriples-3.1.2.jar"/>
- <library name="rdf4j-rio-rdfjson-3.1.2.jar"/>
- <library name="rdf4j-rio-rdfxml-3.1.2.jar"/>
- <library name="rdf4j-rio-trig-3.1.2.jar"/>
- <library name="rdf4j-rio-trix-3.1.2.jar"/>
- <library name="rdf4j-rio-turtle-3.1.2.jar"/>
- <library name="rdf4j-sail-api-3.1.2.jar"/>
- <library name="rdf4j-sail-base-3.1.2.jar"/>
- <library name="rdf4j-sail-memory-3.1.2.jar"/>
- <library name="rdf4j-util-3.1.2.jar"/>
- <library name="rome-1.12.2.jar"/>
- <library name="rome-utils-1.12.2.jar"/>
- <library name="saaj-impl-1.4.0-b03.jar"/>
+ <library name="rdf4j-http-client-3.7.4.jar"/>
+ <library name="rdf4j-http-protocol-3.7.4.jar"/>
+ <library name="rdf4j-model-3.7.4.jar"/>
+ <library name="rdf4j-model-api-3.7.4.jar"/>
+ <library name="rdf4j-model-vocabulary-3.7.4.jar"/>
+ <library name="rdf4j-query-3.7.4.jar"/>
+ <library name="rdf4j-queryalgebra-evaluation-3.7.4.jar"/>
+ <library name="rdf4j-queryalgebra-model-3.7.4.jar"/>
+ <library name="rdf4j-queryparser-api-3.7.4.jar"/>
+ <library name="rdf4j-queryparser-sparql-3.7.4.jar"/>
+ <library name="rdf4j-queryresultio-api-3.7.4.jar"/>
+ <library name="rdf4j-queryresultio-binary-3.7.4.jar"/>
+ <library name="rdf4j-queryresultio-sparqlxml-3.7.4.jar"/>
+ <library name="rdf4j-repository-api-3.7.4.jar"/>
+ <library name="rdf4j-repository-sail-3.7.4.jar"/>
+ <library name="rdf4j-repository-sparql-3.7.4.jar"/>
+ <library name="rdf4j-rio-api-3.7.4.jar"/>
+ <library name="rdf4j-rio-binary-3.7.4.jar"/>
+ <library name="rdf4j-rio-datatypes-3.7.4.jar"/>
+ <library name="rdf4j-rio-hdt-3.4.3.jar"/>
+ <library name="rdf4j-rio-jsonld-3.7.4.jar"/>
+ <library name="rdf4j-rio-languages-3.7.4.jar"/>
+ <library name="rdf4j-rio-n3-3.7.4.jar"/>
+ <library name="rdf4j-rio-nquads-3.7.4.jar"/>
+ <library name="rdf4j-rio-ntriples-3.7.4.jar"/>
+ <library name="rdf4j-rio-rdfjson-3.7.4.jar"/>
+ <library name="rdf4j-rio-rdfxml-3.7.4.jar"/>
+ <library name="rdf4j-rio-trig-3.7.4.jar"/>
+ <library name="rdf4j-rio-trix-3.7.4.jar"/>
+ <library name="rdf4j-rio-turtle-3.7.4.jar"/>
+ <library name="rdf4j-sail-api-3.7.4.jar"/>
+ <library name="rdf4j-sail-base-3.7.4.jar"/>
+ <library name="rdf4j-sail-memory-3.7.4.jar"/>
+ <library name="rdf4j-util-3.7.4.jar"/>
+ <library name="rome-1.16.0.jar"/>
+ <library name="rome-utils-1.16.0.jar"/>
<library name="semargl-core-0.7.jar"/>
<library name="semargl-rdf-0.7.jar"/>
<library name="semargl-rdf4j-0.7.jar"/>
<library name="semargl-rdfa-0.7.jar"/>
- <library name="sentiment-analysis-parser-0.1.jar"/>
- <library name="sis-feature-1.0.jar"/>
- <library name="sis-metadata-1.0.jar"/>
- <library name="sis-netcdf-1.0.jar"/>
- <library name="sis-referencing-1.0.jar"/>
- <library name="sis-storage-1.0.jar"/>
- <library name="sis-utility-1.0.jar"/>
- <library name="snakeyaml-1.26.jar"/>
- <library name="stax-ex-1.8.2.jar"/>
- <library name="stax2-api-3.1.4.jar"/>
+ <library name="snakeyaml-1.30.jar"/>
<library name="tagsoup-1.2.1.jar"/>
- <library name="tika-core-1.24.jar"/>
- <library name="tika-parsers-1.24.jar"/>
- <library name="txw2-2.3.2.jar"/>
- <library name="udunits-4.5.5.jar"/>
- <library name="unit-api-1.0.jar"/>
+ <library name="tika-core-2.2.1.jar"/>
+ <library name="tika-parser-apple-module-2.2.1.jar"/>
+ <library name="tika-parser-audiovideo-module-2.2.1.jar"/>
+ <library name="tika-parser-cad-module-2.2.1.jar"/>
+ <library name="tika-parser-code-module-2.2.1.jar"/>
+ <library name="tika-parser-crypto-module-2.2.1.jar"/>
+ <library name="tika-parser-digest-commons-2.2.1.jar"/>
+ <library name="tika-parser-font-module-2.2.1.jar"/>
+ <library name="tika-parser-html-commons-2.2.1.jar"/>
+ <library name="tika-parser-html-module-2.2.1.jar"/>
+ <library name="tika-parser-image-module-2.2.1.jar"/>
+ <library name="tika-parser-mail-commons-2.2.1.jar"/>
+ <library name="tika-parser-mail-module-2.2.1.jar"/>
+ <library name="tika-parser-microsoft-module-2.2.1.jar"/>
+ <library name="tika-parser-miscoffice-module-2.2.1.jar"/>
+ <library name="tika-parser-news-module-2.2.1.jar"/>
+ <library name="tika-parser-ocr-module-2.2.1.jar"/>
+ <library name="tika-parser-pdf-module-2.2.1.jar"/>
+ <library name="tika-parser-pkg-module-2.2.1.jar"/>
+ <library name="tika-parser-text-module-2.2.1.jar"/>
+ <library name="tika-parser-xml-module-2.2.1.jar"/>
+ <library name="tika-parser-xmp-commons-2.2.1.jar"/>
+ <library name="tika-parser-zip-commons-2.2.1.jar"/>
+ <library name="tika-parsers-standard-package-2.2.1.jar"/>
+ <library name="txw2-2.3.5.jar"/>
<library name="vinnie-2.0.2.jar"/>
<library name="vorbis-java-core-0.8.jar"/>
<library name="vorbis-java-tika-0.8.jar"/>
- <library name="woodstox-core-5.0.3.jar"/>
<library name="xmlbeans-3.1.0.jar"/>
- <library name="xmlschema-core-2.2.5.jar"/>
- <library name="xmpbox-2.0.19.jar"/>
- <library name="xmpcore-6.1.10.jar"/>
- <library name="xmpcore-shaded-6.1.10.jar"/>
- <library name="xz-1.8.jar"/>
+ <library name="xmpcore-6.1.11.jar"/>
+ <library name="xz-1.9.jar"/>
<!-- End Any23 dependencies -->
</runtime>
diff --git
a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
index c0f1d6f..09dc32e 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
@@ -106,7 +106,7 @@ public class Any23IndexingFilter implements IndexingFilter {
return doc;
}
- private String keyToShortKey(String key) {
+ private static String keyToShortKey(String key) {
if (key.startsWith("<") && key.endsWith(">")) {
key = key.substring(1, key.length() - 1);
}
diff --git
a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index af7f135..bed659f 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -20,13 +20,17 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
import java.util.Set;
import java.util.TreeSet;
-import java.util.Collections;
-import java.util.Arrays;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.filter.IgnoreAccidentalRDFa;
+import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
+import org.apache.any23.mime.TikaMIMETypeDetector;
+import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.any23.writer.BenchmarkTripleHandler;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
@@ -76,7 +80,7 @@ public class Any23ParseFilter implements HtmlParseFilter {
Set<String> triples = null;
Any23Parser(String url, String htmlContent, String contentType, String...
extractorNames) throws TripleHandlerException {
- triples = new TreeSet<>();
+ this.triples = new TreeSet<>();
try {
parse(url, htmlContent, contentType, extractorNames);
} catch (URISyntaxException e) {
@@ -91,33 +95,32 @@ public class Any23ParseFilter implements HtmlParseFilter {
* Returns the {@link java.util.Set} containing the triples collected while parsing.
* @return a {@link java.util.Set} of triples.
*/
- private Set<String> getTriples() {
- return triples;
+ Set<String> getTriples() {
+ return this.triples;
}
- private void parse(String url, String htmlContent, String contentType,
String... extractorNames) throws URISyntaxException, IOException,
TripleHandlerException {
+ private void parse(String url, String htmlContent, String contentType,
String... extractorNames)
+ throws URISyntaxException, IOException, TripleHandlerException {
Any23 any23 = new Any23(extractorNames);
- any23.setMIMETypeDetector(null);
+ any23.setMIMETypeDetector(new TikaMIMETypeDetector(new
WhiteSpacesPurifier()));
ByteArrayOutputStream baos = new ByteArrayOutputStream();
- try {
- TripleHandler tHandler = new NTriplesWriter(baos);
- BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
+ try (TripleHandler tHandler = new IgnoreTitlesOfEmptyDocuments(
+ new IgnoreAccidentalRDFa(
+ new NTriplesWriter(baos)));
+ BenchmarkTripleHandler bHandler = new
BenchmarkTripleHandler(tHandler)) {
try {
any23.extract(htmlContent, url, contentType, "UTF-8", bHandler);
} catch (IOException e) {
LOG.error("Error while reading the source", e);
} catch (ExtractionException e) {
LOG.error("Error while extracting structured data", e);
- } finally {
- tHandler.close();
- bHandler.close();
}
LOG.debug("Any23 BenchmarkTripleHandler.report: " + bHandler.report());
String n3 = baos.toString("UTF-8");
String[] triplesStrings = n3.split("\n");
- Collections.addAll(triples, triplesStrings);
+ Collections.addAll(this.triples, triplesStrings);
} catch (IOException e) {
LOG.error("Unexpected IOException", e);
}
@@ -139,8 +142,8 @@ public class Any23ParseFilter implements HtmlParseFilter {
*/
@Override
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
- String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF,
"html-head-meta");
- String[] supportedContentTypes =
conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html",
"application/xhtml+xml");
+ String[] extractorNames = this.conf.getStrings(ANY_23_EXTRACTORS_CONF,
"html-head-meta");
+ String[] supportedContentTypes =
this.conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html",
"application/xhtml+xml");
String contentType = content.getContentType();
if (supportedContentTypes != null &&
!Arrays.asList(supportedContentTypes).contains(contentType)) {
LOG.debug("Ignoring document at {} because it has an unsupported
Content-Type {}", content.getUrl(), contentType);
diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index 814123e..f6e87e8 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -80,7 +80,8 @@
</fileset>
<!-- global test dependencies -->
<fileset dir="${nutch.root}/build/test/lib">
- <include name="*.jar" />
+ <include name="junit*.jar" />
+ <include name="hamcrest*.jar" />
</fileset>
<path refid="classpath"/>
</path>
diff --git a/src/plugin/language-identifier/ivy.xml
b/src/plugin/language-identifier/ivy.xml
index e7bf2e6..feba0bb 100644
--- a/src/plugin/language-identifier/ivy.xml
+++ b/src/plugin/language-identifier/ivy.xml
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="org.apache.tika" name="tika-langdetect-optimaize"
rev="2.1.0" conf="*->default">
+ <dependency org="org.apache.tika" name="tika-langdetect-optimaize"
rev="2.2.1" conf="*->default">
<!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
<exclude org="org.apache.tika" name="tika-core" />
<exclude org="com.google.guava" name="guava" />
diff --git a/src/plugin/language-identifier/plugin.xml
b/src/plugin/language-identifier/plugin.xml
index 85fa121..360b81a 100644
--- a/src/plugin/language-identifier/plugin.xml
+++ b/src/plugin/language-identifier/plugin.xml
@@ -27,15 +27,15 @@
</library>
<!-- dependencies of Tika's Optimaize language detector
(tika-langdetect-optimaize) -->
<library name="annotations-12.0.jar"/>
- <library name="checker-qual-3.8.0.jar"/>
- <library name="error_prone_annotations-2.5.1.jar"/>
+ <library name="checker-qual-3.12.0.jar"/>
+ <library name="error_prone_annotations-2.7.1.jar"/>
<library name="failureaccess-1.0.1.jar"/>
<library name="j2objc-annotations-1.3.jar"/>
<library name="jsonic-1.2.11.jar"/>
<library name="jsr305-3.0.2.jar"/>
<library name="language-detector-0.6.jar"/>
<library
name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/>
- <library name="tika-langdetect-optimaize-2.1.0.jar"/>
+ <library name="tika-langdetect-optimaize-2.2.1.jar"/>
</runtime>
<requires>
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt
b/src/plugin/parse-tika/howto_upgrade_tika.txt
index 697750e..cb3ed6b 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -23,8 +23,7 @@
(eventually with different versions)
- duplicated libs can be added to the exclusions of transitive dependencies
in
build/plugins/parse-tika/ivy.xml
- - but it should be made sure that the library versions in ivy/ivy.xml
correspond to
- those required by Tika
+ - but the library versions in ivy/ivy.xml MUST correspond to those required
by Tika
5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/:
@@ -34,6 +33,8 @@
$ cd ../language-identifier/
+Note that Any23 also depends on Tika, so you should also check that there are
no classpath conflicts in the any23 plugin.
+
7. Build Nutch and run all unit tests:
$ cd ../../../
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 388b695..f0033b8 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="org.apache.tika" name="tika-parsers-standard-package"
rev="2.1.0" conf="*->default">
+ <dependency org="org.apache.tika" name="tika-parsers-standard-package"
rev="2.2.1" conf="*->default">
<!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
<exclude org="org.apache.tika" name="tika-core" />
<exclude org="org.apache.commons" name="commons-lang3" />
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index 275406a..1d4e2c7 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,13 +26,14 @@
<export name="*"/>
</library>
<!-- dependencies of Tika (tika-parsers-standard-package) -->
+ <library name="SparseBitSet-1.2.jar"/>
<library name="apache-mime4j-core-0.8.4.jar"/>
<library name="apache-mime4j-dom-0.8.4.jar"/>
<library name="asm-9.2.jar"/>
- <library name="bcmail-jdk15on-1.69.jar"/>
- <library name="bcpkix-jdk15on-1.69.jar"/>
- <library name="bcprov-jdk15on-1.69.jar"/>
- <library name="bcutil-jdk15on-1.69.jar"/>
+ <library name="bcmail-jdk15on-1.70.jar"/>
+ <library name="bcpkix-jdk15on-1.70.jar"/>
+ <library name="bcprov-jdk15on-1.70.jar"/>
+ <library name="bcutil-jdk15on-1.70.jar"/>
<library name="boilerpipe-1.1.0.jar"/>
<library name="commons-codec-1.15.jar"/>
<library name="commons-compress-1.21.jar"/>
@@ -42,7 +43,7 @@
<library name="curvesapi-1.06.jar"/>
<library name="dd-plist-1.23.jar"/>
<library name="dec-0.1.2.jar"/>
- <library name="fontbox-2.0.24.jar"/>
+ <library name="fontbox-2.0.25.jar"/>
<library name="istack-commons-runtime-3.0.12.jar"/>
<library name="jackcess-4.0.1.jar"/>
<library name="jackcess-encrypt-4.0.1.jar"/>
@@ -50,10 +51,10 @@
<library name="jakarta.activation-1.2.2.jar"/>
<library name="jakarta.xml.bind-api-2.3.3.jar"/>
<library name="java-libpst-0.9.3.jar"/>
- <library name="jaxb-runtime-2.3.4.jar"/>
+ <library name="jaxb-runtime-2.3.5.jar"/>
<library name="jbig2-imageio-3.0.3.jar"/>
<library name="jcl-over-slf4j-1.7.32.jar"/>
- <library name="jdom2-2.0.6.jar"/>
+ <library name="jdom2-2.0.6.1.jar"/>
<library name="jempbox-1.8.16.jar"/>
<library name="jhighlight-1.0.3.jar"/>
<library name="jmatio-1.5.jar"/>
@@ -61,41 +62,40 @@
<library name="junrar-7.4.0.jar"/>
<library name="metadata-extractor-2.16.0.jar"/>
<library name="parso-2.0.14.jar"/>
- <library name="pdfbox-2.0.24.jar"/>
- <library name="pdfbox-debugger-2.0.24.jar"/>
- <library name="pdfbox-tools-2.0.24.jar"/>
+ <library name="pdfbox-2.0.25.jar"/>
+ <library name="pdfbox-debugger-2.0.25.jar"/>
+ <library name="pdfbox-tools-2.0.25.jar"/>
<library name="poi-4.1.2.jar"/>
<library name="poi-ooxml-4.1.2.jar"/>
<library name="poi-ooxml-schemas-4.1.2.jar"/>
<library name="poi-scratchpad-4.1.2.jar"/>
<library name="rome-1.16.0.jar"/>
<library name="rome-utils-1.16.0.jar"/>
- <library name="SparseBitSet-1.2.jar"/>
<library name="tagsoup-1.2.1.jar"/>
- <library name="tika-parser-apple-module-2.1.0.jar"/>
- <library name="tika-parser-audiovideo-module-2.1.0.jar"/>
- <library name="tika-parser-cad-module-2.1.0.jar"/>
- <library name="tika-parser-code-module-2.1.0.jar"/>
- <library name="tika-parser-crypto-module-2.1.0.jar"/>
- <library name="tika-parser-digest-commons-2.1.0.jar"/>
- <library name="tika-parser-font-module-2.1.0.jar"/>
- <library name="tika-parser-html-commons-2.1.0.jar"/>
- <library name="tika-parser-html-module-2.1.0.jar"/>
- <library name="tika-parser-image-module-2.1.0.jar"/>
- <library name="tika-parser-mail-commons-2.1.0.jar"/>
- <library name="tika-parser-mail-module-2.1.0.jar"/>
- <library name="tika-parser-microsoft-module-2.1.0.jar"/>
- <library name="tika-parser-miscoffice-module-2.1.0.jar"/>
- <library name="tika-parser-news-module-2.1.0.jar"/>
- <library name="tika-parser-ocr-module-2.1.0.jar"/>
- <library name="tika-parser-pdf-module-2.1.0.jar"/>
- <library name="tika-parser-pkg-module-2.1.0.jar"/>
- <library name="tika-parsers-standard-package-2.1.0.jar"/>
- <library name="tika-parser-text-module-2.1.0.jar"/>
- <library name="tika-parser-xml-module-2.1.0.jar"/>
- <library name="tika-parser-xmp-commons-2.1.0.jar"/>
- <library name="tika-parser-zip-commons-2.1.0.jar"/>
- <library name="txw2-2.3.4.jar"/>
+ <library name="tika-parser-apple-module-2.2.1.jar"/>
+ <library name="tika-parser-audiovideo-module-2.2.1.jar"/>
+ <library name="tika-parser-cad-module-2.2.1.jar"/>
+ <library name="tika-parser-code-module-2.2.1.jar"/>
+ <library name="tika-parser-crypto-module-2.2.1.jar"/>
+ <library name="tika-parser-digest-commons-2.2.1.jar"/>
+ <library name="tika-parser-font-module-2.2.1.jar"/>
+ <library name="tika-parser-html-commons-2.2.1.jar"/>
+ <library name="tika-parser-html-module-2.2.1.jar"/>
+ <library name="tika-parser-image-module-2.2.1.jar"/>
+ <library name="tika-parser-mail-commons-2.2.1.jar"/>
+ <library name="tika-parser-mail-module-2.2.1.jar"/>
+ <library name="tika-parser-microsoft-module-2.2.1.jar"/>
+ <library name="tika-parser-miscoffice-module-2.2.1.jar"/>
+ <library name="tika-parser-news-module-2.2.1.jar"/>
+ <library name="tika-parser-ocr-module-2.2.1.jar"/>
+ <library name="tika-parser-pdf-module-2.2.1.jar"/>
+ <library name="tika-parser-pkg-module-2.2.1.jar"/>
+ <library name="tika-parser-text-module-2.2.1.jar"/>
+ <library name="tika-parser-xml-module-2.2.1.jar"/>
+ <library name="tika-parser-xmp-commons-2.2.1.jar"/>
+ <library name="tika-parser-zip-commons-2.2.1.jar"/>
+ <library name="tika-parsers-standard-package-2.2.1.jar"/>
+ <library name="txw2-2.3.5.jar"/>
<library name="vorbis-java-core-0.8.jar"/>
<library name="vorbis-java-tika-0.8.jar"/>
<library name="xmlbeans-3.1.0.jar"/>
diff --git
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index 3dfe26d..de0c1c6 100644
---
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -26,7 +26,7 @@ import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.DublinCore;
import org.junit.Assert;
import org.junit.Test;
@@ -60,7 +60,7 @@ public class TestRTFParser extends TikaParserTest {
Metadata meta = parse.getData().getParseMeta();
Assert.assertEquals("test rft document", title);
- Assert.assertEquals("tests",
meta.get(OfficeOpenXMLCore.SUBJECT.getName()));
+ Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT.getName()));
}
}