This is an automated email from the ASF dual-hosted git repository.

lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 847e19d  NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6 (#717)
847e19d is described below

commit 847e19d984503d333fd8fdd430fe347dd370dc4c
Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com>
AuthorDate: Sat Jan 15 15:24:21 2022 -0800

    NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6 (#717)
    
    * NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6
---
 ivy/ivy.xml                                        |   2 +-
 src/plugin/any23/ivy.xml                           |   2 +-
 src/plugin/any23/plugin.xml                        | 272 ++++++++++-----------
 .../apache/nutch/any23/Any23IndexingFilter.java    |   2 +-
 .../org/apache/nutch/any23/Any23ParseFilter.java   |  35 +--
 src/plugin/build-plugin.xml                        |   3 +-
 src/plugin/language-identifier/ivy.xml             |   2 +-
 src/plugin/language-identifier/plugin.xml          |   6 +-
 src/plugin/parse-tika/howto_upgrade_tika.txt       |   5 +-
 src/plugin/parse-tika/ivy.xml                      |   2 +-
 src/plugin/parse-tika/plugin.xml                   |  70 +++---
 .../org/apache/nutch/parse/tika/TestRTFParser.java |   4 +-
 12 files changed, 192 insertions(+), 213 deletions(-)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 8d154bf..34e298f 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -63,7 +63,7 @@
                <dependency org="org.apache.hadoop" 
name="hadoop-mapreduce-client-jobclient" rev="3.1.3" conf="*->default" />
                <!-- End of Hadoop Dependencies -->
 
-               <dependency org="org.apache.tika" name="tika-core" rev="2.1.0" />
+               <dependency org="org.apache.tika" name="tika-core" rev="2.2.1" />
 
                <dependency org="xml-apis" name="xml-apis" rev="1.4.01" /><!-- 
force this version as it is required by Tika -->
                <dependency org="xerces" name="xercesImpl" rev="2.12.1" />
diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml
index a5a0077..7220a25 100644
--- a/src/plugin/any23/ivy.xml
+++ b/src/plugin/any23/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.any23" name="apache-any23-core" rev="2.5" 
conf="*->default">
+    <dependency org="org.apache.any23" name="apache-any23-core" rev="2.6" 
conf="*->default">
       <exclude org="org.apache.commons" name="commons-lang" />
       <exclude org="org.apache.commons" name="commons-compress" />
       <exclude org="org.slf4j" name="slf4j-log4j12" />
diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml
index cc941b2..40a42c7 100644
--- a/src/plugin/any23/plugin.xml
+++ b/src/plugin/any23/plugin.xml
@@ -26,194 +26,168 @@
       <export name="*"/>
     </library>
       <!-- Begin Any23 dependencies -->
-      <library name="FastInfoset-1.2.16.jar"/>
-      <library name="HikariCP-java7-2.4.13.jar"/>
       <library name="SparseBitSet-1.2.jar"/>
-      <library name="apache-any23-api-2.5.jar"/>
-      <library name="apache-any23-core-2.5.jar"/>
-      <library name="apache-any23-csvutils-2.5.jar"/>
-      <library name="apache-any23-encoding-2.5.jar"/>
-      <library name="apache-any23-mime-2.5.jar"/>
-      <library name="apache-mime4j-core-0.8.3.jar"/>
-      <library name="apache-mime4j-dom-0.8.3.jar"/>
-      <library name="asm-7.3.1.jar"/>
-      <library name="bcmail-jdk15on-1.64.jar"/>
-      <library name="bcpkix-jdk15on-1.64.jar"/>
-      <library name="bcprov-jdk15on-1.64.jar"/>
-      <library name="biweekly-0.6.3.jar"/>
+      <library name="apache-any23-api-2.6.jar"/>
+      <library name="apache-any23-core-2.6.jar"/>
+      <library name="apache-any23-csvutils-2.6.jar"/>
+      <library name="apache-any23-encoding-2.6.jar"/>
+      <library name="apache-any23-mime-2.6.jar"/>
+      <library name="apache-mime4j-core-0.8.4.jar"/>
+      <library name="apache-mime4j-dom-0.8.4.jar"/>
+      <library name="asm-9.2.jar"/>
+      <library name="bcmail-jdk15on-1.70.jar"/>
+      <library name="bcpkix-jdk15on-1.70.jar"/>
+      <library name="bcutil-jdk15on-1.70.jar"/>
+      <library name="biweekly-0.6.6.jar"/>
       <library name="boilerpipe-1.1.0.jar"/>
-      <library name="bzip2-0.9.1.jar"/>
-      <library name="c3p0-0.9.5.5.jar"/>
-      <library name="caffeine-2.6.1.jar"/>
-      <library name="cdm-4.5.5.jar"/>
-      <library name="checker-qual-2.10.0.jar"/>
-      <library name="commons-codec-1.14.jar"/>
+      <library name="caffeine-2.8.1.jar"/>
+      <library name="checker-qual-3.1.0.jar"/>
+      <library name="commons-codec-1.15.jar"/>
       <library name="commons-collections4-4.4.jar"/>
-      <library name="commons-csv-1.8.jar"/>
+      <library name="commons-csv-1.9.0.jar"/>
       <library name="commons-exec-1.3.jar"/>
-      <library name="commons-io-2.6.jar"/>
-      <library name="commons-lang3-3.10.jar"/>
+      <library name="commons-io-2.11.0.jar"/>
+      <library name="commons-lang3-3.12.0.jar"/>
       <library name="commons-logging-1.2.jar"/>
       <library name="commons-math3-3.6.1.jar"/>
       <library name="commons-rdf-api-0.5.0.jar"/>
+      <library name="commons-text-1.9.jar"/>
       <library name="curvesapi-1.06.jar"/>
-      <library name="cxf-core-3.3.5.jar"/>
-      <library name="cxf-rt-frontend-jaxrs-3.3.5.jar"/>
-      <library name="cxf-rt-rs-client-3.3.5.jar"/>
-      <library name="cxf-rt-security-3.3.5.jar"/>
-      <library name="cxf-rt-transports-http-3.3.5.jar"/>
+      <library name="dd-plist-1.23.jar"/>
       <library name="dec-0.1.2.jar"/>
-      <library name="ehcache-core-2.6.2.jar"/>
       <library name="error_prone_annotations-2.3.4.jar"/>
       <library name="f8-1.1.jar"/>
       <library name="failureaccess-1.0.1.jar"/>
-      <library name="fluent-hc-4.5.10.jar"/>
-      <library name="fontbox-2.0.19.jar"/>
-      <library name="geoapi-3.0.1.jar"/>
-      <library name="geronimo-jta_1.1_spec-1.1.1.jar"/>
-      <library name="geronimo-ws-metadata_2.0_spec-1.1.3.jar"/>
-      <library name="grib-4.5.5.jar"/>
-      <library name="gson-2.8.6.jar"/>
-      <library name="guava-28.2-jre.jar"/>
+      <library name="fluent-hc-4.5.13.jar"/>
+      <library name="fontbox-2.0.25.jar"/>
+      <library name="guava-30.1.1-jre.jar"/>
       <library name="hppcrt-0.7.5.jar"/>
-      <library name="httpclient-4.5.12.jar"/>
-      <library name="httpclient-cache-4.5.12.jar"/>
-      <library name="httpclient-osgi-4.5.12.jar"/>
-      <library name="httpcore-4.4.13.jar"/>
-      <library name="httpcore-nio-4.4.12.jar"/>
-      <library name="httpcore-osgi-4.4.13.jar"/>
-      <library name="httpmime-4.5.12.jar"/>
-      <library name="httpservices-4.5.5.jar"/>
-      <library name="isoparser-1.1.22.jar"/>
-      <library name="istack-commons-runtime-3.0.8.jar"/>
-      <library name="j2objc-annotations-1.3.jar"/>
-      <library name="jackcess-3.0.1.jar"/>
-      <library name="jackcess-encrypt-3.0.0.jar"/>
-      <library name="jackson-annotations-2.12.2.jar"/>
-      <library name="jackson-core-2.12.2.jar"/>
-      <library name="jackson-databind-2.12.2.jar"/>
-      <library name="jacorb-omgapi-3.9.jar"/>
+      <library name="httpclient-4.5.13.jar"/>
+      <library name="httpclient-cache-4.5.13.jar"/>
+      <library name="httpclient-osgi-4.5.13.jar"/>
+      <library name="httpcore-4.4.15.jar"/>
+      <library name="httpcore-nio-4.4.14.jar"/>
+      <library name="httpcore-osgi-4.4.14.jar"/>
+      <library name="httpmime-4.5.13.jar"/>
+      <library name="istack-commons-runtime-3.0.12.jar"/>
+      <library name="jackcess-4.0.1.jar"/>
+      <library name="jackcess-encrypt-4.0.1.jar"/>
+      <library name="jackson-annotations-2.11.4.jar"/>
+      <library name="jackson-core-2.12.1.jar"/>
+      <library name="jackson-databind-2.11.4.jar"/>
       <library name="jai-imageio-core-1.4.0.jar"/>
-      <library name="jakarta.activation-1.2.1.jar"/>
-      <library name="jakarta.activation-api-1.2.1.jar"/>
-      <library name="jakarta.ws.rs-api-2.1.5.jar"/>
-      <library name="jakarta.xml.bind-api-2.3.2.jar"/>
+      <library name="jakarta.activation-1.2.2.jar"/>
+      <library name="jakarta.xml.bind-api-2.3.3.jar"/>
       <library name="java-libpst-0.9.3.jar"/>
-      <library name="javax.activation-1.2.0.jar"/>
       <library name="javax.activation-api-1.2.0.jar"/>
-      <library name="javax.annotation-api-1.3.2.jar"/>
       <library name="javax.inject-1.jar"/>
-      <library name="javax.xml.soap-api-1.4.0.jar"/>
       <library name="jaxb-api-2.3.1.jar"/>
-      <library name="jaxb-runtime-2.3.2.jar"/>
-      <library name="jaxws-api-2.3.1.jar"/>
+      <library name="jaxb-runtime-2.3.5.jar"/>
       <library name="jbig2-imageio-3.0.3.jar"/>
-      <library name="jboss-rmi-api_1.0_spec-1.0.6.Final.jar"/>
-      <library name="jcip-annotations-1.0.jar"/>
-      <library name="jcl-over-slf4j-1.7.30.jar"/>
-      <library name="jcommander-1.78.jar"/>
-      <library name="jdom2-2.0.6.jar"/>
+      <library name="jcl-over-slf4j-1.7.32.jar"/>
+      <library name="jcommander-1.81.jar"/>
+      <library name="jdom2-2.0.6.1.jar"/>
       <library name="jempbox-1.8.16.jar"/>
       <library name="jhighlight-1.0.3.jar"/>
       <library name="jmatio-1.5.jar"/>
-      <library name="jna-5.5.0.jar"/>
-      <library name="joda-time-2.2.jar"/>
-      <library name="json-simple-1.1.1.jar"/>
-      <library name="jsonld-java-0.13.2.jar"/>
-      <library name="jsoup-1.13.1.jar"/>
+      <library name="jsonld-java-0.13.4.jar"/>
+      <library name="jsoup-1.14.3.jar"/>
       <library name="jsr305-3.0.2.jar"/>
-      <library name="jul-to-slf4j-1.7.30.jar"/>
       <library name="juniversalchardet-1.0.3.jar"/>
-      <library name="junrar-4.0.0.jar"/>
+      <library name="junrar-7.4.0.jar"/>
       <library 
name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/>
+      <library name="log4j-core-2.17.1.jar"/>
+      <library name="log4j-slf4j-impl-2.17.1.jar"/>
       <library name="mapdb-1.0.8.jar"/>
-      <library name="mchange-commons-java-0.2.19.jar"/>
-      <library name="metadata-extractor-2.13.0.jar"/>
-      <library name="mimepull-1.9.7.jar"/>
-      <library name="netcdf4-4.5.5.jar"/>
-      <library name="openjson-1.0.11.jar"/>
-      <library name="opennlp-tools-1.9.2.jar"/>
-      <library name="owlapi-api-5.1.13.jar"/>
-      <library name="owlapi-apibinding-5.1.13.jar"/>
-      <library name="owlapi-impl-5.1.13.jar"/>
-      <library name="owlapi-oboformat-5.1.13.jar"/>
-      <library name="owlapi-parsers-5.1.13.jar"/>
-      <library name="owlapi-rio-5.1.13.jar"/>
-      <library name="owlapi-tools-5.1.13.jar"/>
-      <library name="parso-2.0.11.jar"/>
-      <library name="pdfbox-2.0.19.jar"/>
-      <library name="pdfbox-tools-2.0.19.jar"/>
+      <library name="metadata-extractor-2.16.0.jar"/>
+      <library name="owlapi-api-5.1.19.jar"/>
+      <library name="owlapi-apibinding-5.1.19.jar"/>
+      <library name="owlapi-impl-5.1.19.jar"/>
+      <library name="owlapi-oboformat-5.1.19.jar"/>
+      <library name="owlapi-parsers-5.1.19.jar"/>
+      <library name="owlapi-rio-5.1.19.jar"/>
+      <library name="owlapi-tools-5.1.19.jar"/>
+      <library name="parso-2.0.14.jar"/>
+      <library name="pdfbox-2.0.25.jar"/>
+      <library name="pdfbox-debugger-2.0.25.jar"/>
+      <library name="pdfbox-tools-2.0.25.jar"/>
       <library name="poi-4.1.2.jar"/>
       <library name="poi-ooxml-4.1.2.jar"/>
       <library name="poi-ooxml-schemas-4.1.2.jar"/>
       <library name="poi-scratchpad-4.1.2.jar"/>
-      <library name="preflight-2.0.19.jar"/>
-      <library name="protobuf-java-3.11.4.jar"/>
-      <library name="quartz-2.3.2.jar"/>
-      <library name="rdf4j-http-client-3.1.2.jar"/>
-      <library name="rdf4j-http-protocol-3.1.2.jar"/>
-      <library name="rdf4j-model-3.1.2.jar"/>
-      <library name="rdf4j-query-3.1.2.jar"/>
-      <library name="rdf4j-queryalgebra-evaluation-3.1.2.jar"/>
-      <library name="rdf4j-queryalgebra-model-3.1.2.jar"/>
-      <library name="rdf4j-queryparser-api-3.1.2.jar"/>
-      <library name="rdf4j-queryparser-sparql-3.1.2.jar"/>
-      <library name="rdf4j-queryresultio-api-3.1.2.jar"/>
-      <library name="rdf4j-queryresultio-binary-3.1.2.jar"/>
-      <library name="rdf4j-queryresultio-sparqlxml-3.1.2.jar"/>
-      <library name="rdf4j-repository-api-3.1.2.jar"/>
-      <library name="rdf4j-repository-sail-3.1.2.jar"/>
-      <library name="rdf4j-repository-sparql-3.1.2.jar"/>
-      <library name="rdf4j-rio-api-3.1.2.jar"/>
-      <library name="rdf4j-rio-binary-3.1.2.jar"/>
-      <library name="rdf4j-rio-datatypes-3.1.2.jar"/>
-      <library name="rdf4j-rio-jsonld-3.1.2.jar"/>
-      <library name="rdf4j-rio-languages-3.1.2.jar"/>
-      <library name="rdf4j-rio-n3-3.1.2.jar"/>
-      <library name="rdf4j-rio-nquads-3.1.2.jar"/>
-      <library name="rdf4j-rio-ntriples-3.1.2.jar"/>
-      <library name="rdf4j-rio-rdfjson-3.1.2.jar"/>
-      <library name="rdf4j-rio-rdfxml-3.1.2.jar"/>
-      <library name="rdf4j-rio-trig-3.1.2.jar"/>
-      <library name="rdf4j-rio-trix-3.1.2.jar"/>
-      <library name="rdf4j-rio-turtle-3.1.2.jar"/>
-      <library name="rdf4j-sail-api-3.1.2.jar"/>
-      <library name="rdf4j-sail-base-3.1.2.jar"/>
-      <library name="rdf4j-sail-memory-3.1.2.jar"/>
-      <library name="rdf4j-util-3.1.2.jar"/>
-      <library name="rome-1.12.2.jar"/>
-      <library name="rome-utils-1.12.2.jar"/>
-      <library name="saaj-impl-1.4.0-b03.jar"/>
+      <library name="rdf4j-http-client-3.7.4.jar"/>
+      <library name="rdf4j-http-protocol-3.7.4.jar"/>
+      <library name="rdf4j-model-3.7.4.jar"/>
+      <library name="rdf4j-model-api-3.7.4.jar"/>
+      <library name="rdf4j-model-vocabulary-3.7.4.jar"/>
+      <library name="rdf4j-query-3.7.4.jar"/>
+      <library name="rdf4j-queryalgebra-evaluation-3.7.4.jar"/>
+      <library name="rdf4j-queryalgebra-model-3.7.4.jar"/>
+      <library name="rdf4j-queryparser-api-3.7.4.jar"/>
+      <library name="rdf4j-queryparser-sparql-3.7.4.jar"/>
+      <library name="rdf4j-queryresultio-api-3.7.4.jar"/>
+      <library name="rdf4j-queryresultio-binary-3.7.4.jar"/>
+      <library name="rdf4j-queryresultio-sparqlxml-3.7.4.jar"/>
+      <library name="rdf4j-repository-api-3.7.4.jar"/>
+      <library name="rdf4j-repository-sail-3.7.4.jar"/>
+      <library name="rdf4j-repository-sparql-3.7.4.jar"/>
+      <library name="rdf4j-rio-api-3.7.4.jar"/>
+      <library name="rdf4j-rio-binary-3.7.4.jar"/>
+      <library name="rdf4j-rio-datatypes-3.7.4.jar"/>
+      <library name="rdf4j-rio-hdt-3.4.3.jar"/>
+      <library name="rdf4j-rio-jsonld-3.7.4.jar"/>
+      <library name="rdf4j-rio-languages-3.7.4.jar"/>
+      <library name="rdf4j-rio-n3-3.7.4.jar"/>
+      <library name="rdf4j-rio-nquads-3.7.4.jar"/>
+      <library name="rdf4j-rio-ntriples-3.7.4.jar"/>
+      <library name="rdf4j-rio-rdfjson-3.7.4.jar"/>
+      <library name="rdf4j-rio-rdfxml-3.7.4.jar"/>
+      <library name="rdf4j-rio-trig-3.7.4.jar"/>
+      <library name="rdf4j-rio-trix-3.7.4.jar"/>
+      <library name="rdf4j-rio-turtle-3.7.4.jar"/>
+      <library name="rdf4j-sail-api-3.7.4.jar"/>
+      <library name="rdf4j-sail-base-3.7.4.jar"/>
+      <library name="rdf4j-sail-memory-3.7.4.jar"/>
+      <library name="rdf4j-util-3.7.4.jar"/>
+      <library name="rome-1.16.0.jar"/>
+      <library name="rome-utils-1.16.0.jar"/>
       <library name="semargl-core-0.7.jar"/>
       <library name="semargl-rdf-0.7.jar"/>
       <library name="semargl-rdf4j-0.7.jar"/>
       <library name="semargl-rdfa-0.7.jar"/>
-      <library name="sentiment-analysis-parser-0.1.jar"/>
-      <library name="sis-feature-1.0.jar"/>
-      <library name="sis-metadata-1.0.jar"/>
-      <library name="sis-netcdf-1.0.jar"/>
-      <library name="sis-referencing-1.0.jar"/>
-      <library name="sis-storage-1.0.jar"/>
-      <library name="sis-utility-1.0.jar"/>
-      <library name="snakeyaml-1.26.jar"/>
-      <library name="stax-ex-1.8.2.jar"/>
-      <library name="stax2-api-3.1.4.jar"/>
+      <library name="snakeyaml-1.30.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-core-1.24.jar"/>
-      <library name="tika-parsers-1.24.jar"/>
-      <library name="txw2-2.3.2.jar"/>
-      <library name="udunits-4.5.5.jar"/>
-      <library name="unit-api-1.0.jar"/>
+      <library name="tika-core-2.2.1.jar"/>
+      <library name="tika-parser-apple-module-2.2.1.jar"/>
+      <library name="tika-parser-audiovideo-module-2.2.1.jar"/>
+      <library name="tika-parser-cad-module-2.2.1.jar"/>
+      <library name="tika-parser-code-module-2.2.1.jar"/>
+      <library name="tika-parser-crypto-module-2.2.1.jar"/>
+      <library name="tika-parser-digest-commons-2.2.1.jar"/>
+      <library name="tika-parser-font-module-2.2.1.jar"/>
+      <library name="tika-parser-html-commons-2.2.1.jar"/>
+      <library name="tika-parser-html-module-2.2.1.jar"/>
+      <library name="tika-parser-image-module-2.2.1.jar"/>
+      <library name="tika-parser-mail-commons-2.2.1.jar"/>
+      <library name="tika-parser-mail-module-2.2.1.jar"/>
+      <library name="tika-parser-microsoft-module-2.2.1.jar"/>
+      <library name="tika-parser-miscoffice-module-2.2.1.jar"/>
+      <library name="tika-parser-news-module-2.2.1.jar"/>
+      <library name="tika-parser-ocr-module-2.2.1.jar"/>
+      <library name="tika-parser-pdf-module-2.2.1.jar"/>
+      <library name="tika-parser-pkg-module-2.2.1.jar"/>
+      <library name="tika-parser-text-module-2.2.1.jar"/>
+      <library name="tika-parser-xml-module-2.2.1.jar"/>
+      <library name="tika-parser-xmp-commons-2.2.1.jar"/>
+      <library name="tika-parser-zip-commons-2.2.1.jar"/>
+      <library name="tika-parsers-standard-package-2.2.1.jar"/>
+      <library name="txw2-2.3.5.jar"/>
       <library name="vinnie-2.0.2.jar"/>
       <library name="vorbis-java-core-0.8.jar"/>
       <library name="vorbis-java-tika-0.8.jar"/>
-      <library name="woodstox-core-5.0.3.jar"/>
       <library name="xmlbeans-3.1.0.jar"/>
-      <library name="xmlschema-core-2.2.5.jar"/>
-      <library name="xmpbox-2.0.19.jar"/>
-      <library name="xmpcore-6.1.10.jar"/>
-      <library name="xmpcore-shaded-6.1.10.jar"/>
-      <library name="xz-1.8.jar"/>
+      <library name="xmpcore-6.1.11.jar"/>
+      <library name="xz-1.9.jar"/>
       <!-- End Any23 dependencies -->
   </runtime>
 
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
index c0f1d6f..09dc32e 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
@@ -106,7 +106,7 @@ public class Any23IndexingFilter implements IndexingFilter {
     return doc;
   }
   
-  private String keyToShortKey(String key) {
+  private static String keyToShortKey(String key) {
     if (key.startsWith("<") && key.endsWith(">")) {
       key = key.substring(1, key.length() - 1);
     }
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index af7f135..bed659f 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -20,13 +20,17 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.Set;
 import java.util.TreeSet;
-import java.util.Collections;
-import java.util.Arrays;
 
 import org.apache.any23.Any23;
 import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.filter.IgnoreAccidentalRDFa;
+import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
+import org.apache.any23.mime.TikaMIMETypeDetector;
+import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
 import org.apache.any23.writer.BenchmarkTripleHandler;
 import org.apache.any23.writer.NTriplesWriter;
 import org.apache.any23.writer.TripleHandler;
@@ -76,7 +80,7 @@ public class Any23ParseFilter implements HtmlParseFilter {
     Set<String> triples = null;
 
     Any23Parser(String url, String htmlContent, String contentType, String... 
extractorNames) throws TripleHandlerException {
-      triples = new TreeSet<>();
+      this.triples = new TreeSet<>();
       try {
         parse(url, htmlContent, contentType, extractorNames);
       } catch (URISyntaxException e) {
@@ -91,33 +95,32 @@ public class Any23ParseFilter implements HtmlParseFilter {
      * Maintains a {@link java.util.Set} containing the triples
      * @return a {@link java.util.Set} of triples.
      */
-    private Set<String> getTriples() {
-      return triples;
+    Set<String> getTriples() {
+      return this.triples;
     }
 
-    private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException {
+    private void parse(String url, String htmlContent, String contentType, String... extractorNames)
+            throws URISyntaxException, IOException, TripleHandlerException {
       Any23 any23 = new Any23(extractorNames);
-      any23.setMIMETypeDetector(null);
+      any23.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
       ByteArrayOutputStream baos = new ByteArrayOutputStream();
-      try {
-        TripleHandler tHandler = new NTriplesWriter(baos);
-        BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler);
+      try (TripleHandler tHandler = new IgnoreTitlesOfEmptyDocuments(
+              new IgnoreAccidentalRDFa(
+                      new NTriplesWriter(baos))); 
+              BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler)) {
         try {
           any23.extract(htmlContent, url, contentType, "UTF-8", bHandler);
         } catch (IOException e) {
           LOG.error("Error while reading the source", e);
         } catch (ExtractionException e) {
           LOG.error("Error while extracting structured data", e);
-        } finally {
-          tHandler.close();
-          bHandler.close();
         }
 
         LOG.debug("Any23 BenchmarkTripleHandler.report: " + bHandler.report());
 
         String n3 = baos.toString("UTF-8");
         String[] triplesStrings = n3.split("\n");
-        Collections.addAll(triples, triplesStrings);
+        Collections.addAll(this.triples, triplesStrings);
       } catch (IOException e) {
         LOG.error("Unexpected IOException", e);
       }
@@ -139,8 +142,8 @@ public class Any23ParseFilter implements HtmlParseFilter {
    */
   @Override
   public ParseResult filter(Content content, ParseResult parseResult, 
HTMLMetaTags metaTags, DocumentFragment doc) {
-    String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, 
"html-head-meta");
-    String[] supportedContentTypes = 
conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", 
"application/xhtml+xml");
+    String[] extractorNames = this.conf.getStrings(ANY_23_EXTRACTORS_CONF, 
"html-head-meta");
+    String[] supportedContentTypes = 
this.conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", 
"application/xhtml+xml");
     String contentType = content.getContentType();
     if (supportedContentTypes != null && 
!Arrays.asList(supportedContentTypes).contains(contentType)) {
       LOG.debug("Ignoring document at {} because it has an unsupported 
Content-Type {}", content.getUrl(), contentType);
diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml
index 814123e..f6e87e8 100755
--- a/src/plugin/build-plugin.xml
+++ b/src/plugin/build-plugin.xml
@@ -80,7 +80,8 @@
     </fileset>
     <!-- global test dependencies -->
     <fileset dir="${nutch.root}/build/test/lib">
-      <include name="*.jar" />
+      <include name="junit*.jar" />
+      <include name="hamcrest*.jar" />
     </fileset>
     <path refid="classpath"/>
   </path>
diff --git a/src/plugin/language-identifier/ivy.xml 
b/src/plugin/language-identifier/ivy.xml
index e7bf2e6..feba0bb 100644
--- a/src/plugin/language-identifier/ivy.xml
+++ b/src/plugin/language-identifier/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-langdetect-optimaize" 
rev="2.1.0" conf="*->default">
+    <dependency org="org.apache.tika" name="tika-langdetect-optimaize" 
rev="2.2.1" conf="*->default">
       <!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
       <exclude org="org.apache.tika" name="tika-core" />
       <exclude org="com.google.guava" name="guava" />
diff --git a/src/plugin/language-identifier/plugin.xml 
b/src/plugin/language-identifier/plugin.xml
index 85fa121..360b81a 100644
--- a/src/plugin/language-identifier/plugin.xml
+++ b/src/plugin/language-identifier/plugin.xml
@@ -27,15 +27,15 @@
       </library>
       <!-- dependencies of Tika's Optimaize language detector 
(tika-langdetect-optimaize) -->
       <library name="annotations-12.0.jar"/>
-      <library name="checker-qual-3.8.0.jar"/>
-      <library name="error_prone_annotations-2.5.1.jar"/>
+      <library name="checker-qual-3.12.0.jar"/>
+      <library name="error_prone_annotations-2.7.1.jar"/>
       <library name="failureaccess-1.0.1.jar"/>
       <library name="j2objc-annotations-1.3.jar"/>
       <library name="jsonic-1.2.11.jar"/>
       <library name="jsr305-3.0.2.jar"/>
       <library name="language-detector-0.6.jar"/>
       <library 
name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/>
-      <library name="tika-langdetect-optimaize-2.1.0.jar"/>
+      <library name="tika-langdetect-optimaize-2.2.1.jar"/>
    </runtime>
 
    <requires>
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt 
b/src/plugin/parse-tika/howto_upgrade_tika.txt
index 697750e..cb3ed6b 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -23,8 +23,7 @@
      (eventually with different versions)
    - duplicated libs can be added to the exclusions of transitive dependencies 
in
        build/plugins/parse-tika/ivy.xml
-   - but it should be made sure that the library versions in ivy/ivy.xml correspond to
-     those required by Tika
+   - but the library versions in ivy/ivy.xml MUST correspond to those required by Tika
 
 5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/:
 
@@ -34,6 +33,8 @@
 
     $ cd ../language-identifier/
 
+It should be noted that Any23 also has a dependency on Tika so you may wish to check that there are no classpath conflicts in the any23 plugin as well.
+
 7. Build Nutch and run all unit tests:
 
     $ cd ../../../
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 388b695..f0033b8 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers-standard-package" 
rev="2.1.0" conf="*->default">
+    <dependency org="org.apache.tika" name="tika-parsers-standard-package" 
rev="2.2.1" conf="*->default">
       <!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
       <exclude org="org.apache.tika" name="tika-core" />
       <exclude org="org.apache.commons" name="commons-lang3" />
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index 275406a..1d4e2c7 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,13 +26,14 @@
          <export name="*"/>
       </library>
       <!-- dependencies of Tika (tika-parsers) -->
+      <library name="SparseBitSet-1.2.jar"/>
       <library name="apache-mime4j-core-0.8.4.jar"/>
       <library name="apache-mime4j-dom-0.8.4.jar"/>
       <library name="asm-9.2.jar"/>
-      <library name="bcmail-jdk15on-1.69.jar"/>
-      <library name="bcpkix-jdk15on-1.69.jar"/>
-      <library name="bcprov-jdk15on-1.69.jar"/>
-      <library name="bcutil-jdk15on-1.69.jar"/>
+      <library name="bcmail-jdk15on-1.70.jar"/>
+      <library name="bcpkix-jdk15on-1.70.jar"/>
+      <library name="bcprov-jdk15on-1.70.jar"/>
+      <library name="bcutil-jdk15on-1.70.jar"/>
       <library name="boilerpipe-1.1.0.jar"/>
       <library name="commons-codec-1.15.jar"/>
       <library name="commons-compress-1.21.jar"/>
@@ -42,7 +43,7 @@
       <library name="curvesapi-1.06.jar"/>
       <library name="dd-plist-1.23.jar"/>
       <library name="dec-0.1.2.jar"/>
-      <library name="fontbox-2.0.24.jar"/>
+      <library name="fontbox-2.0.25.jar"/>
       <library name="istack-commons-runtime-3.0.12.jar"/>
       <library name="jackcess-4.0.1.jar"/>
       <library name="jackcess-encrypt-4.0.1.jar"/>
@@ -50,10 +51,10 @@
       <library name="jakarta.activation-1.2.2.jar"/>
       <library name="jakarta.xml.bind-api-2.3.3.jar"/>
       <library name="java-libpst-0.9.3.jar"/>
-      <library name="jaxb-runtime-2.3.4.jar"/>
+      <library name="jaxb-runtime-2.3.5.jar"/>
       <library name="jbig2-imageio-3.0.3.jar"/>
       <library name="jcl-over-slf4j-1.7.32.jar"/>
-      <library name="jdom2-2.0.6.jar"/>
+      <library name="jdom2-2.0.6.1.jar"/>
       <library name="jempbox-1.8.16.jar"/>
       <library name="jhighlight-1.0.3.jar"/>
       <library name="jmatio-1.5.jar"/>
@@ -61,41 +62,40 @@
       <library name="junrar-7.4.0.jar"/>
       <library name="metadata-extractor-2.16.0.jar"/>
       <library name="parso-2.0.14.jar"/>
-      <library name="pdfbox-2.0.24.jar"/>
-      <library name="pdfbox-debugger-2.0.24.jar"/>
-      <library name="pdfbox-tools-2.0.24.jar"/>
+      <library name="pdfbox-2.0.25.jar"/>
+      <library name="pdfbox-debugger-2.0.25.jar"/>
+      <library name="pdfbox-tools-2.0.25.jar"/>
       <library name="poi-4.1.2.jar"/>
       <library name="poi-ooxml-4.1.2.jar"/>
       <library name="poi-ooxml-schemas-4.1.2.jar"/>
       <library name="poi-scratchpad-4.1.2.jar"/>
       <library name="rome-1.16.0.jar"/>
       <library name="rome-utils-1.16.0.jar"/>
-      <library name="SparseBitSet-1.2.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parser-apple-module-2.1.0.jar"/>
-      <library name="tika-parser-audiovideo-module-2.1.0.jar"/>
-      <library name="tika-parser-cad-module-2.1.0.jar"/>
-      <library name="tika-parser-code-module-2.1.0.jar"/>
-      <library name="tika-parser-crypto-module-2.1.0.jar"/>
-      <library name="tika-parser-digest-commons-2.1.0.jar"/>
-      <library name="tika-parser-font-module-2.1.0.jar"/>
-      <library name="tika-parser-html-commons-2.1.0.jar"/>
-      <library name="tika-parser-html-module-2.1.0.jar"/>
-      <library name="tika-parser-image-module-2.1.0.jar"/>
-      <library name="tika-parser-mail-commons-2.1.0.jar"/>
-      <library name="tika-parser-mail-module-2.1.0.jar"/>
-      <library name="tika-parser-microsoft-module-2.1.0.jar"/>
-      <library name="tika-parser-miscoffice-module-2.1.0.jar"/>
-      <library name="tika-parser-news-module-2.1.0.jar"/>
-      <library name="tika-parser-ocr-module-2.1.0.jar"/>
-      <library name="tika-parser-pdf-module-2.1.0.jar"/>
-      <library name="tika-parser-pkg-module-2.1.0.jar"/>
-      <library name="tika-parsers-standard-package-2.1.0.jar"/>
-      <library name="tika-parser-text-module-2.1.0.jar"/>
-      <library name="tika-parser-xml-module-2.1.0.jar"/>
-      <library name="tika-parser-xmp-commons-2.1.0.jar"/>
-      <library name="tika-parser-zip-commons-2.1.0.jar"/>
-      <library name="txw2-2.3.4.jar"/>
+      <library name="tika-parser-apple-module-2.2.1.jar"/>
+      <library name="tika-parser-audiovideo-module-2.2.1.jar"/>
+      <library name="tika-parser-cad-module-2.2.1.jar"/>
+      <library name="tika-parser-code-module-2.2.1.jar"/>
+      <library name="tika-parser-crypto-module-2.2.1.jar"/>
+      <library name="tika-parser-digest-commons-2.2.1.jar"/>
+      <library name="tika-parser-font-module-2.2.1.jar"/>
+      <library name="tika-parser-html-commons-2.2.1.jar"/>
+      <library name="tika-parser-html-module-2.2.1.jar"/>
+      <library name="tika-parser-image-module-2.2.1.jar"/>
+      <library name="tika-parser-mail-commons-2.2.1.jar"/>
+      <library name="tika-parser-mail-module-2.2.1.jar"/>
+      <library name="tika-parser-microsoft-module-2.2.1.jar"/>
+      <library name="tika-parser-miscoffice-module-2.2.1.jar"/>
+      <library name="tika-parser-news-module-2.2.1.jar"/>
+      <library name="tika-parser-ocr-module-2.2.1.jar"/>
+      <library name="tika-parser-pdf-module-2.2.1.jar"/>
+      <library name="tika-parser-pkg-module-2.2.1.jar"/>
+      <library name="tika-parser-text-module-2.2.1.jar"/>
+      <library name="tika-parser-xml-module-2.2.1.jar"/>
+      <library name="tika-parser-xmp-commons-2.2.1.jar"/>
+      <library name="tika-parser-zip-commons-2.2.1.jar"/>
+      <library name="tika-parsers-standard-package-2.2.1.jar"/>
+      <library name="txw2-2.3.5.jar"/>
       <library name="vorbis-java-core-0.8.jar"/>
       <library name="vorbis-java-tika-0.8.jar"/>
       <library name="xmlbeans-3.1.0.jar"/>
diff --git 
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java 
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index 3dfe26d..de0c1c6 100644
--- 
a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ 
b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -26,7 +26,7 @@ import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.DublinCore;
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -60,7 +60,7 @@ public class TestRTFParser extends TikaParserTest {
     Metadata meta = parse.getData().getParseMeta();
 
     Assert.assertEquals("test rft document", title);
-    Assert.assertEquals("tests", meta.get(OfficeOpenXMLCore.SUBJECT.getName()));
+    Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT.getName()));
 
   }
 }

Reply via email to