This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/branch-2.4 by this push: new 52188de NUTCH-2734 Upgrade Tika dependency to 1.22 - fall back to default Tika config if custom config file is not found - warn if loading a parser fails (reports potential plugin class loader issues) - improve ant build file to download plugin dependencies (src/plugin/parse-tika/build-ivy.xml) - complete exclusions of dependencies provided also in Nutch core - force same version of xml-apis to be used by tika-core and tika-parsers: otherwise Tika parsers may fail with a link [...] 52188de is described below commit 52188de61965d2ad366df28877072f77f6f16b07 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Mon Sep 2 15:26:29 2019 +0200 NUTCH-2734 Upgrade Tika dependency to 1.22 - fall back to default Tika config if custom config file is not found - warn if loading a parser fails (reports potential plugin class loader issues) - improve ant build file to download plugin dependencies (src/plugin/parse-tika/build-ivy.xml) - complete exclusions of dependencies provided also in Nutch core - force same version of xml-apis to be used by tika-core and tika-parsers: otherwise Tika parsers may fail with a linkage error because different implementations of org.xml.sax.ContentHandler are used --- ivy/ivy.xml | 6 +- src/plugin/parse-tika/build-ivy.xml | 4 +- src/plugin/parse-tika/build.xml | 4 +- src/plugin/parse-tika/howto_upgrade_tika.txt | 21 +++- src/plugin/parse-tika/ivy.xml | 19 +++- src/plugin/parse-tika/plugin.xml | 110 +++++++++++---------- src/test/org/apache/nutch/parse/TestParseUtil.java | 2 +- 7 files changed, 102 insertions(+), 64 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 322eecb..48b8c3f 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -60,11 +60,13 @@ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.5.2" conf="*->default"/> <!-- End of Hadoop Dependencies --> - <dependency org="com.ibm.icu" name="icu4j" rev="55.1" /> - <dependency 
org="org.apache.tika" name="tika-core" rev="1.20" /> + <dependency org="com.ibm.icu" name="icu4j" rev="61.1" /> + <dependency org="org.apache.tika" name="tika-core" rev="1.22" /> <!-- Tika requires a recent version of commons-compress --> <dependency org="org.apache.commons" name="commons-compress" rev="1.18" conf="*->default" /> <dependency org="com.googlecode.juniversalchardet" name="juniversalchardet" rev="1.0.3"/> + <dependency org="xml-apis" name="xml-apis" rev="1.4.01"/><!-- force this version as it is required by Tika --> + <dependency org="xerces" name="xercesImpl" rev="2.12.0" /> <dependency org="log4j" name="log4j" rev="1.2.15" conf="*->master" /> diff --git a/src/plugin/parse-tika/build-ivy.xml b/src/plugin/parse-tika/build-ivy.xml index e4984d8..738f041 100644 --- a/src/plugin/parse-tika/build-ivy.xml +++ b/src/plugin/parse-tika/build-ivy.xml @@ -17,14 +17,14 @@ --> <project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant"> - <property name="ivy.install.version" value="2.1.0" /> + <property name="ivy.install.version" value="2.4.0" /> <condition property="ivy.home" value="${env.IVY_HOME}"> <isset property="env.IVY_HOME" /> </condition> <property name="ivy.home" value="${user.home}/.ant" /> <property name="ivy.checksums" value="" /> <property name="ivy.jar.dir" value="${ivy.home}/lib" /> - <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" /> + <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy-${ivy.install.version}.jar" /> <target name="download-ivy" unless="offline"> diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml index 5bb43fc..bda9e89 100644 --- a/src/plugin/parse-tika/build.xml +++ b/src/plugin/parse-tika/build.xml @@ -18,8 +18,8 @@ <project name="parse-tika" default="jar-core"> <import file="../build-plugin.xml"/> - - <!-- Deploy Unit test dependencies --> + + <!-- Deploy Unit test dependencies --> <target name="deps-test"> <ant target="deploy" inheritall="false" 
dir="../nutch-extensionpoints"/> <ant target="deploy" inheritall="false" dir="../protocol-file"/> diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt index c02f8ea..aa4147c 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.txt @@ -1,4 +1,4 @@ -1. Upgrade Tika depencency (tika-core) in ivy/ivy.xml +1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml 2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml @@ -15,6 +15,23 @@ <!-- end of dependencies of Tika (tika-parsers) --> with the output of the command above. -4. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/: +4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies: + - check for libs present both in + build/lib + and + build/plugins/parse-tika/ + (possibly with different versions) + - duplicated libs can be added to the exclusions of transitive dependencies in + build/plugins/parse-tika/ivy.xml + - but it should be made sure that the library versions in ivy/ivy.xml correspond to + those required by Tika + +5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/: $ rm -rf lib/ + +6. 
Build Nutch and run all unit tests: + + $ cd ../../../ + $ ant clean runtime test + diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index 738498a..08d0f12 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -36,16 +36,27 @@ </publications> <dependencies> - <dependency org="org.apache.tika" name="tika-parsers" rev="1.20" conf="*->default"> + <dependency org="org.apache.tika" name="tika-parsers" rev="1.22" conf="*->default"> + <!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) --> <exclude org="org.apache.tika" name="tika-core" /> <exclude org="org.apache.httpcomponents" name="httpclient" /> <exclude org="org.apache.httpcomponents" name="httpcore" /> - <exclude org="org.slf4j" name="slf4j-log4j12" /> - <exclude org="org.slf4j" name="slf4j-api" /> <exclude org="commons-lang" name="commons-lang" /> - <exclude org="commons-codec" name="commons-codec" /> + <exclude org="org.apache.commons" name="commons-lang3" /> + <exclude org="org.apache.commons" name="commons-codec" /> + <exclude org="commons-codec" name="commons-codec" /><!-- older versions are published with org=commons-codec --> + <exclude org="org.apache.commons" name="commons-collections4" /> <exclude org="org.apache.commons" name="commons-compress" /> + <exclude org="org.apache.cxf" name="cxf-core" /> + <exclude org="org.apache.cxf" name="cxf-rt-transports-http" /> + <exclude org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" /> + <exclude org="com.fasterxml.jackson.core" name="jackson-core" /> + <exclude org="com.fasterxml.jackson.core" name="jackson-databind" /> + <exclude org="com.fasterxml.jackson.core" name="jackson-annotations" /> <exclude org="com.google.protobuf" name="protobuf-java" /> + <exclude org="org.slf4j" name="slf4j-log4j12" /> + <exclude org="org.slf4j" name="slf4j-api" /> + <exclude org="xml-apis" name="xml-apis" /><!-- must be provided in core as it is used also by tika-core --> </dependency> </dependencies> diff --git 
a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index 6559688..18dad6c 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -25,84 +25,92 @@ <library name="parse-tika.jar"> <export name="*"/> </library> - - <library name="activation-1.1.1.jar"/> - <library name="apache-mime4j-core-0.8.2.jar"/> - <library name="apache-mime4j-dom-0.8.2.jar"/> - <library name="asm-7.0.jar"/> - <library name="bcmail-jdk15on-1.60.jar"/> - <library name="bcpkix-jdk15on-1.60.jar"/> - <library name="bcprov-jdk15on-1.60.jar"/> + <!-- dependencies of Tika (tika-parsers) --> + <library name="animal-sniffer-annotations-1.17.jar"/> + <library name="ant-1.10.5.jar"/> + <library name="ant-launcher-1.10.5.jar"/> + <library name="apache-mime4j-core-0.8.3.jar"/> + <library name="apache-mime4j-dom-0.8.3.jar"/> + <library name="asm-7.2-beta.jar"/> + <library name="bcmail-jdk15on-1.62.jar"/> + <library name="bcpkix-jdk15on-1.62.jar"/> + <library name="bcprov-jdk15on-1.62.jar"/> <library name="boilerpipe-1.1.0.jar"/> <library name="bzip2-0.9.1.jar"/> - <library name="c3p0-0.9.1.1.jar"/> + <library name="c3p0-0.9.5.4.jar"/> <library name="cdm-4.5.5.jar"/> - <library name="commons-collections4-4.2.jar"/> - <library name="commons-csv-1.6.jar"/> + <library name="checker-qual-2.8.1.jar"/> + <library name="codemodel-2.3.2.jar"/> + <library name="commons-csv-1.7.jar"/> <library name="commons-exec-1.3.jar"/> <library name="commons-io-2.6.jar"/> - <library name="commons-lang3-3.8.1.jar"/> + <library name="commons-logging-1.2.jar"/> <library name="commons-math3-3.6.1.jar"/> <library name="curvesapi-1.05.jar"/> - <library name="cxf-core-3.2.7.jar"/> - <library name="cxf-rt-frontend-jaxrs-3.2.7.jar"/> - <library name="cxf-rt-rs-client-3.2.7.jar"/> - <library name="cxf-rt-transports-http-3.2.7.jar"/> + <library name="cxf-rt-rs-client-3.3.2.jar"/> + <library name="cxf-rt-security-3.3.2.jar"/> <library name="dec-0.1.2.jar"/> + <library 
name="dtd-parser-1.4.1.jar"/> <library name="ehcache-core-2.6.2.jar"/> - <library name="FastInfoset-1.2.15.jar"/> - <library name="fontbox-2.0.13.jar"/> + <library name="error_prone_annotations-2.3.2.jar"/> + <library name="failureaccess-1.0.1.jar"/> + <library name="FastInfoset-1.2.16.jar"/> + <library name="fontbox-2.0.16.jar"/> <library name="geoapi-3.0.1.jar"/> <library name="grib-4.5.5.jar"/> <library name="gson-2.8.5.jar"/> - <library name="guava-17.0.jar"/> - <library name="httpmime-4.5.6.jar"/> + <library name="guava-28.0-jre.jar"/> + <library name="httpmime-4.5.9.jar"/> <library name="httpservices-4.5.5.jar"/> <library name="isoparser-1.1.22.jar"/> - <library name="istack-commons-runtime-3.0.7.jar"/> - <library name="jackcess-2.1.12.jar"/> - <library name="jackcess-encrypt-2.1.4.jar"/> - <library name="jackson-annotations-2.9.7.jar"/> - <library name="jackson-core-2.9.7.jar"/> - <library name="jackson-databind-2.9.7.jar"/> + <library name="istack-commons-runtime-3.0.8.jar"/> + <library name="istack-commons-tools-3.0.8.jar"/> + <library name="j2objc-annotations-1.3.jar"/> + <library name="jackcess-3.0.1.jar"/> + <library name="jackcess-encrypt-3.0.0.jar"/> <library name="jai-imageio-core-1.4.0.jar"/> + <library name="jakarta.activation-1.2.1.jar"/> + <library name="jakarta.activation-api-1.2.1.jar"/> + <library name="jakarta.ws.rs-api-2.1.5.jar"/> + <library name="jakarta.xml.bind-api-2.3.2.jar"/> <library name="java-libpst-0.8.1.jar"/> - <library name="javax.activation-1.2.0.jar"/> <library name="javax.annotation-api-1.3.2.jar"/> - <library name="javax.ws.rs-api-2.1.1.jar"/> - <library name="jaxb-api-2.3.1.jar"/> - <library name="jaxb-runtime-2.3.1.jar"/> + <library name="jaxb-runtime-2.3.2.jar"/> + <library name="jaxb-xjc-2.3.2.jar"/> <library name="jbig2-imageio-3.0.2.jar"/> <library name="jcip-annotations-1.0.jar"/> - <library name="jcl-over-slf4j-1.7.25.jar"/> + <library name="jcl-over-slf4j-1.7.26.jar"/> <library name="jcommander-1.35.jar"/> <library 
name="jdom2-2.0.6.jar"/> <library name="jempbox-1.8.16.jar"/> <library name="jhighlight-1.0.3.jar"/> <library name="jmatio-1.5.jar"/> - <library name="jna-5.1.0.jar"/> + <library name="jna-5.3.1.jar"/> <library name="joda-time-2.2.jar"/> <library name="json-simple-1.1.1.jar"/> - <library name="jsoup-1.11.3.jar"/> - <library name="jul-to-slf4j-1.7.25.jar"/> + <library name="jsoup-1.12.1.jar"/> + <library name="jsr305-3.0.2.jar"/> + <library name="jul-to-slf4j-1.7.26.jar"/> <library name="juniversalchardet-1.0.3.jar"/> - <library name="junrar-2.0.0.jar"/> + <library name="junrar-4.0.0.jar"/> + <library name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/> + <library name="mchange-commons-java-0.2.15.jar"/> <library name="metadata-extractor-2.11.0.jar"/> <library name="netcdf4-4.5.5.jar"/> - <library name="openjson-1.0.10.jar"/> - <library name="opennlp-tools-1.9.0.jar"/> - <library name="parso-2.0.10.jar"/> - <library name="pdfbox-2.0.13.jar"/> - <library name="pdfbox-tools-2.0.13.jar"/> + <library name="openjson-1.0.11.jar"/> + <library name="opennlp-tools-1.9.1.jar"/> + <library name="parso-2.0.11.jar"/> + <library name="pdfbox-2.0.16.jar"/> + <library name="pdfbox-tools-2.0.16.jar"/> <library name="poi-4.0.1.jar"/> <library name="poi-ooxml-4.0.1.jar"/> <library name="poi-ooxml-schemas-4.0.1.jar"/> <library name="poi-scratchpad-4.0.1.jar"/> - <library name="procyon-compilertools-0.5.32.jar"/> - <library name="procyon-core-0.5.32.jar"/> <library name="quartz-2.2.0.jar"/> - <library name="rome-1.12.0.jar"/> - <library name="rome-utils-1.12.0.jar"/> + <library name="relaxng-datatype-2.3.2.jar"/> + <library name="rngom-2.3.2.jar"/> + <library name="rome-1.12.1.jar"/> + <library name="rome-utils-1.12.1.jar"/> <library name="sentiment-analysis-parser-0.1.jar"/> <library name="sis-feature-0.8.jar"/> <library name="sis-metadata-0.8.jar"/> @@ -111,28 +119,28 @@ <library name="sis-storage-0.8.jar"/> <library name="sis-utility-0.8.jar"/> <library 
name="stax2-api-3.1.4.jar"/> - <library name="stax-ex-1.8.jar"/> + <library name="stax-ex-1.8.1.jar"/> <library name="tagsoup-1.2.1.jar"/> - <library name="tika-parsers-1.20.jar"/> - <library name="txw2-2.3.1.jar"/> + <library name="tika-parsers-1.22.jar"/> + <library name="txw2-2.3.2.jar"/> <library name="udunits-4.5.5.jar"/> - <library name="uimafit-core-2.4.0.jar"/> - <library name="uimaj-core-3.0.1.jar"/> <library name="unit-api-1.0.jar"/> <library name="vorbis-java-core-0.8.jar"/> <library name="vorbis-java-tika-0.8.jar"/> <library name="woodstox-core-5.0.3.jar"/> + <library name="xercesImpl-2.12.0.jar"/> <library name="xmlbeans-3.0.2.jar"/> - <library name="xmlschema-core-2.2.3.jar"/> + <library name="xmlschema-core-2.2.4.jar"/> <library name="xmpcore-5.1.3.jar"/> + <library name="xsom-2.3.2.jar"/> <library name="xz-1.8.jar"/> + <!-- end of dependencies of Tika (tika-parsers) --> </runtime> <requires> <import plugin="nutch-extensionpoints"/> </requires> - <extension point="org.apache.nutch.parse.Parser" id="org.apache.nutch.parse.tika" name="TikaParser"> diff --git a/src/test/org/apache/nutch/parse/TestParseUtil.java b/src/test/org/apache/nutch/parse/TestParseUtil.java index 3fb48b2..6f9404c 100644 --- a/src/test/org/apache/nutch/parse/TestParseUtil.java +++ b/src/test/org/apache/nutch/parse/TestParseUtil.java @@ -62,7 +62,7 @@ public class TestParseUtil { page.setContent(ByteBuffer.wrap(content.getBytes("utf-8"))); page.setStatus((int)CrawlStatus.STATUS_FETCHED); parseUtil.process("http://www.example.com/", page); - assertTrue("Wrong URL!", page.getOutlinks().size() == 2); + assertTrue("Wrong number of URLs", page.getOutlinks().size() == 2); assertTrue("Wrong URL", page.getOutlinks().containsKey(new Utf8("http://www.nutch.org/index.html"))); assertTrue("Wrong URL", page.getOutlinks().containsKey(new Utf8("http://www.apache.org/foundation/")));