This is an automated email from the ASF dual-hosted git repository. snagel pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 784aa5f NUTCH-2682 Upgrade to Tika 1.20 - upgrade to Tika dependencies to version 1.20 - plugin parse-tika: add exclusions of transitive dependencies already provided as Nutch core dependencies - upgrade Nutch core dependencies to match versions required by Tika 1.20 - apply code formatting template to TikaParser class and replace deprecated method calls new 6934d52 Merge pull request #424 from sebastian-nagel/NUTCH-2682-upgrade-tika 784aa5f is described below commit 784aa5f8a5210cdd129a583c1dccdffaad5f9807 Author: Sebastian Nagel <sna...@apache.org> AuthorDate: Fri Jan 4 17:40:17 2019 +0100 NUTCH-2682 Upgrade to Tika 1.20 - upgrade to Tika dependencies to version 1.20 - plugin parse-tika: add exclusions of transitive dependencies already provided as Nutch core dependencies - upgrade Nutch core dependencies to match versions required by Tika 1.20 - apply code formatting template to TikaParser class and replace deprecated method calls --- ivy/ivy.xml | 26 +++---- src/plugin/parse-tika/howto_upgrade_tika.txt | 19 ++++++ src/plugin/parse-tika/ivy.xml | 16 ++++- src/plugin/parse-tika/plugin.xml | 79 +++++++++++----------- .../org/apache/nutch/parse/tika/TikaParser.java | 78 +++++++++++---------- 5 files changed, 129 insertions(+), 89 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index f1e4a80..52826bb 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -43,11 +43,11 @@ <exclude org="com.sun.jmx" name="jmxri" /> </dependency--> - <dependency org="org.apache.commons" name="commons-lang3" rev="3.7" conf="*->default" /> - <dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->master" /> - <dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5" conf="*->master" /> + <dependency org="org.apache.commons" name="commons-lang3" rev="3.8.1" conf="*->default" /> + <dependency org="org.apache.commons" name="commons-collections4" rev="4.2" conf="*->master" /> + 
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.6" conf="*->master" /> <dependency org="commons-codec" name="commons-codec" rev="1.11" conf="*->default" /> - <dependency org="org.apache.commons" name="commons-compress" rev="1.16.1" conf="*->default" /> + <dependency org="org.apache.commons" name="commons-compress" rev="1.18" conf="*->default" /> <dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" /> <dependency org="com.tdunning" name="t-digest" rev="3.2" /> @@ -65,7 +65,7 @@ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/> <!-- End of Hadoop Dependencies --> - <dependency org="org.apache.tika" name="tika-core" rev="1.19.1" /> + <dependency org="org.apache.tika" name="tika-core" rev="1.20" /> <dependency org="com.ibm.icu" name="icu4j" rev="61.1" /> <dependency org="xerces" name="xercesImpl" rev="2.11.0" /> @@ -78,14 +78,14 @@ <dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" /> <!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/--> - <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.1.15" conf="*->default"/> - <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.1.15" conf="*->default"/> - <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.1.15" conf="*->default"/> - <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.1.15" conf="*->default"/> - <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.1.15" conf="test->default"/> - <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.5" conf="*->default"/> - <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.5" conf="*->default"/> - <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.5" conf="*->default"/> + <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.2.7" 
conf="*->default"/> + <dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.2.7" conf="*->default"/> + <dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.2.7" conf="*->default"/> + <dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.2.7" conf="*->default"/> + <dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.2.7" conf="test->default"/> + <dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.7" conf="*->default"/> + <dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.7" conf="*->default"/> + <dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.7" conf="*->default"/> <!-- WARC artifacts needed --> <dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default"> diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt index f8bbae1..fbf7207 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.txt @@ -15,4 +15,23 @@ <!-- end of dependencies of Tika (tika-parsers) --> with the output of the command above. +4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies: + - check for libs present both in + build/lib + and + build/plugins/parse-tika/ + (possibly with different versions) + - duplicated libs can be added to the exclusions of transitive dependencies in + build/plugins/parse-tika/ivy.xml + - but it should be made sure that the library versions in ivy/ivy.xml correspond to + those required by Tika + +5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/: + + $ rm -rf lib/ + +6. 
Build Nutch and run all unit tests: + + $ cd ../../../ + $ ant clean runtime test diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index 53c7775..df06f14 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -36,14 +36,24 @@ </publications> <dependencies> - <dependency org="org.apache.tika" name="tika-parsers" rev="1.19.1" conf="*->default"> + <dependency org="org.apache.tika" name="tika-parsers" rev="1.20" conf="*->default"> + <!-- exclusions of dependencies in Nutch core (ivy/ivy.xml) --> <exclude org="org.apache.tika" name="tika-core" /> <exclude org="org.apache.httpcomponents" name="httpclient" /> <exclude org="org.apache.httpcomponents" name="httpcore" /> - <exclude org="org.slf4j" name="slf4j-log4j12" /> - <exclude org="org.slf4j" name="slf4j-api" /> <exclude org="commons-lang" name="commons-lang" /> + <exclude org="org.apache.commons" name="commons-lang3" /> + <exclude org="org.apache.commons" name="commons-codec" /> + <exclude org="commons-codec" name="commons-codec" /><!-- older versions are published with org=commons-codec --> + <exclude org="org.apache.commons" name="commons-collections4" /> + <exclude org="org.apache.commons" name="commons-compress" /> + <exclude org="org.apache.cxf" name="cxf-core" /> + <exclude org="org.apache.cxf" name="cxf-rt-transports-http" /> + <exclude org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" /> + <exclude org="com.fasterxml.jackson.core" name="jackson-databind" /> <exclude org="com.google.protobuf" name="protobuf-java" /> + <exclude org="org.slf4j" name="slf4j-log4j12" /> + <exclude org="org.slf4j" name="slf4j-api" /> </dependency> </dependencies> diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index 7dbe180..b89f41e 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -26,10 +26,9 @@ <export name="*"/> </library> <!-- dependencies of Tika (tika-parsers) --> - <library name="activation-1.1.1.jar"/> 
<library name="apache-mime4j-core-0.8.2.jar"/> <library name="apache-mime4j-dom-0.8.2.jar"/> - <library name="asm-6.2.jar"/> + <library name="asm-7.0.jar"/> <library name="bcmail-jdk15on-1.60.jar"/> <library name="bcpkix-jdk15on-1.60.jar"/> <library name="bcprov-jdk15on-1.60.jar"/> @@ -37,22 +36,22 @@ <library name="bzip2-0.9.1.jar"/> <library name="c3p0-0.9.1.1.jar"/> <library name="cdm-4.5.5.jar"/> - <library name="commons-codec-1.11.jar"/> <library name="commons-collections4-4.2.jar"/> <library name="commons-compress-1.18.jar"/> - <library name="commons-csv-1.5.jar"/> + <library name="commons-csv-1.6.jar"/> <library name="commons-exec-1.3.jar"/> <library name="commons-io-2.6.jar"/> - <library name="commons-logging-1.2.jar"/> - <library name="curvesapi-1.04.jar"/> - <library name="cxf-core-3.2.6.jar"/> - <library name="cxf-rt-frontend-jaxrs-3.2.6.jar"/> - <library name="cxf-rt-rs-client-3.2.6.jar"/> - <library name="cxf-rt-transports-http-3.2.6.jar"/> + <library name="commons-lang3-3.8.1.jar"/> + <library name="commons-math3-3.6.1.jar"/> + <library name="curvesapi-1.05.jar"/> + <library name="cxf-core-3.2.7.jar"/> + <library name="cxf-rt-frontend-jaxrs-3.2.7.jar"/> + <library name="cxf-rt-rs-client-3.2.7.jar"/> + <library name="cxf-rt-transports-http-3.2.7.jar"/> <library name="dec-0.1.2.jar"/> <library name="ehcache-core-2.6.2.jar"/> - <library name="FastInfoset-1.2.13.jar"/> - <library name="fontbox-2.0.12.jar"/> + <library name="FastInfoset-1.2.15.jar"/> + <library name="fontbox-2.0.13.jar"/> <library name="geoapi-3.0.1.jar"/> <library name="grib-4.5.5.jar"/> <library name="gson-2.8.5.jar"/> @@ -60,19 +59,19 @@ <library name="httpmime-4.5.6.jar"/> <library name="httpservices-4.5.5.jar"/> <library name="isoparser-1.1.22.jar"/> - <library name="istack-commons-runtime-3.0.5.jar"/> + <library name="istack-commons-runtime-3.0.7.jar"/> <library name="jackcess-2.1.12.jar"/> <library name="jackcess-encrypt-2.1.4.jar"/> - <library name="jackson-annotations-2.9.6.jar"/> 
- <library name="jackson-core-2.9.6.jar"/> - <library name="jackson-databind-2.9.6.jar"/> + <library name="jackson-annotations-2.9.7.jar"/> + <library name="jackson-core-2.9.7.jar"/> + <library name="jackson-databind-2.9.7.jar"/> <library name="jai-imageio-core-1.4.0.jar"/> <library name="java-libpst-0.8.1.jar"/> - <library name="javax.annotation-api-1.3.jar"/> - <library name="javax.ws.rs-api-2.1.jar"/> - <library name="jaxb-api-2.3.0.jar"/> - <library name="jaxb-core-2.3.0.1.jar"/> - <library name="jaxb-runtime-2.3.0.1.jar"/> + <library name="javax.activation-1.2.0.jar"/> + <library name="javax.annotation-api-1.3.2.jar"/> + <library name="javax.ws.rs-api-2.1.1.jar"/> + <library name="jaxb-api-2.3.1.jar"/> + <library name="jaxb-runtime-2.3.1.jar"/> <library name="jbig2-imageio-3.0.2.jar"/> <library name="jcip-annotations-1.0.jar"/> <library name="jcl-over-slf4j-1.7.25.jar"/> @@ -81,7 +80,7 @@ <library name="jempbox-1.8.16.jar"/> <library name="jhighlight-1.0.3.jar"/> <library name="jmatio-1.5.jar"/> - <library name="jna-4.3.0.jar"/> + <library name="jna-5.1.0.jar"/> <library name="joda-time-2.2.jar"/> <library name="json-simple-1.1.1.jar"/> <library name="jsoup-1.11.3.jar"/> @@ -92,16 +91,18 @@ <library name="netcdf4-4.5.5.jar"/> <library name="openjson-1.0.10.jar"/> <library name="opennlp-tools-1.9.0.jar"/> - <library name="parso-2.0.9.jar"/> - <library name="pdfbox-2.0.12.jar"/> - <library name="pdfbox-tools-2.0.12.jar"/> - <library name="poi-4.0.0.jar"/> - <library name="poi-ooxml-4.0.0.jar"/> - <library name="poi-ooxml-schemas-4.0.0.jar"/> - <library name="poi-scratchpad-4.0.0.jar"/> + <library name="parso-2.0.10.jar"/> + <library name="pdfbox-2.0.13.jar"/> + <library name="pdfbox-tools-2.0.13.jar"/> + <library name="poi-4.0.1.jar"/> + <library name="poi-ooxml-4.0.1.jar"/> + <library name="poi-ooxml-schemas-4.0.1.jar"/> + <library name="poi-scratchpad-4.0.1.jar"/> + <library name="procyon-compilertools-0.5.32.jar"/> + <library name="procyon-core-0.5.32.jar"/> 
<library name="quartz-2.2.0.jar"/> - <library name="rome-1.5.1.jar"/> - <library name="rome-utils-1.5.1.jar"/> + <library name="rome-1.12.0.jar"/> + <library name="rome-utils-1.12.0.jar"/> <library name="sentiment-analysis-parser-0.1.jar"/> <library name="sis-feature-0.8.jar"/> <library name="sis-metadata-0.8.jar"/> @@ -109,19 +110,19 @@ <library name="sis-referencing-0.8.jar"/> <library name="sis-storage-0.8.jar"/> <library name="sis-utility-0.8.jar"/> - <library name="stax2-api-4.1.jar"/> - <library name="stax-ex-1.7.8.jar"/> + <library name="stax2-api-3.1.4.jar"/> + <library name="stax-ex-1.8.jar"/> <library name="tagsoup-1.2.1.jar"/> - <library name="tika-parsers-1.19.1.jar"/> - <library name="txw2-2.3.0.1.jar"/> + <library name="tika-parsers-1.20.jar"/> + <library name="txw2-2.3.1.jar"/> <library name="udunits-4.5.5.jar"/> - <library name="uimafit-core-2.2.0.jar"/> - <library name="uimaj-core-2.9.0.jar"/> + <library name="uimafit-core-2.4.0.jar"/> + <library name="uimaj-core-3.0.1.jar"/> <library name="unit-api-1.0.jar"/> <library name="vorbis-java-core-0.8.jar"/> <library name="vorbis-java-tika-0.8.jar"/> - <library name="woodstox-core-5.1.0.jar"/> - <library name="xmlbeans-3.0.1.jar"/> + <library name="woodstox-core-5.0.3.jar"/> + <library name="xmlbeans-3.0.2.jar"/> <library name="xmlschema-core-2.2.3.jar"/> <library name="xmpcore-5.1.3.jar"/> <library name="xz-1.8.jar"/> diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java index e346940..7440333 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java @@ -42,6 +42,7 @@ import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.html.BoilerpipeContentHandler; +import 
org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.html.HtmlMapper; @@ -70,6 +71,8 @@ public class TikaParser implements org.apache.nutch.parse.Parser { private String cachingPolicy; private HtmlMapper HTMLMapper; private boolean upperCaseElementNames = true; + private String boilerpipeExtractorName; + private boolean useBoilerpipe; public ParseResult getParse(Content content) { HTMLDocumentImpl doc = new HTMLDocumentImpl(); @@ -83,59 +86,59 @@ public class TikaParser implements org.apache.nutch.parse.Parser { ParseResult getParse(Content content, HTMLDocumentImpl doc, DocumentFragment root) { String mimeType = content.getContentType(); - - boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe"); - String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); URL base; try { base = new URL(content.getBaseUrl()); } catch (MalformedURLException e) { - return new ParseStatus(e) - .getEmptyParseResult(content.getUrl(), getConf()); + return new ParseStatus(e).getEmptyParseResult(content.getUrl(), + getConf()); } // get the right parser using the mime type as a clue - Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); - byte[] raw = content.getContent(); - + CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser(); + Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType)); if (parser == null) { String message = "Can't retrieve Tika parser for mime-type " + mimeType; LOG.error(message); - return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult( - content.getUrl(), getConf()); + return new ParseStatus(ParseStatus.FAILED, message) + .getEmptyParseResult(content.getUrl(), getConf()); } - LOG.debug("Using Tika parser " + parser.getClass().getName() - + " for mime-type " + mimeType); + LOG.debug("Using Tika parser {} for mime-type {}.", + 
parser.getClass().getName(), mimeType); + byte[] raw = content.getContent(); Metadata tikamd = new Metadata(); ContentHandler domHandler; - + // Check whether to use Tika's BoilerplateContentHandler if (useBoilerpipe) { - BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root), - BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); + BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler( + (ContentHandler) new DOMBuilder(doc, root), + BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName)); bpHandler.setIncludeMarkup(true); - domHandler = (ContentHandler)bpHandler; + domHandler = (ContentHandler) bpHandler; } else { DOMBuilder domBuilder = new DOMBuilder(doc, root); domBuilder.setUpperCaseElementNames(upperCaseElementNames); domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML); - domHandler = (ContentHandler)domBuilder; + domHandler = (ContentHandler) domBuilder; } LinkContentHandler linkContentHandler = new LinkContentHandler(); ParseContext context = new ParseContext(); - TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler); - + TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, + linkContentHandler); + if (HTMLMapper != null) context.set(HtmlMapper.class, HTMLMapper); tikamd.set(Metadata.CONTENT_TYPE, mimeType); try { - parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context); + parser.parse(new ByteArrayInputStream(raw), + (ContentHandler) teeContentHandler, tikamd, context); } catch (Exception e) { LOG.error("Error parsing " + content.getUrl(), e); return new ParseStatus(ParseStatus.FAILED, e.getMessage()) @@ -186,16 +189,16 @@ public class TikaParser implements org.apache.nutch.parse.Parser { if (LOG.isTraceEnabled()) { LOG.trace("Getting links (base URL = {}) ...", baseTag); } - + // pre-1233 outlink extraction - //utils.getOutlinks(baseTag != null ? 
baseTag : base, l, root); + // utils.getOutlinks(baseTag != null ? baseTag : base, l, root); // Get outlinks from Tika List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks(); utils.getOutlinks(baseTag, l, tikaExtractedOutlinks); outlinks = l.toArray(new Outlink[l.size()]); if (LOG.isTraceEnabled()) { - LOG.trace("found " + outlinks.length + " outlinks in " - + content.getUrl()); + LOG.trace( + "found " + outlinks.length + " outlinks in " + content.getUrl()); } } @@ -251,7 +254,8 @@ public class TikaParser implements org.apache.nutch.parse.Parser { // see if a Tika config file can be found in the job file URL customTikaConfig = conf.getResource(customConfFile); if (customTikaConfig != null) - tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader()); + tikaConfig = new TikaConfig(customTikaConfig, + this.getClass().getClassLoader()); } catch (Exception e1) { String message = "Problem loading custom Tika configuration from " + customConfFile; @@ -277,20 +281,26 @@ public class TikaParser implements org.apache.nutch.parse.Parser { throw new RuntimeException("Class " + htmlmapperClassName + " does not implement HtmlMapper"); } - HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor().newInstance(); + HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor() + .newInstance(); } catch (Exception e) { - LOG.error("Can't generate instance for class " + htmlmapperClassName); - throw new RuntimeException("Can't generate instance for class " - + htmlmapperClassName); + String message = "Can't generate instance for class " + + htmlmapperClassName; + LOG.error(message); + throw new RuntimeException(message); } } - this.htmlParseFilters = new HtmlParseFilters(getConf()); - this.utils = new DOMContentUtils(conf); - this.cachingPolicy = getConf().get("parser.caching.forbidden.policy", + htmlParseFilters = new HtmlParseFilters(getConf()); + utils = new DOMContentUtils(conf); + cachingPolicy = getConf().get("parser.caching.forbidden.policy", 
Nutch.CACHING_FORBIDDEN_CONTENT); - this.upperCaseElementNames = getConf().getBoolean( - "tika.uppercase.element.names", true); + upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names", + true); + useBoilerpipe = getConf().get("tika.extractor", "none") + .equals("boilerpipe"); + boilerpipeExtractorName = getConf() + .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor"); } public Configuration getConf() {