This is an automated email from the ASF dual-hosted git repository. lewismc pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push: new 847e19d NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6 (#717) 847e19d is described below commit 847e19d984503d333fd8fdd430fe347dd370dc4c Author: Lewis John McGibbney <lewis.mcgibb...@gmail.com> AuthorDate: Sat Jan 15 15:24:21 2022 -0800 NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6 (#717) * NUTCH-2919 Upgrade to Tika 2.2.1 and Any23 2.6 --- ivy/ivy.xml | 2 +- src/plugin/any23/ivy.xml | 2 +- src/plugin/any23/plugin.xml | 272 ++++++++++----------- .../apache/nutch/any23/Any23IndexingFilter.java | 2 +- .../org/apache/nutch/any23/Any23ParseFilter.java | 35 +-- src/plugin/build-plugin.xml | 3 +- src/plugin/language-identifier/ivy.xml | 2 +- src/plugin/language-identifier/plugin.xml | 6 +- src/plugin/parse-tika/howto_upgrade_tika.txt | 5 +- src/plugin/parse-tika/ivy.xml | 2 +- src/plugin/parse-tika/plugin.xml | 70 +++--- .../org/apache/nutch/parse/tika/TestRTFParser.java | 4 +- 12 files changed, 192 insertions(+), 213 deletions(-) diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 8d154bf..34e298f 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -63,7 +63,7 @@ <dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="3.1.3" conf="*->default" /> <!-- End of Hadoop Dependencies --> - <dependency org="org.apache.tika" name="tika-core" rev="2.1.0" /> + <dependency org="org.apache.tika" name="tika-core" rev="2.2.1" /> <dependency org="xml-apis" name="xml-apis" rev="1.4.01" /><!-- force this version as it is required by Tika --> <dependency org="xerces" name="xercesImpl" rev="2.12.1" /> diff --git a/src/plugin/any23/ivy.xml b/src/plugin/any23/ivy.xml index a5a0077..7220a25 100644 --- a/src/plugin/any23/ivy.xml +++ b/src/plugin/any23/ivy.xml @@ -36,7 +36,7 @@ </publications> <dependencies> - <dependency org="org.apache.any23" name="apache-any23-core" rev="2.5" conf="*->default"> + <dependency org="org.apache.any23" name="apache-any23-core" rev="2.6" conf="*->default"> <exclude org="org.apache.commons" name="commons-lang" /> <exclude org="org.apache.commons" name="commons-compress" /> <exclude org="org.slf4j" name="slf4j-log4j12" /> diff --git a/src/plugin/any23/plugin.xml b/src/plugin/any23/plugin.xml index cc941b2..40a42c7 100644 --- a/src/plugin/any23/plugin.xml +++ b/src/plugin/any23/plugin.xml @@ -26,194 +26,168 @@ <export name="*"/> </library> <!-- Begin Any23 dependencies --> - <library name="FastInfoset-1.2.16.jar"/> - <library name="HikariCP-java7-2.4.13.jar"/> <library name="SparseBitSet-1.2.jar"/> - <library name="apache-any23-api-2.5.jar"/> - <library name="apache-any23-core-2.5.jar"/> - <library name="apache-any23-csvutils-2.5.jar"/> - <library name="apache-any23-encoding-2.5.jar"/> - <library name="apache-any23-mime-2.5.jar"/> - <library name="apache-mime4j-core-0.8.3.jar"/> - <library name="apache-mime4j-dom-0.8.3.jar"/> - <library name="asm-7.3.1.jar"/> - <library name="bcmail-jdk15on-1.64.jar"/> - <library name="bcpkix-jdk15on-1.64.jar"/> - <library name="bcprov-jdk15on-1.64.jar"/> - <library name="biweekly-0.6.3.jar"/> + <library name="apache-any23-api-2.6.jar"/> + <library name="apache-any23-core-2.6.jar"/> + <library name="apache-any23-csvutils-2.6.jar"/> + <library name="apache-any23-encoding-2.6.jar"/> + <library name="apache-any23-mime-2.6.jar"/> + <library name="apache-mime4j-core-0.8.4.jar"/> + <library name="apache-mime4j-dom-0.8.4.jar"/> + <library name="asm-9.2.jar"/> + <library name="bcmail-jdk15on-1.70.jar"/> + <library name="bcpkix-jdk15on-1.70.jar"/> + <library name="bcutil-jdk15on-1.70.jar"/> + <library name="biweekly-0.6.6.jar"/> <library name="boilerpipe-1.1.0.jar"/> - <library name="bzip2-0.9.1.jar"/> - <library name="c3p0-0.9.5.5.jar"/> - <library name="caffeine-2.6.1.jar"/> - <library name="cdm-4.5.5.jar"/> - <library name="checker-qual-2.10.0.jar"/> - <library name="commons-codec-1.14.jar"/> + <library name="caffeine-2.8.1.jar"/> + <library name="checker-qual-3.1.0.jar"/> + <library name="commons-codec-1.15.jar"/> <library name="commons-collections4-4.4.jar"/> - <library name="commons-csv-1.8.jar"/> + <library name="commons-csv-1.9.0.jar"/> <library name="commons-exec-1.3.jar"/> - <library name="commons-io-2.6.jar"/> - <library name="commons-lang3-3.10.jar"/> + <library name="commons-io-2.11.0.jar"/> + <library name="commons-lang3-3.12.0.jar"/> <library name="commons-logging-1.2.jar"/> <library name="commons-math3-3.6.1.jar"/> <library name="commons-rdf-api-0.5.0.jar"/> + <library name="commons-text-1.9.jar"/> <library name="curvesapi-1.06.jar"/> - <library name="cxf-core-3.3.5.jar"/> - <library name="cxf-rt-frontend-jaxrs-3.3.5.jar"/> - <library name="cxf-rt-rs-client-3.3.5.jar"/> - <library name="cxf-rt-security-3.3.5.jar"/> - <library name="cxf-rt-transports-http-3.3.5.jar"/> + <library name="dd-plist-1.23.jar"/> <library name="dec-0.1.2.jar"/> - <library name="ehcache-core-2.6.2.jar"/> <library name="error_prone_annotations-2.3.4.jar"/> <library name="f8-1.1.jar"/> <library name="failureaccess-1.0.1.jar"/> - <library name="fluent-hc-4.5.10.jar"/> - <library name="fontbox-2.0.19.jar"/> - <library name="geoapi-3.0.1.jar"/> - <library name="geronimo-jta_1.1_spec-1.1.1.jar"/> - <library name="geronimo-ws-metadata_2.0_spec-1.1.3.jar"/> - <library name="grib-4.5.5.jar"/> - <library name="gson-2.8.6.jar"/> - <library name="guava-28.2-jre.jar"/> + <library name="fluent-hc-4.5.13.jar"/> + <library name="fontbox-2.0.25.jar"/> + <library name="guava-30.1.1-jre.jar"/> <library name="hppcrt-0.7.5.jar"/> - <library name="httpclient-4.5.12.jar"/> - <library name="httpclient-cache-4.5.12.jar"/> - <library name="httpclient-osgi-4.5.12.jar"/> - <library name="httpcore-4.4.13.jar"/> - <library name="httpcore-nio-4.4.12.jar"/> - <library name="httpcore-osgi-4.4.13.jar"/> - <library name="httpmime-4.5.12.jar"/> - <library name="httpservices-4.5.5.jar"/> - <library name="isoparser-1.1.22.jar"/> - <library name="istack-commons-runtime-3.0.8.jar"/> - <library name="j2objc-annotations-1.3.jar"/> - <library name="jackcess-3.0.1.jar"/> - <library name="jackcess-encrypt-3.0.0.jar"/> - <library name="jackson-annotations-2.12.2.jar"/> - <library name="jackson-core-2.12.2.jar"/> - <library name="jackson-databind-2.12.2.jar"/> - <library name="jacorb-omgapi-3.9.jar"/> + <library name="httpclient-4.5.13.jar"/> + <library name="httpclient-cache-4.5.13.jar"/> + <library name="httpclient-osgi-4.5.13.jar"/> + <library name="httpcore-4.4.15.jar"/> + <library name="httpcore-nio-4.4.14.jar"/> + <library name="httpcore-osgi-4.4.14.jar"/> + <library name="httpmime-4.5.13.jar"/> + <library name="istack-commons-runtime-3.0.12.jar"/> + <library name="jackcess-4.0.1.jar"/> + <library name="jackcess-encrypt-4.0.1.jar"/> + <library name="jackson-annotations-2.11.4.jar"/> + <library name="jackson-core-2.12.1.jar"/> + <library name="jackson-databind-2.11.4.jar"/> <library name="jai-imageio-core-1.4.0.jar"/> - <library name="jakarta.activation-1.2.1.jar"/> - <library name="jakarta.activation-api-1.2.1.jar"/> - <library name="jakarta.ws.rs-api-2.1.5.jar"/> - <library name="jakarta.xml.bind-api-2.3.2.jar"/> + <library name="jakarta.activation-1.2.2.jar"/> + <library name="jakarta.xml.bind-api-2.3.3.jar"/> <library name="java-libpst-0.9.3.jar"/> - <library name="javax.activation-1.2.0.jar"/> <library name="javax.activation-api-1.2.0.jar"/> - <library name="javax.annotation-api-1.3.2.jar"/> <library name="javax.inject-1.jar"/> - <library name="javax.xml.soap-api-1.4.0.jar"/> <library name="jaxb-api-2.3.1.jar"/> - <library name="jaxb-runtime-2.3.2.jar"/> - <library name="jaxws-api-2.3.1.jar"/> + <library name="jaxb-runtime-2.3.5.jar"/> <library name="jbig2-imageio-3.0.3.jar"/> - <library name="jboss-rmi-api_1.0_spec-1.0.6.Final.jar"/> - <library name="jcip-annotations-1.0.jar"/> - <library name="jcl-over-slf4j-1.7.30.jar"/> - <library name="jcommander-1.78.jar"/> - <library name="jdom2-2.0.6.jar"/> + <library name="jcl-over-slf4j-1.7.32.jar"/> + <library name="jcommander-1.81.jar"/> + <library name="jdom2-2.0.6.1.jar"/> <library name="jempbox-1.8.16.jar"/> <library name="jhighlight-1.0.3.jar"/> <library name="jmatio-1.5.jar"/> - <library name="jna-5.5.0.jar"/> - <library name="joda-time-2.2.jar"/> - <library name="json-simple-1.1.1.jar"/> - <library name="jsonld-java-0.13.2.jar"/> - <library name="jsoup-1.13.1.jar"/> + <library name="jsonld-java-0.13.4.jar"/> + <library name="jsoup-1.14.3.jar"/> <library name="jsr305-3.0.2.jar"/> - <library name="jul-to-slf4j-1.7.30.jar"/> <library name="juniversalchardet-1.0.3.jar"/> - <library name="junrar-4.0.0.jar"/> + <library name="junrar-7.4.0.jar"/> <library name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/> + <library name="log4j-core-2.17.1.jar"/> + <library name="log4j-slf4j-impl-2.17.1.jar"/> <library name="mapdb-1.0.8.jar"/> - <library name="mchange-commons-java-0.2.19.jar"/> - <library name="metadata-extractor-2.13.0.jar"/> - <library name="mimepull-1.9.7.jar"/> - <library name="netcdf4-4.5.5.jar"/> - <library name="openjson-1.0.11.jar"/> - <library name="opennlp-tools-1.9.2.jar"/> - <library name="owlapi-api-5.1.13.jar"/> - <library name="owlapi-apibinding-5.1.13.jar"/> - <library name="owlapi-impl-5.1.13.jar"/> - <library name="owlapi-oboformat-5.1.13.jar"/> - <library name="owlapi-parsers-5.1.13.jar"/> - <library name="owlapi-rio-5.1.13.jar"/> - <library name="owlapi-tools-5.1.13.jar"/> - <library name="parso-2.0.11.jar"/> - <library name="pdfbox-2.0.19.jar"/> - <library name="pdfbox-tools-2.0.19.jar"/> + <library name="metadata-extractor-2.16.0.jar"/> + <library name="owlapi-api-5.1.19.jar"/> + <library name="owlapi-apibinding-5.1.19.jar"/> + <library name="owlapi-impl-5.1.19.jar"/> + <library name="owlapi-oboformat-5.1.19.jar"/> + <library name="owlapi-parsers-5.1.19.jar"/> + <library name="owlapi-rio-5.1.19.jar"/> + <library name="owlapi-tools-5.1.19.jar"/> + <library name="parso-2.0.14.jar"/> + <library name="pdfbox-2.0.25.jar"/> + <library name="pdfbox-debugger-2.0.25.jar"/> + <library name="pdfbox-tools-2.0.25.jar"/> <library name="poi-4.1.2.jar"/> <library name="poi-ooxml-4.1.2.jar"/> <library name="poi-ooxml-schemas-4.1.2.jar"/> <library name="poi-scratchpad-4.1.2.jar"/> - <library name="preflight-2.0.19.jar"/> - <library name="protobuf-java-3.11.4.jar"/> - <library name="quartz-2.3.2.jar"/> - <library name="rdf4j-http-client-3.1.2.jar"/> - <library name="rdf4j-http-protocol-3.1.2.jar"/> - <library name="rdf4j-model-3.1.2.jar"/> - <library name="rdf4j-query-3.1.2.jar"/> - <library name="rdf4j-queryalgebra-evaluation-3.1.2.jar"/> - <library name="rdf4j-queryalgebra-model-3.1.2.jar"/> - <library name="rdf4j-queryparser-api-3.1.2.jar"/> - <library name="rdf4j-queryparser-sparql-3.1.2.jar"/> - <library name="rdf4j-queryresultio-api-3.1.2.jar"/> - <library name="rdf4j-queryresultio-binary-3.1.2.jar"/> - <library name="rdf4j-queryresultio-sparqlxml-3.1.2.jar"/> - <library name="rdf4j-repository-api-3.1.2.jar"/> - <library name="rdf4j-repository-sail-3.1.2.jar"/> - <library name="rdf4j-repository-sparql-3.1.2.jar"/> - <library name="rdf4j-rio-api-3.1.2.jar"/> - <library name="rdf4j-rio-binary-3.1.2.jar"/> - <library name="rdf4j-rio-datatypes-3.1.2.jar"/> - <library name="rdf4j-rio-jsonld-3.1.2.jar"/> - <library name="rdf4j-rio-languages-3.1.2.jar"/> - <library name="rdf4j-rio-n3-3.1.2.jar"/> - <library name="rdf4j-rio-nquads-3.1.2.jar"/> - <library name="rdf4j-rio-ntriples-3.1.2.jar"/> - <library name="rdf4j-rio-rdfjson-3.1.2.jar"/> - <library name="rdf4j-rio-rdfxml-3.1.2.jar"/> - <library name="rdf4j-rio-trig-3.1.2.jar"/> - <library name="rdf4j-rio-trix-3.1.2.jar"/> - <library name="rdf4j-rio-turtle-3.1.2.jar"/> - <library name="rdf4j-sail-api-3.1.2.jar"/> - <library name="rdf4j-sail-base-3.1.2.jar"/> - <library name="rdf4j-sail-memory-3.1.2.jar"/> - <library name="rdf4j-util-3.1.2.jar"/> - <library name="rome-1.12.2.jar"/> - <library name="rome-utils-1.12.2.jar"/> - <library name="saaj-impl-1.4.0-b03.jar"/> + <library name="rdf4j-http-client-3.7.4.jar"/> + <library name="rdf4j-http-protocol-3.7.4.jar"/> + <library name="rdf4j-model-3.7.4.jar"/> + <library name="rdf4j-model-api-3.7.4.jar"/> + <library name="rdf4j-model-vocabulary-3.7.4.jar"/> + <library name="rdf4j-query-3.7.4.jar"/> + <library name="rdf4j-queryalgebra-evaluation-3.7.4.jar"/> + <library name="rdf4j-queryalgebra-model-3.7.4.jar"/> + <library name="rdf4j-queryparser-api-3.7.4.jar"/> + <library name="rdf4j-queryparser-sparql-3.7.4.jar"/> + <library name="rdf4j-queryresultio-api-3.7.4.jar"/> + <library name="rdf4j-queryresultio-binary-3.7.4.jar"/> + <library name="rdf4j-queryresultio-sparqlxml-3.7.4.jar"/> + <library name="rdf4j-repository-api-3.7.4.jar"/> + <library name="rdf4j-repository-sail-3.7.4.jar"/> + <library name="rdf4j-repository-sparql-3.7.4.jar"/> + <library name="rdf4j-rio-api-3.7.4.jar"/> + <library name="rdf4j-rio-binary-3.7.4.jar"/> + <library name="rdf4j-rio-datatypes-3.7.4.jar"/> + <library name="rdf4j-rio-hdt-3.4.3.jar"/> + <library name="rdf4j-rio-jsonld-3.7.4.jar"/> + <library name="rdf4j-rio-languages-3.7.4.jar"/> + <library name="rdf4j-rio-n3-3.7.4.jar"/> + <library name="rdf4j-rio-nquads-3.7.4.jar"/> + <library name="rdf4j-rio-ntriples-3.7.4.jar"/> + <library name="rdf4j-rio-rdfjson-3.7.4.jar"/> + <library name="rdf4j-rio-rdfxml-3.7.4.jar"/> + <library name="rdf4j-rio-trig-3.7.4.jar"/> + <library name="rdf4j-rio-trix-3.7.4.jar"/> + <library name="rdf4j-rio-turtle-3.7.4.jar"/> + <library name="rdf4j-sail-api-3.7.4.jar"/> + <library name="rdf4j-sail-base-3.7.4.jar"/> + <library name="rdf4j-sail-memory-3.7.4.jar"/> + <library name="rdf4j-util-3.7.4.jar"/> + <library name="rome-1.16.0.jar"/> + <library name="rome-utils-1.16.0.jar"/> <library name="semargl-core-0.7.jar"/> <library name="semargl-rdf-0.7.jar"/> <library name="semargl-rdf4j-0.7.jar"/> <library name="semargl-rdfa-0.7.jar"/> - <library name="sentiment-analysis-parser-0.1.jar"/> - <library name="sis-feature-1.0.jar"/> - <library name="sis-metadata-1.0.jar"/> - <library name="sis-netcdf-1.0.jar"/> - <library name="sis-referencing-1.0.jar"/> - <library name="sis-storage-1.0.jar"/> - <library name="sis-utility-1.0.jar"/> - <library name="snakeyaml-1.26.jar"/> - <library name="stax-ex-1.8.2.jar"/> - <library name="stax2-api-3.1.4.jar"/> + <library name="snakeyaml-1.30.jar"/> <library name="tagsoup-1.2.1.jar"/> - <library name="tika-core-1.24.jar"/> - <library name="tika-parsers-1.24.jar"/> - <library name="txw2-2.3.2.jar"/> - <library name="udunits-4.5.5.jar"/> - <library name="unit-api-1.0.jar"/> + <library name="tika-core-2.2.1.jar"/> + <library name="tika-parser-apple-module-2.2.1.jar"/> + <library name="tika-parser-audiovideo-module-2.2.1.jar"/> + <library name="tika-parser-cad-module-2.2.1.jar"/> + <library name="tika-parser-code-module-2.2.1.jar"/> + <library name="tika-parser-crypto-module-2.2.1.jar"/> + <library name="tika-parser-digest-commons-2.2.1.jar"/> + <library name="tika-parser-font-module-2.2.1.jar"/> + <library name="tika-parser-html-commons-2.2.1.jar"/> + <library name="tika-parser-html-module-2.2.1.jar"/> + <library name="tika-parser-image-module-2.2.1.jar"/> + <library name="tika-parser-mail-commons-2.2.1.jar"/> + <library name="tika-parser-mail-module-2.2.1.jar"/> + <library name="tika-parser-microsoft-module-2.2.1.jar"/> + <library name="tika-parser-miscoffice-module-2.2.1.jar"/> + <library name="tika-parser-news-module-2.2.1.jar"/> + <library name="tika-parser-ocr-module-2.2.1.jar"/> + <library name="tika-parser-pdf-module-2.2.1.jar"/> + <library name="tika-parser-pkg-module-2.2.1.jar"/> + <library name="tika-parser-text-module-2.2.1.jar"/> + <library name="tika-parser-xml-module-2.2.1.jar"/> + <library name="tika-parser-xmp-commons-2.2.1.jar"/> + <library name="tika-parser-zip-commons-2.2.1.jar"/> + <library name="tika-parsers-standard-package-2.2.1.jar"/> + <library name="txw2-2.3.5.jar"/> <library name="vinnie-2.0.2.jar"/> <library name="vorbis-java-core-0.8.jar"/> <library name="vorbis-java-tika-0.8.jar"/> - <library name="woodstox-core-5.0.3.jar"/> <library name="xmlbeans-3.1.0.jar"/> - <library name="xmlschema-core-2.2.5.jar"/> - <library name="xmpbox-2.0.19.jar"/> - <library name="xmpcore-6.1.10.jar"/> - <library name="xmpcore-shaded-6.1.10.jar"/> - <library name="xz-1.8.jar"/> + <library name="xmpcore-6.1.11.jar"/> + <library name="xz-1.9.jar"/> <!-- End Any23 dependencies --> </runtime> diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java index c0f1d6f..09dc32e 100644 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java +++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java @@ -106,7 +106,7 @@ public class Any23IndexingFilter implements IndexingFilter { return doc; } - private String keyToShortKey(String key) { + private static String keyToShortKey(String key) { if (key.startsWith("<") && key.endsWith(">")) { key = key.substring(1, key.length() - 1); } diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java index af7f135..bed659f 100644 --- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java +++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java @@ -20,13 +20,17 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.net.URISyntaxException; import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.Collections; import java.util.Set; import java.util.TreeSet; -import java.util.Collections; -import java.util.Arrays; import org.apache.any23.Any23; import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.filter.IgnoreAccidentalRDFa; +import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments; +import org.apache.any23.mime.TikaMIMETypeDetector; +import org.apache.any23.mime.purifier.WhiteSpacesPurifier; import org.apache.any23.writer.BenchmarkTripleHandler; import org.apache.any23.writer.NTriplesWriter; import org.apache.any23.writer.TripleHandler; @@ -76,7 +80,7 @@ public class Any23ParseFilter implements HtmlParseFilter { Set<String> triples = null; Any23Parser(String url, String htmlContent, String contentType, String... extractorNames) throws TripleHandlerException { - triples = new TreeSet<>(); + this.triples = new TreeSet<>(); try { parse(url, htmlContent, contentType, extractorNames); } catch (URISyntaxException e) { @@ -91,33 +95,32 @@ public class Any23ParseFilter implements HtmlParseFilter { * Maintains a {@link java.util.Set} containing the triples * @return a {@link java.util.Set} of triples. */ - private Set<String> getTriples() { - return triples; + Set<String> getTriples() { + return this.triples; } - private void parse(String url, String htmlContent, String contentType, String... extractorNames) throws URISyntaxException, IOException, TripleHandlerException { + private void parse(String url, String htmlContent, String contentType, String... extractorNames) + throws URISyntaxException, IOException, TripleHandlerException { Any23 any23 = new Any23(extractorNames); - any23.setMIMETypeDetector(null); + any23.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier())); ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try { - TripleHandler tHandler = new NTriplesWriter(baos); - BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler); + try (TripleHandler tHandler = new IgnoreTitlesOfEmptyDocuments( + new IgnoreAccidentalRDFa( + new NTriplesWriter(baos))); + BenchmarkTripleHandler bHandler = new BenchmarkTripleHandler(tHandler)) { try { any23.extract(htmlContent, url, contentType, "UTF-8", bHandler); } catch (IOException e) { LOG.error("Error while reading the source", e); } catch (ExtractionException e) { LOG.error("Error while extracting structured data", e); - } finally { - tHandler.close(); - bHandler.close(); } LOG.debug("Any23 BenchmarkTripleHandler.report: " + bHandler.report()); String n3 = baos.toString("UTF-8"); String[] triplesStrings = n3.split("\n"); - Collections.addAll(triples, triplesStrings); + Collections.addAll(this.triples, triplesStrings); } catch (IOException e) { LOG.error("Unexpected IOException", e); } @@ -139,8 +142,8 @@ public class Any23ParseFilter implements HtmlParseFilter { */ @Override public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { - String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta"); - String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml"); + String[] extractorNames = this.conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta"); + String[] supportedContentTypes = this.conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml"); String contentType = content.getContentType(); if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) { LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType); diff --git a/src/plugin/build-plugin.xml b/src/plugin/build-plugin.xml index 814123e..f6e87e8 100755 --- a/src/plugin/build-plugin.xml +++ b/src/plugin/build-plugin.xml @@ -80,7 +80,8 @@ </fileset> <!-- global test dependencies --> <fileset dir="${nutch.root}/build/test/lib"> - <include name="*.jar" /> + <include name="junit*.jar" /> + <include name="hamcrest*.jar" /> </fileset> <path refid="classpath"/> </path> diff --git a/src/plugin/language-identifier/ivy.xml b/src/plugin/language-identifier/ivy.xml index e7bf2e6..feba0bb 100644 --- a/src/plugin/language-identifier/ivy.xml +++ b/src/plugin/language-identifier/ivy.xml @@ -36,7 +36,7 @@ </publications> <dependencies> - <dependency org="org.apache.tika" name="tika-langdetect-optimaize" rev="2.1.0" conf="*->default"> + <dependency org="org.apache.tika" name="tika-langdetect-optimaize" rev="2.2.1" conf="*->default"> <!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) --> <exclude org="org.apache.tika" name="tika-core" /> <exclude org="com.google.guava" name="guava" /> diff --git a/src/plugin/language-identifier/plugin.xml b/src/plugin/language-identifier/plugin.xml index 85fa121..360b81a 100644 --- a/src/plugin/language-identifier/plugin.xml +++ b/src/plugin/language-identifier/plugin.xml @@ -27,15 +27,15 @@ </library> <!-- dependencies of Tika's Optimaize language detector (tika-langdetect-optimaize) --> <library name="annotations-12.0.jar"/> - <library name="checker-qual-3.8.0.jar"/> - <library name="error_prone_annotations-2.5.1.jar"/> + <library name="checker-qual-3.12.0.jar"/> + <library name="error_prone_annotations-2.7.1.jar"/> <library name="failureaccess-1.0.1.jar"/> <library name="j2objc-annotations-1.3.jar"/> <library name="jsonic-1.2.11.jar"/> <library name="jsr305-3.0.2.jar"/> <library name="language-detector-0.6.jar"/> <library name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/> - <library name="tika-langdetect-optimaize-2.1.0.jar"/> + <library name="tika-langdetect-optimaize-2.2.1.jar"/> </runtime> <requires> diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt index 697750e..cb3ed6b 100644 --- a/src/plugin/parse-tika/howto_upgrade_tika.txt +++ b/src/plugin/parse-tika/howto_upgrade_tika.txt @@ -23,8 +23,7 @@ (eventually with different versions) - duplicated libs can be added to the exclusions of transitive dependencies in build/plugins/parse-tika/ivy.xml - - but it should be made sure that the library versions in ivy/ivy.xml correspond to - those required by Tika + - but the library versions in ivy/ivy.xml MUST correspond to those required by Tika 5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/: @@ -34,6 +33,8 @@ $ cd ../language-identifier/ +It should be noted that Any23 also has a dependency on Tika so you may wish to check that there are no classpath conflicts in the any23 plugin as well. + 7. Build Nutch and run all unit tests: $ cd ../../../ diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml index 388b695..f0033b8 100644 --- a/src/plugin/parse-tika/ivy.xml +++ b/src/plugin/parse-tika/ivy.xml @@ -36,7 +36,7 @@ </publications> <dependencies> - <dependency org="org.apache.tika" name="tika-parsers-standard-package" rev="2.1.0" conf="*->default"> + <dependency org="org.apache.tika" name="tika-parsers-standard-package" rev="2.2.1" conf="*->default"> <!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) --> <exclude org="org.apache.tika" name="tika-core" /> <exclude org="org.apache.commons" name="commons-lang3" /> diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml index 275406a..1d4e2c7 100644 --- a/src/plugin/parse-tika/plugin.xml +++ b/src/plugin/parse-tika/plugin.xml @@ -26,13 +26,14 @@ <export name="*"/> </library> <!-- dependencies of Tika (tika-parsers) --> + <library name="SparseBitSet-1.2.jar"/> <library name="apache-mime4j-core-0.8.4.jar"/> <library name="apache-mime4j-dom-0.8.4.jar"/> <library name="asm-9.2.jar"/> - <library name="bcmail-jdk15on-1.69.jar"/> - <library name="bcpkix-jdk15on-1.69.jar"/> - <library name="bcprov-jdk15on-1.69.jar"/> - <library name="bcutil-jdk15on-1.69.jar"/> + <library name="bcmail-jdk15on-1.70.jar"/> + <library name="bcpkix-jdk15on-1.70.jar"/> + <library name="bcprov-jdk15on-1.70.jar"/> + <library name="bcutil-jdk15on-1.70.jar"/> <library name="boilerpipe-1.1.0.jar"/> <library name="commons-codec-1.15.jar"/> <library name="commons-compress-1.21.jar"/> @@ -42,7 +43,7 @@ <library name="curvesapi-1.06.jar"/> <library name="dd-plist-1.23.jar"/> <library name="dec-0.1.2.jar"/> - <library name="fontbox-2.0.24.jar"/> + <library name="fontbox-2.0.25.jar"/> <library name="istack-commons-runtime-3.0.12.jar"/> <library name="jackcess-4.0.1.jar"/> <library name="jackcess-encrypt-4.0.1.jar"/> @@ -50,10 +51,10 @@ <library name="jakarta.activation-1.2.2.jar"/> <library name="jakarta.xml.bind-api-2.3.3.jar"/> <library name="java-libpst-0.9.3.jar"/> - <library name="jaxb-runtime-2.3.4.jar"/> + <library name="jaxb-runtime-2.3.5.jar"/> <library name="jbig2-imageio-3.0.3.jar"/> <library name="jcl-over-slf4j-1.7.32.jar"/> - <library name="jdom2-2.0.6.jar"/> + <library name="jdom2-2.0.6.1.jar"/> <library name="jempbox-1.8.16.jar"/> <library name="jhighlight-1.0.3.jar"/> <library name="jmatio-1.5.jar"/> @@ -61,41 +62,40 @@ <library name="junrar-7.4.0.jar"/> <library name="metadata-extractor-2.16.0.jar"/> <library name="parso-2.0.14.jar"/> - <library name="pdfbox-2.0.24.jar"/> - <library name="pdfbox-debugger-2.0.24.jar"/> - <library name="pdfbox-tools-2.0.24.jar"/> + <library name="pdfbox-2.0.25.jar"/> + <library name="pdfbox-debugger-2.0.25.jar"/> + <library name="pdfbox-tools-2.0.25.jar"/> <library name="poi-4.1.2.jar"/> <library name="poi-ooxml-4.1.2.jar"/> <library name="poi-ooxml-schemas-4.1.2.jar"/> <library name="poi-scratchpad-4.1.2.jar"/> <library name="rome-1.16.0.jar"/> <library name="rome-utils-1.16.0.jar"/> - <library name="SparseBitSet-1.2.jar"/> <library name="tagsoup-1.2.1.jar"/> - <library name="tika-parser-apple-module-2.1.0.jar"/> - <library name="tika-parser-audiovideo-module-2.1.0.jar"/> - <library name="tika-parser-cad-module-2.1.0.jar"/> - <library name="tika-parser-code-module-2.1.0.jar"/> - <library name="tika-parser-crypto-module-2.1.0.jar"/> - <library name="tika-parser-digest-commons-2.1.0.jar"/> - <library name="tika-parser-font-module-2.1.0.jar"/> - <library name="tika-parser-html-commons-2.1.0.jar"/> - <library name="tika-parser-html-module-2.1.0.jar"/> - <library name="tika-parser-image-module-2.1.0.jar"/> - <library name="tika-parser-mail-commons-2.1.0.jar"/> - <library name="tika-parser-mail-module-2.1.0.jar"/> - <library name="tika-parser-microsoft-module-2.1.0.jar"/> - <library name="tika-parser-miscoffice-module-2.1.0.jar"/> - <library name="tika-parser-news-module-2.1.0.jar"/> - <library name="tika-parser-ocr-module-2.1.0.jar"/> - <library name="tika-parser-pdf-module-2.1.0.jar"/> - <library name="tika-parser-pkg-module-2.1.0.jar"/> - <library name="tika-parsers-standard-package-2.1.0.jar"/> - <library name="tika-parser-text-module-2.1.0.jar"/> - <library name="tika-parser-xml-module-2.1.0.jar"/> - <library name="tika-parser-xmp-commons-2.1.0.jar"/> - <library name="tika-parser-zip-commons-2.1.0.jar"/> - <library name="txw2-2.3.4.jar"/> + <library name="tika-parser-apple-module-2.2.1.jar"/> + <library name="tika-parser-audiovideo-module-2.2.1.jar"/> + <library name="tika-parser-cad-module-2.2.1.jar"/> + <library name="tika-parser-code-module-2.2.1.jar"/> + <library name="tika-parser-crypto-module-2.2.1.jar"/> + <library name="tika-parser-digest-commons-2.2.1.jar"/> + <library name="tika-parser-font-module-2.2.1.jar"/> + <library name="tika-parser-html-commons-2.2.1.jar"/> + <library name="tika-parser-html-module-2.2.1.jar"/> + <library name="tika-parser-image-module-2.2.1.jar"/> + <library name="tika-parser-mail-commons-2.2.1.jar"/> + <library name="tika-parser-mail-module-2.2.1.jar"/> + <library name="tika-parser-microsoft-module-2.2.1.jar"/> + <library name="tika-parser-miscoffice-module-2.2.1.jar"/> + <library name="tika-parser-news-module-2.2.1.jar"/> + <library name="tika-parser-ocr-module-2.2.1.jar"/> + <library name="tika-parser-pdf-module-2.2.1.jar"/> + <library name="tika-parser-pkg-module-2.2.1.jar"/> + <library name="tika-parser-text-module-2.2.1.jar"/> + <library name="tika-parser-xml-module-2.2.1.jar"/> + <library name="tika-parser-xmp-commons-2.2.1.jar"/> + <library name="tika-parser-zip-commons-2.2.1.jar"/> + <library name="tika-parsers-standard-package-2.2.1.jar"/> + <library name="txw2-2.3.5.jar"/> <library name="vorbis-java-core-0.8.jar"/> <library name="vorbis-java-tika-0.8.jar"/> <library name="xmlbeans-3.1.0.jar"/> diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java index 3dfe26d..de0c1c6 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java @@ -26,7 +26,7 @@ import org.apache.nutch.protocol.Content; import org.apache.nutch.protocol.Protocol; import org.apache.nutch.protocol.ProtocolException; import org.apache.nutch.protocol.ProtocolFactory; -import org.apache.tika.metadata.OfficeOpenXMLCore; +import org.apache.tika.metadata.DublinCore; import org.junit.Assert; import org.junit.Test; @@ -60,7 +60,7 @@ public class TestRTFParser extends TikaParserTest { Metadata meta = parse.getData().getParseMeta(); Assert.assertEquals("test rft document", title); - Assert.assertEquals("tests", meta.get(OfficeOpenXMLCore.SUBJECT.getName())); + Assert.assertEquals("tests", meta.get(DublinCore.SUBJECT.getName())); } }