Repository: any23 Updated Branches: refs/heads/master 258c19150 -> 3e5dce1dd
ANY23-169 Fixed url resolving errors in MicrodataExtractor Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/3e5dce1d Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/3e5dce1d Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/3e5dce1d Branch: refs/heads/master Commit: 3e5dce1dd9b043b6d6c9687f6f212b9b2ed2e573 Parents: 258c191 Author: Hans <firedrak...@gmail.com> Authored: Fri Apr 13 03:33:22 2018 -0500 Committer: Hans <firedrak...@gmail.com> Committed: Fri Apr 13 03:33:22 2018 -0500 ---------------------------------------------------------------------- .../extractor/microdata/MicrodataExtractor.java | 117 +++++++------------ .../html/AbstractExtractorTestCase.java | 2 +- .../microdata/MicrodataExtractorTest.java | 14 +++ pom.xml | 2 +- ...crodata-nested-url-resolving-expected.nquads | 30 +++++ .../microdata-nested-url-resolving.html | 35 ++++++ .../schemaorg-example-2-expected.nquads | 8 +- 7 files changed, 129 insertions(+), 79 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/3e5dce1d/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java index 513ffbb..3663800 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java @@ -28,6 +28,7 @@ import org.apache.any23.extractor.html.DomUtils; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.DCTerms; import org.apache.any23.vocab.XHTML; +import org.eclipse.rdf4j.common.net.ParsedIRI; import org.eclipse.rdf4j.model.Literal; import org.eclipse.rdf4j.model.Resource; import org.eclipse.rdf4j.model.IRI; @@ -39,8 +40,6 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.util.Date; @@ -48,7 +47,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Objects; +import java.util.Optional; import java.util.Set; /** @@ -241,26 +240,14 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { if (href == null) { return; } - URL absoluteURL; - if (!isAbsoluteURL(href.getTextContent())) { - try { - absoluteURL = toAbsoluteURL( - documentIRI.toString(), - href.getTextContent(), - '/' - ); - } catch (MalformedURLException e) { - // okay, it's not an absolute URL, return - return; - } - } else { - try { - absoluteURL = new URL(href.getTextContent()); - } catch (MalformedURLException e) { - // cannot happen - return; - } + IRI iri; + try { + iri = toAbsoluteIRI(documentIRI, href.getTextContent()); + } catch (URISyntaxException e) { + // cannot happen + return; } + String[] relTokens = rel.getTextContent().split(" "); Set<String> tokensWithNoDuplicates = new HashSet<>(); for (String relToken : relTokens) { @@ -275,16 +262,11 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { tokensWithNoDuplicates.add(relToken.toLowerCase()); } for (String token : tokensWithNoDuplicates) { - IRI predicate; - if (isAbsoluteURL(token)) { - predicate = RDFUtils.iri(token); - } else { - predicate = RDFUtils.iri(XHTML.NS + token); - } + IRI predicate = toAbsoluteIRI(token).orElseGet(() -> RDFUtils.iri(XHTML.NS + token.trim())); out.writeTriple( documentIRI, predicate, - RDFUtils.iri(absoluteURL.toString()) + iri ); } } @@ -304,9 +286,10 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { String name = DomUtils.readAttribute(meta, "name", null); String content = DomUtils.readAttribute(meta, "content", null); if (name != null && content != null) { - if (isAbsoluteURL(name)) { + Optional<IRI> nameIRI = toAbsoluteIRI(name); + if (nameIRI.isPresent()) { processMetaElement( - RDFUtils.iri(name), + nameIRI.get(), content, getLanguage(meta), documentIRI, @@ -385,7 +368,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { } out.writeTriple( documentIRI, - RDFUtils.iri(XHTML.NS + name.toLowerCase()), + RDFUtils.iri(XHTML.NS + name.toLowerCase().trim()), subject ); } @@ -455,7 +438,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { mappings, out ); - } catch (MalformedURLException e) { + } catch (URISyntaxException e) { throw new ExtractionException( "Error while processing on subject '" + subject + "' the itemProp: '" + itemProp + "' " @@ -472,17 +455,8 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { } private static Resource createSubjectForItemId(String itemId) { - if (itemId != null) { - try { - URI uri = new URI(itemId.trim()); - if (uri.isAbsolute()) { - return RDFUtils.iri(uri.toString()); - } - } catch (URISyntaxException e) { - //not an absolute uri - } - } - return RDFUtils.bnode(); + Optional<IRI> iri = toAbsoluteIRI(itemId); + return iri.isPresent() ? iri.get() : RDFUtils.bnode(); } private void processProperty( @@ -493,7 +467,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { IRI documentIRI, Map<ItemScope, Resource> mappings, ExtractionResult out - ) throws MalformedURLException, ExtractionException { + ) throws URISyntaxException, ExtractionException { IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : defaultNamespace, propName); if (predicate == null) { @@ -508,10 +482,7 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { } else if (propType.equals(ItemPropValue.Type.Plain)) { value = RDFUtils.literal((String) propValue, documentLanguage); } else if (propType.equals(ItemPropValue.Type.Link)) { - value = RDFUtils.iri(toAbsoluteURL( - documentIRI.toString(), - (String) propValue, - '/').toString()); + value = toAbsoluteIRI(documentIRI, (String)propValue); } else if (propType.equals(ItemPropValue.Type.Date)) { value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE); } else { @@ -522,37 +493,37 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { } private static IRI getPredicate(IRI itemType, String localName) { - if (isAbsoluteURL(localName)) { - return RDFUtils.iri(localName); - } else if (itemType != null) { - return RDFUtils.iri(itemType.getNamespace(), Objects.requireNonNull(localName)); - } else { - return null; + return toAbsoluteIRI(localName).orElseGet(() -> itemType == null ? null : + RDFUtils.iri(itemType.getNamespace(), localName.trim())); + } + + private static Optional<IRI> toAbsoluteIRI(String urlString) { + if (urlString != null) { + try { + ParsedIRI iri = ParsedIRI.create(urlString.trim()); + if (iri.isAbsolute()) { + return Optional.of(RDFUtils.iri(iri.toString())); + } + } catch (RuntimeException e) { + //not an absolute iri + } } + return Optional.empty(); } - private static boolean isAbsoluteURL(String urlString) { - boolean result = false; + private static IRI toAbsoluteIRI(IRI documentIRI, String part) throws URISyntaxException { + ParsedIRI iri; try { - URL url = new URL(urlString); - String protocol = url.getProtocol(); - if (protocol != null && protocol.trim().length() > 0) - result = true; - } catch (MalformedURLException e) { - return false; + iri = ParsedIRI.create(part.trim()); + } catch (RuntimeException e) { + throw new URISyntaxException(String.valueOf(part), e.getClass().getName() + ": " + e.getMessage()); } - return result; - } - private URL toAbsoluteURL(String ns, String part, char trailing) - throws MalformedURLException { - if (isAbsoluteURL(part)) { - return new URL(part); + if (iri.isAbsolute()) { + return RDFUtils.iri(iri.toString()); } - char lastChar = ns.charAt(ns.length() - 1); - if (lastChar == '#' || lastChar == '/') - return new URL(ns + part); - return new URL(ns + trailing + part); + + return RDFUtils.iri(new ParsedIRI(documentIRI.toString()).resolve(iri).toString()); } private void notifyError(MicrodataParserException[] errors, ExtractionResult out) { http://git-wip-us.apache.org/repos/asf/any23/blob/3e5dce1d/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java index 5354924..f04d59f 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java +++ b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java @@ -624,7 +624,7 @@ public abstract class AbstractExtractorTestCase extends AbstractAny23TestBase { * @param p * predicate * @return matching object. - * @throws org.openrdf.repository.RepositoryException + * @throws org.eclipse.rdf4j.repository.RepositoryException */ protected Value findObject(Resource s, IRI p) throws RepositoryException { RepositoryResult<Statement> statements = conn.getStatements(s, p, null, http://git-wip-us.apache.org/repos/asf/any23/blob/3e5dce1d/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java index 8161b36..1294c93 100644 --- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java @@ -22,6 +22,7 @@ import org.apache.any23.extractor.ExtractorFactory; import org.apache.any23.extractor.html.AbstractExtractorTestCase; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.SINDICE; +import org.eclipse.rdf4j.model.IRI; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.junit.Assert; @@ -172,6 +173,19 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase { logger.debug(dumpHumanReadableTriples()); } + @Test + public void testMicrodataNestedUrlResolving() throws IOException { + IRI oldBaseIRI = baseIRI; + try { + logger.info("\n"); + baseIRI = RDFUtils.iri("https://ruben.verborgh.org/tmp/schemaorg-test.html"); + extractAndVerifyAgainstNQuads("microdata-nested-url-resolving.html", + "microdata-nested-url-resolving-expected.nquads"); + } finally { + baseIRI = oldBaseIRI; + } + } + private void extractAndVerifyAgainstNQuads(String actual, String expected) throws RepositoryException, RDFHandlerException, IOException, RDFParseException { assertExtract("/microdata/" + actual); http://git-wip-us.apache.org/repos/asf/any23/blob/3e5dce1d/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 516ed64..392a3cd 100644 --- a/pom.xml +++ b/pom.xml @@ -274,7 +274,7 @@ <httpcore.version>4.4.6</httpcore.version> <owlapi.version>5.1.3</owlapi.version> <poi.version>3.16</poi.version> - <rdf4j.version>2.3.0</rdf4j.version> + <rdf4j.version>2.3.1</rdf4j.version> <semargl.version>0.7</semargl.version> <slf4j.logger.version>1.7.25</slf4j.logger.version> <tika.version>1.17</tika.version> http://git-wip-us.apache.org/repos/asf/any23/blob/3e5dce1d/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads new file mode 100644 index 0000000..0eb4bcf --- /dev/null +++ b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +_:node1causocqkx2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/BlogPosting> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx2 <http://schema.org/alternativeHeadline> "Solution-based problem-solving restricts the result before the start."@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx2 <http://schema.org/datePublished> "2013-07-30"^^<http://www.w3.org/2001/XMLSchema#date> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx3 <http://schema.org/givenName> "Ruben"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html>. +_:node1causocqkx3 <http://schema.org/familyName> "Verborgh"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx3 <http://schema.org/name> <https://ruben.verborgh.org/> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx3 <http://schema.org/url> <https://ruben.verborgh.org/> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx2 <http://schema.org/author> _:node1causocqkx3 <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx2 <http://schema.org/name> "One hammer for a thousand nails"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx2 <http://schema.org/headline> "One hammer for a thousand nails"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +<https://ruben.verborgh.org/tmp/schemaorg-test.html> <http://www.w3.org/1999/xhtml/microdata#item> _:node1causocqkx2 <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +<https://ruben.verborgh.org/tmp/schemaorg-test.html> <http://purl.org/dc/terms/title> "One hammer for a thousand nails | Ruben Verborgh"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> . http://git-wip-us.apache.org/repos/asf/any23/blob/3e5dce1d/test-resources/src/test/resources/microdata/microdata-nested-url-resolving.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving.html b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving.html new file mode 100644 index 0000000..ec3e677 --- /dev/null +++ b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving.html @@ -0,0 +1,35 @@ +<!DOCTYPE HTML> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- Original page source: https://ruben.verborgh.org/tmp/schemaorg-test.html --> +<html lang="en" prefix="rv: http://ruben.verborgh.org/# og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# article: http://ogp.me/ns/article#"> +<head> + <meta charset="utf-8"> + <title>One hammer for a thousand nails | Ruben Verborgh</title> + <meta property="dc:title" content="One hammer for a thousand nails"/> +</head> +<body> +<article class="blog" itemscope itemtype=http://schema.org/BlogPosting> + <h1 itemprop="name headline">One hammer for a thousand nails</h1> + <h2 itemprop="alternativeHeadline">Solution-based problem-solving restricts the result before the start.</h2> + <p class="signature"> + <span class="author" itemprop="author" itemscope itemtype=http://schema.org/Person><a itemprop="name url" href="/"><span itemprop="givenName">Ruben</span><span class="spacing"> </span><span itemprop="familyName">Verborgh</span></a></span><br> + <time itemprop="datePublished" datetime="2013-07-30T20:30:00+02:00">30 July 2013</time> + </p> +</article> +</body> +</html> http://git-wip-us.apache.org/repos/asf/any23/blob/3e5dce1d/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads index 504b6c8..8c6e70d 100644 --- a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads +++ b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads @@ -19,13 +19,13 @@ _:node8b30931f1dde708283dc52546c5572a6 <http://www.w3.org/1999/02/22-rdf-syntax- _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/price> "$55,000.00" <http://bob.example.com/> . _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark Slate Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> . _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/name> "2010 Dodge Challenger SRT8" <http://bob.example.com/> . -_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> <http://bob.example.com//microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> . +_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> <http://bob.example.com/microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> . _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/url> <http://vheminc.com/> <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/microdata#item> _:node8b30931f1dde708283dc52546c5572a6 <http://bob.example.com/> . <http://bob.example.com/> <http://purl.org/dc/terms/title> "HTML5 Microdata Example - http://schema.org/Product" <http://bob.example.com/> . -<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com//images/favicon1.ico> <http://bob.example.com/> . -<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#shortcut> <http://bob.example.com//images/favicon1.ico> <http://bob.example.com/> . -<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com//images/favicon1.gif> <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com/images/favicon1.ico> <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#shortcut> <http://bob.example.com/images/favicon1.ico> <http://bob.example.com/> . +<http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#icon> <http://bob.example.com/images/favicon1.gif> <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#robots> "noarchive" <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#description> "HTML5 Microdata Example for http://schema.org/Product" <http://bob.example.com/> . <http://bob.example.com/> <http://www.w3.org/1999/xhtml/vocab#author> "Edward Lewis" <http://bob.example.com/> . \ No newline at end of file