Repository: any23
Updated Branches:
  refs/heads/master 33ce96c39 -> d283d70ce
ANY23-374 fix schemeless microdata urls

Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/d283d70c
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/d283d70c
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/d283d70c

Branch: refs/heads/master
Commit: d283d70ceb692cacb1f31659ee5d5c987822028f
Parents: 33ce96c
Author: Hans <[email protected]>
Authored: Tue Jul 31 12:21:26 2018 -0500
Committer: Hans <[email protected]>
Committed: Tue Jul 31 12:24:42 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/microdata/ItemScope.java    | 20 ++++++++++--
 .../microdata/MicrodataExtractorTest.java       |  9 ++++++
 .../microdata/microdata-missing-scheme.html     | 33 ++++++++++++++++++++
 3 files changed, 60 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
index 0ab0fee..2f079bb 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemScope.java
@@ -28,6 +28,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 /**
  * This class describes a <b>Microdata <i>itemscope</i></b>.
@@ -75,12 +76,27 @@ public class ItemScope extends Item {
         this(xpath, itemProps, id, refs, stringToUrl(type), itemId);
     }
 
+    private static final Pattern looksLikeStartsWithHost = Pattern.compile("[^:/.]+(\\.[^:/.]+)+(:\\d+)?([/#?].*)?");
+
     static URL stringToUrl(String type) {
         if (StringUtils.isNotBlank(type)) {
             try {
-                return new URL(ParsedIRI.create(type.trim()).toString());
+                ParsedIRI iri = ParsedIRI.create(type.trim());
+                if (StringUtils.isBlank(iri.getScheme())) {
+                    String host = iri.getHost();
+                    if (StringUtils.isNotBlank(host)) {
+                        iri = new ParsedIRI("http", iri.getUserInfo(), host, iri.getPort(), iri.getPath(), iri.getQuery(), iri.getFragment());
+                    } else {
+                        String path = iri.getPath();
+                        if (path != null && looksLikeStartsWithHost.matcher(path).matches()) {
+                            iri = ParsedIRI.create("http://" + iri.toString());
+                        }
+                    }
+                }
+
+                return new URL(iri.toString());
             } catch (MalformedURLException murle) {
-                throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL.");
+                throw new IllegalArgumentException("Invalid type '" + type + "', must be a valid URL. " + murle.getMessage());
             }
         } else {
             return null;

http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index f2e7852..280b3f7 100644
--- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -23,6 +23,7 @@ import org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
 import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.junit.Assert;
@@ -83,6 +84,14 @@ public class MicrodataExtractorTest extends AbstractExtractorTestCase {
         assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
     }
 
+    @Test
+    public void testMicrodataMissingScheme() {
+        assertExtract("/microdata/microdata-missing-scheme.html");
+        assertModelNotEmpty();
+        assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
+        System.out.println(dumpHumanReadableTriples());
+    }
+
     /**
      * Reference test as provided by <a href="http://googlewebmastercentral.blogspot.com/2010/03/microdata-support-for-rich-snippets.html">Google Rich Snippet for Microdata.</a>
      *

http://git-wip-us.apache.org/repos/asf/any23/blob/d283d70c/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/microdata-missing-scheme.html b/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
new file mode 100644
index 0000000..af8277f
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/microdata-missing-scheme.html
@@ -0,0 +1,33 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Missing Scheme</title>
+</head>
+<body>
+
+<div itemscope itemtype="http://schema.org/Question">
+    <h3 itemprop="name">Name</h3>
+    <div itemprop="acceptedAnswer" itemscope itemtype="schema.org/Answer">
+        <p itemprop="text">Text</p>
+    </div>
+</div>
+
+</body>
+</html>
\ No newline at end of file
