Repository: any23 Updated Branches: refs/heads/master 9f7ba688d -> ef7826df5
ANY23-389 fix html base elements for RDFa Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ef7826df Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ef7826df Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ef7826df Branch: refs/heads/master Commit: ef7826df5e4ff9a2d32d1b9105760760a0293581 Parents: 9f7ba68 Author: Hans <[email protected]> Authored: Fri Aug 17 13:56:40 2018 -0500 Committer: Hans <[email protected]> Committed: Fri Aug 17 13:56:40 2018 -0500 ---------------------------------------------------------------------- .../any23/extractor/rdf/BaseRDFExtractor.java | 34 ++++++++++++++++++-- .../rdfa/opengraph-structured-properties.html | 3 ++ 2 files changed, 34 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index e908d55..767f6ee 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -26,6 +26,7 @@ import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.Extractor; import org.apache.any23.extractor.IssueReport; import org.apache.any23.extractor.html.JsoupUtils; +import org.eclipse.rdf4j.common.net.ParsedIRI; import org.eclipse.rdf4j.rio.RDFFormat; import org.eclipse.rdf4j.rio.RDFParseException; import org.eclipse.rdf4j.rio.RDFParser; @@ -40,8 +41,6 @@ import org.jsoup.nodes.Entities; import org.jsoup.nodes.Node; import org.jsoup.select.NodeFilter; import org.jsoup.select.NodeTraversor; -import org.slf4j.Logger; -import 
org.slf4j.LoggerFactory; import java.io.ByteArrayInputStream; import java.io.IOException; @@ -57,10 +56,10 @@ import java.util.regex.Pattern; * {@link org.apache.any23.extractor.Extractor.ContentExtractor}. * * @author Michele Mostarda ([email protected]) + * @author Hans Brende ([email protected]) */ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { - private static final Logger LOG = LoggerFactory.getLogger(BaseRDFExtractor.class); private boolean verifyDataType; private boolean stopAtFirstError; @@ -176,6 +175,35 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { tagName = tagName.substring(tagName.lastIndexOf(':') + 1); ((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div"); + // fix for ANY23-389 + resolve_base: + if ("base".equalsIgnoreCase(tagName) && node.hasAttr("href")) { + String href = node.attr("href"); + String absHref; + try { + ParsedIRI parsedHref = ParsedIRI.create(href.trim()); + if (parsedHref.isAbsolute()) { + absHref = parsedHref.toString(); + } else { + parsedHref = ParsedIRI.create(iri.trim()).resolve(parsedHref); + if (parsedHref.isAbsolute()) { + absHref = parsedHref.toString(); + } else { + // shouldn't happen unless document IRI wasn't absolute + // ignore and let underlying RDFa parser report the issue + break resolve_base; + } + } + } catch (RuntimeException e) { + // can't parse href as a relative or absolute IRI: + // ignore and let underlying RDFa parser report the issue + break resolve_base; + } + if (!absHref.equals(href)) { + node.attr("href", absHref); + } + } + return FilterResult.CONTINUE; } return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html ---------------------------------------------------------------------- diff --git 
a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html index 365ddac..7d7dbc2 100644 --- a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html +++ b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html @@ -19,6 +19,9 @@ <!-- All of the content below is based on the OGP examples provided at http://ogp.me/, this ensures that the Any23 coverage is sufficiently up-to-date. --> + + <!-- use relative base href to make sure ANY23-389 is fixed --> + <base href=""> <!-- Begin Basic Metadata --> <title>The Rock (1996)</title>
