Repository: any23
Updated Branches:
  refs/heads/master 9f7ba688d -> ef7826df5


ANY23-389 fix html base elements for RDFa


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/ef7826df
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/ef7826df
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/ef7826df

Branch: refs/heads/master
Commit: ef7826df5e4ff9a2d32d1b9105760760a0293581
Parents: 9f7ba68
Author: Hans <[email protected]>
Authored: Fri Aug 17 13:56:40 2018 -0500
Committer: Hans <[email protected]>
Committed: Fri Aug 17 13:56:40 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/rdf/BaseRDFExtractor.java   | 34 ++++++++++++++++++--
 .../rdfa/opengraph-structured-properties.html   |  3 ++
 2 files changed, 34 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index e908d55..767f6ee 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -26,6 +26,7 @@ import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.IssueReport;
 import org.apache.any23.extractor.html.JsoupUtils;
+import org.eclipse.rdf4j.common.net.ParsedIRI;
 import org.eclipse.rdf4j.rio.RDFFormat;
 import org.eclipse.rdf4j.rio.RDFParseException;
 import org.eclipse.rdf4j.rio.RDFParser;
@@ -40,8 +41,6 @@ import org.jsoup.nodes.Entities;
 import org.jsoup.nodes.Node;
 import org.jsoup.select.NodeFilter;
 import org.jsoup.select.NodeTraversor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
@@ -57,10 +56,10 @@ import java.util.regex.Pattern;
  * {@link org.apache.any23.extractor.Extractor.ContentExtractor}.
  *
  * @author Michele Mostarda ([email protected])
+ * @author Hans Brende ([email protected])
  */
 public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
 
-    private static final Logger LOG = 
LoggerFactory.getLogger(BaseRDFExtractor.class);
     private boolean verifyDataType;
     private boolean stopAtFirstError;
 
@@ -176,6 +175,35 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
                             tagName = 
tagName.substring(tagName.lastIndexOf(':') + 1);
                             
((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName 
: "div");
 
+                            // fix for ANY23-389
+                            resolve_base:
+                            if ("base".equalsIgnoreCase(tagName) && 
node.hasAttr("href")) {
+                                String href = node.attr("href");
+                                String absHref;
+                                try {
+                                    ParsedIRI parsedHref = 
ParsedIRI.create(href.trim());
+                                    if (parsedHref.isAbsolute()) {
+                                        absHref = parsedHref.toString();
+                                    } else {
+                                        parsedHref = 
ParsedIRI.create(iri.trim()).resolve(parsedHref);
+                                        if (parsedHref.isAbsolute()) {
+                                            absHref = parsedHref.toString();
+                                        } else {
+                                            // shouldn't happen unless document IRI wasn't absolute
+                                            // ignore and let underlying RDFa parser report the issue
+                                            break resolve_base;
+                                        }
+                                    }
+                                } catch (RuntimeException e) {
+                                    // can't parse href as a relative or absolute IRI:
+                                    // ignore and let underlying RDFa parser report the issue
+                                    break resolve_base;
+                                }
+                                if (!absHref.equals(href)) {
+                                    node.attr("href", absHref);
+                                }
+                            }
+
                             return FilterResult.CONTINUE;
                         }
                         return node instanceof DataNode || node instanceof 
Comment || node instanceof DocumentType

http://git-wip-us.apache.org/repos/asf/any23/blob/ef7826df/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
----------------------------------------------------------------------
diff --git 
a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
 
b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
index 365ddac..7d7dbc2 100644
--- 
a/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
+++ 
b/test-resources/src/test/resources/html/rdfa/opengraph-structured-properties.html
@@ -19,6 +19,9 @@
   <!--  All of the content below is based on the OGP examples provided at
   http://ogp.me/, this ensures that thw Any23 coverage is suffciently 
up-to-date.
    -->
+
+  <!-- use relative base href to make sure ANY23-389 is fixed -->
+  <base href="">
   
   <!-- Begin Basic Metadata -->
   <title>The Rock (1996)</title>

Reply via email to