This is an automated email from the ASF dual-hosted git repository.

hansbrende pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/any23.git


The following commit(s) were added to refs/heads/master by this push:
     new 6ee5d24  ANY23-428 resolve rdfa vocabs more intelligently
     new 67376fa  Merge pull request #158 from HansBrende/ANY23-428
6ee5d24 is described below

commit 6ee5d2485b28d3c92ac581620411259812b11c20
Author: Hans <[email protected]>
AuthorDate: Sun Mar 29 18:05:58 2020 -0500

    ANY23-428 resolve rdfa vocabs more intelligently
---
 .../apache/any23/extractor/rdfa/JsoupScanner.java  | 33 ++++++++++++++++++++++
 .../any23/extractor/rdfa/RDFa11ExtractorTest.java  |  8 ++++++
 .../html/rdfa/vocab-without-trailing-slash.html    | 23 +++++++++++++++
 3 files changed, 64 insertions(+)

diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/JsoupScanner.java b/core/src/main/java/org/apache/any23/extractor/rdfa/JsoupScanner.java
index 066f050..e7cac7d 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdfa/JsoupScanner.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdfa/JsoupScanner.java
@@ -17,6 +17,7 @@
 
 package org.apache.any23.extractor.rdfa;
 
+import org.apache.commons.lang3.ArrayUtils;
 import org.jsoup.nodes.CDataNode;
 import org.jsoup.nodes.Comment;
 import org.jsoup.nodes.Element;
@@ -49,6 +50,25 @@ class JsoupScanner implements NodeVisitor {
         return str == null ? "" : str;
     }
 
+    private static final String[] commonHashDelimitedVocabs = {
+            "http://creativecommons.org/ns",
+            "http://www.w3.org/2002/07/owl",
+            "http://www.w3.org/1999/02/22-rdf-syntax-ns",
+            "http://www.w3.org/ns/rdfa",
+            "http://www.w3.org/2000/01/rdf-schema",
+            "http://www.w3.org/1999/xhtml/vocab",
+            "http://www.w3.org/2001/XMLSchema",
+            "http://microformats.org/profile/hcard",
+            "http://www.w3.org/2006/vcard/ns",
+            "http://ogp.me/ns",
+            "http://ogp.me/ns/music",
+            "http://ogp.me/ns/video",
+            "http://ogp.me/ns/article",
+            "http://ogp.me/ns/book",
+            "http://ogp.me/ns/profile",
+            "http://ogp.me/ns/website"
+    };
+
     private void startElement(Element e) throws SAXException {
         ns.pushContext();
 
@@ -68,6 +88,19 @@ class JsoupScanner implements NodeVisitor {
                     handler.startPrefixMapping(localName, value);
                     continue;
                 }
+            } else if (name.equalsIgnoreCase("vocab")) {
+                // Fix for ANY23-428
+                name = "vocab";
+                value = value.trim();
+                int len = value.length();
+                char lastChar;
+                if (len != 0 && (lastChar = value.charAt(len - 1)) != '/' && lastChar != '#' && lastChar != ':') {
+                    if (ArrayUtils.contains(commonHashDelimitedVocabs, value)) {
+                        value += "#";
+                    } else {
+                        value += "/";
+                    }
+                }
             }
 
             remainingAttrs.add(name);
diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
index 35ae030..d24d186 100644
--- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java
@@ -228,6 +228,14 @@ public class RDFa11ExtractorTest extends AbstractRDFaExtractorTestCase {
         );
     }
 
+    @Test
+    public void testVocabWithoutTrailingSlash() {
+        // test for issue ANY23-428
+        assertExtract("/html/rdfa/vocab-without-trailing-slash.html");
+
+        assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/BreadcrumbList"));
+    }
+
     /**
      * Tests that the default parser settings enable tolerance in data type parsing.
      */
diff --git a/test-resources/src/test/resources/html/rdfa/vocab-without-trailing-slash.html b/test-resources/src/test/resources/html/rdfa/vocab-without-trailing-slash.html
new file mode 100644
index 0000000..382e453
--- /dev/null
+++ b/test-resources/src/test/resources/html/rdfa/vocab-without-trailing-slash.html
@@ -0,0 +1,23 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head></head>
+<body>
+<ol vocaB="http://schema.org" typeof="BreadcrumbList">
+</ol>
+</body>
+</html>
\ No newline at end of file

Reply via email to