Repository: any23
Updated Branches:
  refs/heads/master 7cbd82e88 -> 6b1469152


ANY23-404 hardcode default microdata registry


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/6b146915
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/6b146915
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/6b146915

Branch: refs/heads/master
Commit: 6b1469152ccd30f93b0686a73bd1ba02955d6411
Parents: 7cbd82e
Author: Hans <[email protected]>
Authored: Tue Oct 23 19:37:37 2018 -0500
Committer: Hans <[email protected]>
Committed: Tue Oct 23 19:37:37 2018 -0500

----------------------------------------------------------------------
 .../extractor/microdata/MicrodataExtractor.java | 54 ++++++++++++--------
 .../microdata/MicrodataExtractorTest.java       | 21 ++++++++
 .../src/test/resources/microdata/example2.html  | 28 ++++++++++
 .../src/test/resources/microdata/example5.html  | 31 +++++++++++
 4 files changed, 113 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
 
b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
index 3663800..3b45dd4 100644
--- 
a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
+++ 
b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
@@ -64,8 +64,6 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
 
     private String documentLanguage;
 
-    private IRI defaultNamespace;
-
     @Override
     public ExtractorDescription getDescription() {
         return MicrodataExtractorFactory.getDescriptionInstance();
@@ -95,7 +93,10 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
             return;
         }
 
+        final IRI documentIRI = extractionContext.getDocumentIRI();
+
         boolean isStrict = 
extractionParameters.getFlag("any23.microdata.strict");
+        final IRI defaultNamespace;
         if (!isStrict) {
             defaultNamespace = 
RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default"));
             if (!defaultNamespace.getLocalName().isEmpty()) {
@@ -110,10 +111,9 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
         /**
          * 5.2.6
          */
-        final IRI documentIRI = extractionContext.getDocumentIRI();
         final Map<ItemScope, Resource> mappings = new HashMap<>();
         for (ItemScope itemScope : itemScopes) {
-            Resource subject = processType(itemScope, documentIRI, out, 
mappings);
+            Resource subject = processType(itemScope, documentIRI, out, 
mappings, defaultNamespace);
             out.writeTriple(
                     documentIRI,
                     MICRODATA_ITEM,
@@ -417,26 +417,31 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
     private Resource processType(
             ItemScope itemScope,
             IRI documentIRI, ExtractionResult out,
-            Map<ItemScope, Resource> mappings
+            Map<ItemScope, Resource> mappings, IRI defaultNamespace
     ) throws ExtractionException {
         Resource subject = mappings.computeIfAbsent(itemScope, scope -> 
createSubjectForItemId(scope.getItemId()));
 
         IRI itemScopeType = getType(itemScope);
         if (itemScopeType != null) {
             out.writeTriple(subject, RDF.TYPE, itemScopeType);
+            defaultNamespace = getNamespaceIRI(itemScopeType);
         }
         for (Map.Entry<String, List<ItemProp>> itemProps : 
itemScope.getProperties().entrySet()) {
             String propName = itemProps.getKey();
+            IRI predicate = getPredicate(defaultNamespace, propName);
+            if (predicate == null) {
+                continue;
+            }
             for (ItemProp itemProp : itemProps.getValue()) {
                 try {
                     processProperty(
                             subject,
-                            propName,
+                            predicate,
                             itemProp,
-                            itemScopeType,
                             documentIRI,
                             mappings,
-                            out
+                            out,
+                            defaultNamespace
                     );
                 } catch (URISyntaxException e) {
                     throw new ExtractionException(
@@ -461,40 +466,47 @@ public class MicrodataExtractor implements 
Extractor.TagSoupDOMExtractor {
 
     private void processProperty(
             Resource subject,
-            String propName,
+            IRI predicate,
             ItemProp itemProp,
-            IRI itemScopeType,
             IRI documentIRI,
             Map<ItemScope, Resource> mappings,
-            ExtractionResult out
+            ExtractionResult out,
+            IRI defaultNamespace
     ) throws URISyntaxException, ExtractionException {
 
-        IRI predicate = getPredicate(itemScopeType != null ? itemScopeType : 
defaultNamespace, propName);
-        if (predicate == null) {
-            return;
-        }
-
         Value value;
         Object propValue = itemProp.getValue().getContent();
         ItemPropValue.Type propType = itemProp.getValue().getType();
         if (propType.equals(ItemPropValue.Type.Nested)) {
-            value = processType((ItemScope) propValue, documentIRI, out, 
mappings);
+            value = processType((ItemScope) propValue, documentIRI, out, 
mappings, defaultNamespace);
         } else if (propType.equals(ItemPropValue.Type.Plain)) {
             value = RDFUtils.literal((String) propValue, documentLanguage);
         } else if (propType.equals(ItemPropValue.Type.Link)) {
             value = toAbsoluteIRI(documentIRI, (String)propValue);
+            //TODO: support registries so hardcoding not needed
+            if 
(predicate.stringValue().equals("http://schema.org/additionalType")) {
+                out.writeTriple(subject, RDF.TYPE, value);
+            }
         } else if (propType.equals(ItemPropValue.Type.Date)) {
             value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) 
propValue), XMLSchema.DATE);
         } else {
             throw new RuntimeException("Invalid Type '" +
-                    propType + "' for ItemPropValue with name: '" + propName + 
"'");
+                    propType + "' for ItemPropValue with name: '" + predicate 
+ "'");
         }
         out.writeTriple(subject, predicate, value);
     }
 
-    private static IRI getPredicate(IRI itemType, String localName) {
-        return toAbsoluteIRI(localName).orElseGet(() -> itemType == null ? 
null :
-                RDFUtils.iri(itemType.getNamespace(), localName.trim()));
+    private static final String hcardPrefix    = 
"http://microformats.org/profile/hcard";
+    private static final IRI hcardNamespaceIRI = 
RDFUtils.iri("http://microformats.org/profile/hcard#");
+
+    private static IRI getNamespaceIRI(IRI itemType) {
+        //TODO: support registries so hardcoding not needed
+        return itemType.stringValue().startsWith(hcardPrefix) ? 
hcardNamespaceIRI : itemType;
+    }
+
+    private static IRI getPredicate(IRI namespaceIRI, String localName) {
+        return toAbsoluteIRI(localName).orElseGet(() -> namespaceIRI == null ? 
null :
+                RDFUtils.iri(namespaceIRI.getNamespace(), localName.trim()));
     }
 
     private static Optional<IRI> toAbsoluteIRI(String urlString) {

http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
 
b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
index e858ea3..fedd5fa 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataExtractorTest.java
@@ -24,6 +24,7 @@ import 
org.apache.any23.extractor.html.AbstractExtractorTestCase;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
 import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Value;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -78,6 +79,26 @@ public class MicrodataExtractorTest extends 
AbstractExtractorTestCase {
     }
 
     @Test
+    public void testExample2() {
+        //Property URI generation for hcard
+        assertExtract("/microdata/example2.html");
+        assertContains(null, RDF.TYPE, 
RDFUtils.iri("http://microformats.org/profile/hcard"));
+        assertContains(null, 
RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value)null);
+        assertContains(null, 
RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value)null);
+    }
+
+    @Test
+    public void testExample5() {
+        //Vocabulary expansion for schema.org
+        assertExtract("/microdata/example5.html");
+        assertContains(null, RDF.TYPE, 
RDFUtils.iri("http://schema.org/Person"));
+        assertContains(null, RDF.TYPE, 
RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
+        assertContains(null, RDFUtils.iri("http://schema.org/additionalType"), 
RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
+        assertContains(null, RDFUtils.iri("http://schema.org/email"), 
RDFUtils.iri("mailto:[email protected]"));
+        assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), 
RDFUtils.iri("mailto:[email protected]"));
+    }
+
+    @Test
     public void testMicrodataBasic() {
         assertExtract("/microdata/microdata-basic.html");
         assertModelNotEmpty();

http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example2.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/example2.html 
b/test-resources/src/test/resources/microdata/example2.html
new file mode 100644
index 0000000..6ad5a33
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/example2.html
@@ -0,0 +1,28 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- source: http://w3c.github.io/microdata-rdf -->
+
+<html lang="en">
+<body>
+<span itemscope itemtype="http://microformats.org/profile/hcard">
+  <span itemprop="n" itemscope>
+    <span itemprop="given-name">Princeton</span>
+  </span>
+</span>
+</body>
+</html>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/6b146915/test-resources/src/test/resources/microdata/example5.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/example5.html 
b/test-resources/src/test/resources/microdata/example5.html
new file mode 100644
index 0000000..ba05051
--- /dev/null
+++ b/test-resources/src/test/resources/microdata/example5.html
@@ -0,0 +1,31 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- source: http://w3c.github.io/microdata-rdf -->
+
+<html lang="en">
+<head>
+</head>
+<body>
+<div itemscope itemtype="http://schema.org/Person">
+    <link itemprop="additionalType" href="http://xmlns.com/foaf/0.1/Person"/>
+    <a itemprop="email http://xmlns.com/foaf/0.1/mbox" 
href="mailto:[email protected]">
+        [email protected]
+    </a>
+</div>
+</body>
+</html>
\ No newline at end of file

Reply via email to