Repository: any23
Updated Branches:
  refs/heads/master 6b1469152 -> f23c25cc2


ANY23-405 Parse microdata property values correctly


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/f23c25cc
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/f23c25cc
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/f23c25cc

Branch: refs/heads/master
Commit: f23c25cc23938aa27551426d38dd0139fd30b9f4
Parents: 6b14691
Author: Hans <[email protected]>
Authored: Wed Oct 24 10:35:10 2018 -0500
Committer: Hans <[email protected]>
Committed: Wed Oct 24 10:35:10 2018 -0500

----------------------------------------------------------------------
 .../extractor/microdata/ItemPropValue.java      | 27 ++++++
 .../extractor/microdata/MicrodataExtractor.java |  6 +-
 .../extractor/microdata/MicrodataParser.java    | 98 +++++++++++++++++---
 .../java/org/apache/any23/rdf/RDFUtils.java     | 10 +-
 ...crodata-nested-url-resolving-expected.nquads |  2 +-
 5 files changed, 120 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java
index b4710de..2b6659a 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java
@@ -25,6 +25,8 @@ import java.util.Date;
 import java.util.Objects;
 
 import org.apache.any23.util.StringUtils;
+import org.eclipse.rdf4j.model.Literal;
+import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
 
 /**
  * Describes a possible value for a <b>Microdata item property</b>.
@@ -97,6 +99,31 @@ public class ItemPropValue {
         this.content = type.checkClass(content);
     }
 
+    ItemPropValue(Literal literal) {
+        this.literal = literal;
+
+        Type type;
+        Object content;
+
+        //for backwards compatibility:
+        if (XMLSchema.DATE.equals(literal.getDatatype()) || XMLSchema.DATETIME.equals(literal.getDatatype())) {
+            try {
+                content = parseDateTime(literal.getLabel());
+                type = Type.Date;
+            } catch (Exception e) {
+                content = literal.getLabel();
+                type = Type.Plain;
+            }
+        } else {
+            content = literal.getLabel();
+            type = Type.Plain;
+        }
+        this.type = type;
+        this.content = content;
+    }
+
+    Literal literal;
+
     /**
      * @return the content object.
      */

http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
index 3b45dd4..d49f7ce 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java
@@ -33,6 +33,8 @@ import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.Resource;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
+import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.eclipse.rdf4j.model.vocabulary.RDF;
 import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
 import org.w3c.dom.Document;
@@ -477,7 +479,9 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
         Value value;
         Object propValue = itemProp.getValue().getContent();
         ItemPropValue.Type propType = itemProp.getValue().getType();
-        if (propType.equals(ItemPropValue.Type.Nested)) {
+        if (itemProp.getValue().literal != null) {
+            value = itemProp.getValue().literal;
+        } else if (propType.equals(ItemPropValue.Type.Nested)) {
             value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace);
         } else if (propType.equals(ItemPropValue.Type.Plain)) {
             value = RDFUtils.literal((String) propValue, documentLanguage);

http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
index f305620..970c31b 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
@@ -17,7 +17,11 @@
 package org.apache.any23.extractor.microdata;
 
 import org.apache.any23.extractor.html.DomUtils;
+import org.apache.any23.rdf.RDFUtils;
 import org.apache.commons.lang.StringUtils;
+import org.eclipse.rdf4j.model.Literal;
+import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
+import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
@@ -315,8 +319,51 @@ public class MicrodataParser {
             return itemPropValue;
 
         final String nodeName = node.getNodeName().toLowerCase();
+
+        //see http://w3c.github.io/microdata-rdf/#dfn-property-values
+        if ("data".equals(nodeName) || "meter".equals(nodeName)) {
+            String value = StringUtils.stripToEmpty(readContentAttribute(node, "value"));
+            Literal l;
+            if (XMLDatatypeUtil.isValidInteger(value)) {
+                l = RDFUtils.literal(value, XMLSchema.INTEGER);
+            } else if (XMLDatatypeUtil.isValidDouble(value)) {
+                l = RDFUtils.literal(value, XMLSchema.DOUBLE);
+            } else {
+                l = RDFUtils.literal(value);
+            }
+            return new ItemPropValue(l);
+        }
+        if( "time".equals(nodeName) ) {
+            String dateTimeStr = StringUtils.stripToEmpty(readContentAttribute(node, "datetime"));
+            Literal l;
+            if (XMLDatatypeUtil.isValidDate(dateTimeStr)) {
+                l = RDFUtils.literal(dateTimeStr, XMLSchema.DATE);
+            } else if (XMLDatatypeUtil.isValidTime(dateTimeStr)) {
+                l = RDFUtils.literal(dateTimeStr, XMLSchema.TIME);
+            } else if (XMLDatatypeUtil.isValidDateTime(dateTimeStr)) {
+                l = RDFUtils.literal(dateTimeStr, XMLSchema.DATETIME);
+            } else if (XMLDatatypeUtil.isValidGYearMonth(dateTimeStr)) {
+                l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEARMONTH);
+            } else if (XMLDatatypeUtil.isValidGYear(dateTimeStr)) {
+                l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEAR);
+            } else if (XMLDatatypeUtil.isValidDuration(dateTimeStr)) {
+                l = RDFUtils.literal(dateTimeStr, XMLSchema.DURATION);
+            } else {
+                String lang = getLanguage(node);
+                if (lang != null) {
+                    l = RDFUtils.literal(dateTimeStr, lang);
+                } else {
+                    l = RDFUtils.literal(dateTimeStr);
+                }
+            }
+            return new ItemPropValue(l);
+        }
+
         if (DomUtils.hasAttribute(node, "content")) {
-            return new ItemPropValue(DomUtils.readAttribute(node, "content"), ItemPropValue.Type.Plain);
+            String val = DomUtils.readAttribute(node, "content");
+            String lang = getLanguage(node);
+            Literal l = lang == null ? RDFUtils.literal(val) : RDFUtils.literal(val, lang);
+            return new ItemPropValue(l);
         }
 
         if( SRC_TAGS.contains(nodeName) ) {
@@ -329,29 +376,50 @@ public class MicrodataParser {
         if( "object".equals(nodeName) ) {
             return new ItemPropValue( DomUtils.readAttribute(node, "data"), ItemPropValue.Type.Link);
         }
-        if( "time".equals(nodeName) ) {
-            final String dateTimeStr = DomUtils.readAttribute(node, "datetime");
-            final Date dateTime;
-            try {
-                dateTime = ItemPropValue.parseDateTime(dateTimeStr);
-            } catch (ParseException pe) {
-                throw new MicrodataParserException(
-                        String.format("Invalid format for datetime '%s'", dateTimeStr),
-                        node
-                );
-            }
-            return new ItemPropValue(dateTime, ItemPropValue.Type.Date);
-        }
 
         if( isItemScope(node) ) {
             return new ItemPropValue( getItemScope(node), ItemPropValue.Type.Nested);
         }
 
-        final ItemPropValue newItemPropValue = new ItemPropValue( node.getTextContent(), ItemPropValue.Type.Plain);
+        String lang = getLanguage(node);
+        Literal l = lang == null ? RDFUtils.literal(node.getTextContent()) : RDFUtils.literal(node.getTextContent(), lang);
+        final ItemPropValue newItemPropValue = new ItemPropValue(l);
         itemPropValues.put(node, newItemPropValue);
         return newItemPropValue;
     }
 
+    private static String readContentAttribute(Node node, String attrName) {
+        NamedNodeMap attributes = node.getAttributes();
+        if (attributes != null) {
+            Node attr = attributes.getNamedItem("content");
+            if (attr != null) {
+                return attr.getNodeValue();
+            }
+            attr = attributes.getNamedItem(attrName);
+            if (attr != null) {
+                return attr.getNodeValue();
+            }
+        }
+        return node.getTextContent();
+    }
+
+    //see https://www.w3.org/TR/html52/dom.html#the-lang-and-xmllang-attributes
+    private static String getLanguage(Node node) {
+        String lang;
+        do {
+            lang = DomUtils.readAttribute(node, "xml:lang", null);
+            if (StringUtils.isNotBlank(lang)) {
+                return lang.trim();
+            }
+            lang = DomUtils.readAttribute(node, "lang", null);
+            if (StringUtils.isNotBlank(lang)) {
+                return lang.trim();
+            }
+            node = node.getParentNode();
+        } while (node != null);
+        return null;
+    }
+
     /**
      * Returns all the <b>itemprop</b>s for the given <b>itemscope</b> node.
      *

http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
index 44a98e0..552d61f 100644
--- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
+++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java
@@ -274,9 +274,8 @@ public class RDFUtils {
 
     /**
      * Creates a {@link Literal}.
-     * @param s string representation of the base namespace for the
-     * {@link org.eclipse.rdf4j.model.Literal}
-     * @param l the local name to associate with the namespace.
+     * @param s the literal's label
+     * @param l the literal's language
      * @return valid {@link org.eclipse.rdf4j.model.Literal}
      */
     public static Literal literal(String s, String l) {
@@ -290,9 +289,8 @@ public class RDFUtils {
 
     /**
      * Creates a {@link Literal}.
-     * @param s string representation of the base namespace for the
-     * {@link org.eclipse.rdf4j.model.Literal}
-     * @param datatype the datatype to associate with the namespace.
+     * @param s the literal's label
+     * @param datatype the literal's datatype
      * @return valid {@link org.eclipse.rdf4j.model.Literal}
      */
     public static Literal literal(String s, org.eclipse.rdf4j.model.IRI datatype) {

http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads
index 0eb4bcf..0cff257 100644
--- a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads
+++ b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads
@@ -17,7 +17,7 @@
 
 _:node1causocqkx2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/BlogPosting> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
 _:node1causocqkx2 <http://schema.org/alternativeHeadline> "Solution-based problem-solving restricts the result before the start."@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
-_:node1causocqkx2 <http://schema.org/datePublished> "2013-07-30"^^<http://www.w3.org/2001/XMLSchema#date> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
+_:node1causocqkx2 <http://schema.org/datePublished> "2013-07-30T20:30:00+02:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
 _:node1causocqkx3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
 _:node1causocqkx3 <http://schema.org/givenName> "Ruben"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html>.
 _:node1causocqkx3 <http://schema.org/familyName> "Verborgh"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .

Reply via email to