Repository: any23
Updated Branches:
  refs/heads/master 6b1469152 -> f23c25cc2
ANY23-405 Parse microdata property values correctly Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/f23c25cc Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/f23c25cc Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/f23c25cc Branch: refs/heads/master Commit: f23c25cc23938aa27551426d38dd0139fd30b9f4 Parents: 6b14691 Author: Hans <[email protected]> Authored: Wed Oct 24 10:35:10 2018 -0500 Committer: Hans <[email protected]> Committed: Wed Oct 24 10:35:10 2018 -0500 ---------------------------------------------------------------------- .../extractor/microdata/ItemPropValue.java | 27 ++++++ .../extractor/microdata/MicrodataExtractor.java | 6 +- .../extractor/microdata/MicrodataParser.java | 98 +++++++++++++++++--- .../java/org/apache/any23/rdf/RDFUtils.java | 10 +- ...crodata-nested-url-resolving-expected.nquads | 2 +- 5 files changed, 120 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java index b4710de..2b6659a 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/ItemPropValue.java @@ -25,6 +25,8 @@ import java.util.Date; import java.util.Objects; import org.apache.any23.util.StringUtils; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.vocabulary.XMLSchema; /** * Describes a possible value for a <b>Microdata item property</b>. 
@@ -97,6 +99,31 @@ public class ItemPropValue { this.content = type.checkClass(content); } + ItemPropValue(Literal literal) { + this.literal = literal; + + Type type; + Object content; + + //for backwards compatibility: + if (XMLSchema.DATE.equals(literal.getDatatype()) || XMLSchema.DATETIME.equals(literal.getDatatype())) { + try { + content = parseDateTime(literal.getLabel()); + type = Type.Date; + } catch (Exception e) { + content = literal.getLabel(); + type = Type.Plain; + } + } else { + content = literal.getLabel(); + type = Type.Plain; + } + this.type = type; + this.content = content; + } + + Literal literal; + /** * @return the content object. */ http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java index 3b45dd4..d49f7ce 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataExtractor.java @@ -33,6 +33,8 @@ import org.eclipse.rdf4j.model.Literal; import org.eclipse.rdf4j.model.Resource; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.model.vocabulary.RDF; import org.eclipse.rdf4j.model.vocabulary.XMLSchema; import org.w3c.dom.Document; @@ -477,7 +479,9 @@ public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { Value value; Object propValue = itemProp.getValue().getContent(); ItemPropValue.Type propType = itemProp.getValue().getType(); - if (propType.equals(ItemPropValue.Type.Nested)) { + if (itemProp.getValue().literal != null) { + value = 
itemProp.getValue().literal; + } else if (propType.equals(ItemPropValue.Type.Nested)) { value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace); } else if (propType.equals(ItemPropValue.Type.Plain)) { value = RDFUtils.literal((String) propValue, documentLanguage); http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java index f305620..970c31b 100644 --- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java +++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java @@ -17,7 +17,11 @@ package org.apache.any23.extractor.microdata; import org.apache.any23.extractor.html.DomUtils; +import org.apache.any23.rdf.RDFUtils; import org.apache.commons.lang.StringUtils; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil; +import org.eclipse.rdf4j.model.vocabulary.XMLSchema; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; @@ -315,8 +319,51 @@ public class MicrodataParser { return itemPropValue; final String nodeName = node.getNodeName().toLowerCase(); + + //see http://w3c.github.io/microdata-rdf/#dfn-property-values + if ("data".equals(nodeName) || "meter".equals(nodeName)) { + String value = StringUtils.stripToEmpty(readContentAttribute(node, "value")); + Literal l; + if (XMLDatatypeUtil.isValidInteger(value)) { + l = RDFUtils.literal(value, XMLSchema.INTEGER); + } else if (XMLDatatypeUtil.isValidDouble(value)) { + l = RDFUtils.literal(value, XMLSchema.DOUBLE); + } else { + l = RDFUtils.literal(value); + } + return new ItemPropValue(l); + } + if( "time".equals(nodeName) ) { + String dateTimeStr = 
StringUtils.stripToEmpty(readContentAttribute(node, "datetime")); + Literal l; + if (XMLDatatypeUtil.isValidDate(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.DATE); + } else if (XMLDatatypeUtil.isValidTime(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.TIME); + } else if (XMLDatatypeUtil.isValidDateTime(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.DATETIME); + } else if (XMLDatatypeUtil.isValidGYearMonth(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEARMONTH); + } else if (XMLDatatypeUtil.isValidGYear(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEAR); + } else if (XMLDatatypeUtil.isValidDuration(dateTimeStr)) { + l = RDFUtils.literal(dateTimeStr, XMLSchema.DURATION); + } else { + String lang = getLanguage(node); + if (lang != null) { + l = RDFUtils.literal(dateTimeStr, lang); + } else { + l = RDFUtils.literal(dateTimeStr); + } + } + return new ItemPropValue(l); + } + if (DomUtils.hasAttribute(node, "content")) { - return new ItemPropValue(DomUtils.readAttribute(node, "content"), ItemPropValue.Type.Plain); + String val = DomUtils.readAttribute(node, "content"); + String lang = getLanguage(node); + Literal l = lang == null ? 
RDFUtils.literal(val) : RDFUtils.literal(val, lang); + return new ItemPropValue(l); } if( SRC_TAGS.contains(nodeName) ) { @@ -329,29 +376,50 @@ public class MicrodataParser { if( "object".equals(nodeName) ) { return new ItemPropValue( DomUtils.readAttribute(node, "data"), ItemPropValue.Type.Link); } - if( "time".equals(nodeName) ) { - final String dateTimeStr = DomUtils.readAttribute(node, "datetime"); - final Date dateTime; - try { - dateTime = ItemPropValue.parseDateTime(dateTimeStr); - } catch (ParseException pe) { - throw new MicrodataParserException( - String.format("Invalid format for datetime '%s'", dateTimeStr), - node - ); - } - return new ItemPropValue(dateTime, ItemPropValue.Type.Date); - } if( isItemScope(node) ) { return new ItemPropValue( getItemScope(node), ItemPropValue.Type.Nested); } - final ItemPropValue newItemPropValue = new ItemPropValue( node.getTextContent(), ItemPropValue.Type.Plain); + String lang = getLanguage(node); + Literal l = lang == null ? RDFUtils.literal(node.getTextContent()) : RDFUtils.literal(node.getTextContent(), lang); + final ItemPropValue newItemPropValue = new ItemPropValue(l); itemPropValues.put(node, newItemPropValue); return newItemPropValue; } + private static String readContentAttribute(Node node, String attrName) { + NamedNodeMap attributes = node.getAttributes(); + if (attributes != null) { + Node attr = attributes.getNamedItem("content"); + if (attr != null) { + return attr.getNodeValue(); + } + attr = attributes.getNamedItem(attrName); + if (attr != null) { + return attr.getNodeValue(); + } + } + return node.getTextContent(); + } + + //see https://www.w3.org/TR/html52/dom.html#the-lang-and-xmllang-attributes + private static String getLanguage(Node node) { + String lang; + do { + lang = DomUtils.readAttribute(node, "xml:lang", null); + if (StringUtils.isNotBlank(lang)) { + return lang.trim(); + } + lang = DomUtils.readAttribute(node, "lang", null); + if (StringUtils.isNotBlank(lang)) { + return lang.trim(); + } + 
node = node.getParentNode(); + } while (node != null); + return null; + } + /** * Returns all the <b>itemprop</b>s for the given <b>itemscope</b> node. * http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/core/src/main/java/org/apache/any23/rdf/RDFUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java index 44a98e0..552d61f 100644 --- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java +++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java @@ -274,9 +274,8 @@ public class RDFUtils { /** * Creates a {@link Literal}. - * @param s string representation of the base namespace for the - * {@link org.eclipse.rdf4j.model.Literal} - * @param l the local name to associate with the namespace. + * @param s the literal's label + * @param l the literal's language * @return valid {@link org.eclipse.rdf4j.model.Literal} */ public static Literal literal(String s, String l) { @@ -290,9 +289,8 @@ public class RDFUtils { /** * Creates a {@link Literal}. - * @param s string representation of the base namespace for the - * {@link org.eclipse.rdf4j.model.Literal} - * @param datatype the datatype to associate with the namespace. 
+ * @param s the literal's label + * @param datatype the literal's datatype * @return valid {@link org.eclipse.rdf4j.model.Literal} */ public static Literal literal(String s, org.eclipse.rdf4j.model.IRI datatype) { http://git-wip-us.apache.org/repos/asf/any23/blob/f23c25cc/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads index 0eb4bcf..0cff257 100644 --- a/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads +++ b/test-resources/src/test/resources/microdata/microdata-nested-url-resolving-expected.nquads @@ -17,7 +17,7 @@ _:node1causocqkx2 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/BlogPosting> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . _:node1causocqkx2 <http://schema.org/alternativeHeadline> "Solution-based problem-solving restricts the result before the start."@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> . -_:node1causocqkx2 <http://schema.org/datePublished> "2013-07-30"^^<http://www.w3.org/2001/XMLSchema#date> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . +_:node1causocqkx2 <http://schema.org/datePublished> "2013-07-30T20:30:00+02:00"^^<http://www.w3.org/2001/XMLSchema#dateTime> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . _:node1causocqkx3 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Person> <https://ruben.verborgh.org/tmp/schemaorg-test.html> . _:node1causocqkx3 <http://schema.org/givenName> "Ruben"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html>. _:node1causocqkx3 <http://schema.org/familyName> "Verborgh"@en <https://ruben.verborgh.org/tmp/schemaorg-test.html> .
