Repository: any23
Updated Branches:
  refs/heads/master f87ac66bc -> 2175c2d37
ANY23-240 insert newlines where advisable in microdata
Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/837d1935
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/837d1935
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/837d1935
Branch: refs/heads/master
Commit: 837d1935baa8bbc487bc806c063627cd04f4c134
Parents: f87ac66
Author: Hans <[email protected]>
Authored: Mon Oct 29 20:22:02 2018 -0500
Committer: Hans <[email protected]>
Committed: Mon Oct 29 20:22:02 2018 -0500
----------------------------------------------------------------------
 .../extractor/microdata/MicrodataParser.java | 55 +++++++++++++++++++-
 .../schemaorg-example-2-expected.nquads | 2 +-
 2 files changed, 55 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/any23/blob/837d1935/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
index 8964b32..013a318 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
@@ -23,10 +23,12 @@ import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
 import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
+import org.jsoup.parser.Tag;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
 import org.w3c.dom.traversal.DocumentTraversal;
 import org.w3c.dom.traversal.NodeFilter;
 import org.w3c.dom.traversal.TreeWalker;
@@ -39,6 +41,7 @@ import java.util.HashMap;
import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -417,12 +420,62 @@ public class MicrodataParser { } String lang = getLanguage(node); - Literal l = lang == null ? RDFUtils.literal(node.getTextContent()) : RDFUtils.literal(node.getTextContent(), lang); + StringBuilder content = new StringBuilder(); + appendFormatted(node, content, false); + Literal l = RDFUtils.literal(content.toString(), lang); final ItemPropValue newItemPropValue = new ItemPropValue(l); itemPropValues.put(node, newItemPropValue); return newItemPropValue; } + private static boolean shouldSeparateWithNewline(CharSequence s0, CharSequence s1) { + for (int i = 0, len = s1.length(); i < len; i++) { + char ch = s1.charAt(i); + if (ch == '\n' || ch == '\r') { + return false; + } + if (!Character.isWhitespace(ch)) { + break; + } + } + for (int i = s0.length() - 1; i >= 0; i--) { + char ch = s0.charAt(i); + if (ch == '\n' || ch == '\r') { + return false; + } + if (!Character.isWhitespace(ch)) { + return true; + } + } + return false; + } + + private static boolean appendFormatted(Node node, StringBuilder sb, boolean needsNewline) { + switch (node.getNodeType()) { + case Node.TEXT_NODE: + String text = node.getTextContent(); + if (text.isEmpty()) { + return needsNewline; + } + if (needsNewline && shouldSeparateWithNewline(sb, text)) { + sb.append('\n'); + } + sb.append(text); + return false; + case Node.ELEMENT_NODE: + final String nodeName = node.getNodeName().toLowerCase(Locale.ENGLISH); + final boolean thisNeedsNewline = "br".equals(nodeName) || Tag.valueOf(nodeName).isBlock(); + final NodeList children = node.getChildNodes(); + boolean prevChildNeedsNewline = needsNewline || thisNeedsNewline; + for (int i = 0, len = children.getLength(); i < len; i++) { + prevChildNeedsNewline = appendFormatted(children.item(i), sb, prevChildNeedsNewline); + } + return 
prevChildNeedsNewline || thisNeedsNewline; + default: + return needsNewline; + } + } + private static String readContentAttribute(Node node, String attrName) { NamedNodeMap attributes = node.getAttributes(); if (attributes != null) { http://git-wip-us.apache.org/repos/asf/any23/blob/837d1935/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads index cc86cf9..2258212 100644 --- a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads +++ b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads @@ -17,7 +17,7 @@ _:node8b30931f1dde708283dc52546c5572a6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Product> <http://bob.example.com/> . _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/price> "$55,000.00" <http://bob.example.com/> . -_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark Slate Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> . +_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 Dodge Challenger SRT8 Limited Edition\nBright Silver Metallic with Dark Slate Gray Leather Interior\n6.1 Liter (370 CI) V8 SRT HEMI Engine\n6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> . _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/name> "2010 Dodge Challenger SRT8" <http://bob.example.com/> . 
_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> <http://bob.example.com/microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> . _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/url> <http://vheminc.com/> <http://bob.example.com/> .
