Repository: any23
Updated Branches:
  refs/heads/master f87ac66bc -> 2175c2d37


ANY23-240 insert newlines where advisable in microdata


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/837d1935
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/837d1935
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/837d1935

Branch: refs/heads/master
Commit: 837d1935baa8bbc487bc806c063627cd04f4c134
Parents: f87ac66
Author: Hans <[email protected]>
Authored: Mon Oct 29 20:22:02 2018 -0500
Committer: Hans <[email protected]>
Committed: Mon Oct 29 20:22:02 2018 -0500

----------------------------------------------------------------------
 .../extractor/microdata/MicrodataParser.java    | 55 +++++++++++++++++++-
 .../schemaorg-example-2-expected.nquads         |  2 +-
 2 files changed, 55 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/837d1935/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
index 8964b32..013a318 100644
--- a/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/microdata/MicrodataParser.java
@@ -23,10 +23,12 @@ import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.Literal;
 import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
 import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
+import org.jsoup.parser.Tag;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
 import org.w3c.dom.traversal.DocumentTraversal;
 import org.w3c.dom.traversal.NodeFilter;
 import org.w3c.dom.traversal.TreeWalker;
@@ -39,6 +41,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
@@ -417,12 +420,62 @@ public class MicrodataParser {
         }
 
         String lang = getLanguage(node);
-        Literal l = lang == null ? RDFUtils.literal(node.getTextContent()) : RDFUtils.literal(node.getTextContent(), lang);
+        StringBuilder content = new StringBuilder();
+        appendFormatted(node, content, false);
+        Literal l = RDFUtils.literal(content.toString(), lang);
         final ItemPropValue newItemPropValue = new ItemPropValue(l);
         itemPropValues.put(node, newItemPropValue);
         return newItemPropValue;
     }
 
+    private static boolean shouldSeparateWithNewline(CharSequence s0, CharSequence s1) {
+        for (int i = 0, len = s1.length(); i < len; i++) {
+            char ch = s1.charAt(i);
+            if (ch == '\n' || ch == '\r') {
+                return false;
+            }
+            if (!Character.isWhitespace(ch)) {
+                break;
+            }
+        }
+        for (int i = s0.length() - 1; i >= 0; i--) {
+            char ch = s0.charAt(i);
+            if (ch == '\n' || ch == '\r') {
+                return false;
+            }
+            if (!Character.isWhitespace(ch)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private static boolean appendFormatted(Node node, StringBuilder sb, boolean needsNewline) {
+        switch (node.getNodeType()) {
+            case Node.TEXT_NODE:
+                String text = node.getTextContent();
+                if (text.isEmpty()) {
+                    return needsNewline;
+                }
+                if (needsNewline && shouldSeparateWithNewline(sb, text)) {
+                    sb.append('\n');
+                }
+                sb.append(text);
+                return false;
+            case Node.ELEMENT_NODE:
+                final String nodeName = node.getNodeName().toLowerCase(Locale.ENGLISH);
+                final boolean thisNeedsNewline = "br".equals(nodeName) || Tag.valueOf(nodeName).isBlock();
+                final NodeList children = node.getChildNodes();
+                boolean prevChildNeedsNewline = needsNewline || thisNeedsNewline;
+                for (int i = 0, len = children.getLength(); i < len; i++) {
+                    prevChildNeedsNewline = appendFormatted(children.item(i), sb, prevChildNeedsNewline);
+                }
+                return prevChildNeedsNewline || thisNeedsNewline;
+            default:
+                return needsNewline;
+        }
+    }
+
     private static String readContentAttribute(Node node, String attrName) {
         NamedNodeMap attributes = node.getAttributes();
         if (attributes != null) {

http://git-wip-us.apache.org/repos/asf/any23/blob/837d1935/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
index cc86cf9..2258212 100644
--- a/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
+++ b/test-resources/src/test/resources/microdata/schemaorg-example-2-expected.nquads
@@ -17,7 +17,7 @@
 
 _:node8b30931f1dde708283dc52546c5572a6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Product> <http://bob.example.com/> .
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/price> "$55,000.00" <http://bob.example.com/> .
-_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 Dodge Challenger SRT8 Limited EditionBright Silver Metallic with Dark Slate Gray Leather Interior6.1 Liter (370 CI) V8 SRT HEMI Engine6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> .
+_:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/description> "2010 Dodge Challenger SRT8 Limited Edition\nBright Silver Metallic with Dark Slate Gray Leather Interior\n6.1 Liter (370 CI) V8 SRT HEMI Engine\n6 Speed Manual Transmission with 3:92 Rear Axle Ratio (DEC, Track Pak)" <http://bob.example.com/> .
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/name> "2010 Dodge Challenger SRT8" <http://bob.example.com/> .
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/image> <http://bob.example.com/microdata/images/2010-dodge-challenger-srt8.jpg> <http://bob.example.com/> .
 _:node8b30931f1dde708283dc52546c5572a6 <http://schema.org/url> <http://vheminc.com/> <http://bob.example.com/> .

Reply via email to