Repository: any23
Updated Branches:
  refs/heads/master 5d3d7159e -> 778d05ede


ANY23-328 Strip comments from json-ld content to make parsing more lenient


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/189bf260
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/189bf260
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/189bf260

Branch: refs/heads/master
Commit: 189bf260e74436860054469fde8192531cce6f14
Parents: 4131761
Author: Hans <firedrak...@gmail.com>
Authored: Sun Feb 11 12:11:32 2018 -0600
Committer: Hans <firedrak...@gmail.com>
Committed: Mon Feb 12 13:34:52 2018 -0600

----------------------------------------------------------------------
 .../extractor/html/EmbeddedJSONLDExtractor.java |   5 +-
 .../any23/extractor/rdf/BaseRDFExtractor.java   | 116 +++++++++++++++++++
 .../html/EmbeddedJSONLDExtractorTest.java       |   7 ++
 .../html/html-jsonld-strip-comments.html        |  51 ++++++++
 4 files changed, 177 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
index aeffdda..f220d0d 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
@@ -27,6 +27,7 @@ import org.apache.any23.extractor.rdf.JSONLDExtractor;
 import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.SINDICE;
+import org.apache.commons.io.IOUtils;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.w3c.dom.Document;
@@ -34,6 +35,7 @@ import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -145,8 +147,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
       for (int i = 0; i < attributes.getLength(); i++) {
         if ("application/ld+json".equalsIgnoreCase(attributes.item(i).getTextContent())) {
           extractor.run(extractionParameters, extractionContext,
-                  DomUtils.nodeToInputStream(jsonldNode
-                          .getFirstChild()), out);
+                  IOUtils.toInputStream(jsonldNode.getTextContent(), StandardCharsets.UTF_8), out);
         }
       }
       Node nameAttribute = attributes.getNamedItem("name");

http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index 8f89f21..e4d16e2 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -41,6 +41,7 @@ import org.slf4j.LoggerFactory;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.PushbackInputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
@@ -145,6 +146,8 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
                 }, doc);
 
                 in = new ByteArrayInputStream(doc.toString().getBytes(charset));
+            } else if (format.hasFileExtension("jsonld") || format.hasMIMEType("application/ld+json")) {
+                in = new JsonCommentStripperInputStream(in);
             }
 
             parser.parse(in, iri);
@@ -155,4 +158,117 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
         }
     }
 
+
+    private static class JsonCommentStripperInputStream extends InputStream {
+        // Byte filter over a wrapped stream: outside double-quoted strings it removes line, block and hash comments and CDATA markers.
+        private int prevChar; // last value handed out by read(); consulted by the backslash check inside strings
+        private boolean inQuote; // true from an opening double quote until the matching closing one
+        private boolean inCDATA; // true after a CDATA opening marker was consumed, until its end marker is consumed
+
+        private final PushbackInputStream wrapped; // source stream; its pushback buffer provides the lookahead
+
+        JsonCommentStripperInputStream(InputStream in) {
+            wrapped = new PushbackInputStream(in, 16); // 16-byte pushback; the longest lookahead in this class is 8 bytes
+        }
+        // Returns true and consumes the bytes when the upcoming bytes equal next; otherwise restores everything read and returns false.
+        private boolean isNextOrUnread(int... next) throws IOException {
+            int i = -1; // index of the last element of next that matched so far
+            for (int test : next) {
+                int c = wrapped.read();
+                if (c != test) {
+                    if (c != -1) { // end of stream cannot be pushed back
+                        wrapped.unread(c);
+                    }
+                    while (i >= 0) { // push the matched bytes back last-to-first so they are read again in original order
+                        wrapped.unread(next[i--]);
+                    }
+                    return false;
+                }
+                i++;
+            }
+            return true;
+        }
+        // Returns the next filtered byte (or -1) and records it in prevChar.
+        @Override
+        public int read() throws IOException {
+            return prevChar = privateRead();
+        }
+        // Filtering step: reads from the wrapped stream and decides what single value to emit for this call.
+        private int privateRead() throws IOException {
+            PushbackInputStream stream = wrapped;
+            int c = stream.read();
+
+            if (inQuote) {
+                if (c == '"' && prevChar != '\\') { // NOTE(review): only the previous byte is checked, so a quote right after an escaped backslash is also treated as escaped
+                    inQuote = false;
+                }
+                return c; // bytes inside a string are passed through unchanged
+            }
+
+            //we're not in a quote
+            switch (c) {
+                case '/':
+                    if (isNextOrUnread('/')) {
+                        //single line comment: read to end of line
+                        for (;;) {
+                            c = stream.read();
+                            if (c == -1 || c == '\r' || c == '\n') {
+                                return c; // the line break (or end of stream) itself is emitted
+                            }
+                        }
+                    } else if (isNextOrUnread('*')) {
+                        //multiline comment: read till next "*/"
+                        for (;;) {
+                            c = stream.read();
+                            if (c == -1) {
+                                return c;
+                            } else if (c == '*') {
+                                c = stream.read(); // NOTE(review): if this byte is another star it is discarded without being re-checked, so a terminator of two stars plus slash does not close the comment
+                                if (c == -1) {
+                                    return c;
+                                } else if (c == '/') {
+                                    //replace entire comment with single space
+                                    return ' ';
+                                }
+                            }
+                        }
+                    } else {
+                        return c; // a lone slash is passed through
+                    }
+                case '<':
+                    if (isNextOrUnread('!','[','C','D','A','T','A','[')) {
+                        inCDATA = true;
+                        return ' '; // the whole CDATA opening marker is replaced by one space
+                    } else {
+                        return c;
+                    }
+                case '#': // hash comment: skipped up to the line break, like the double-slash form
+                    for (;;) {
+                        c = stream.read();
+                        if (c == -1 || c == '\r' || c == '\n') {
+                            return c;
+                        }
+                    }
+                case ']':
+                    if (inCDATA) { // the CDATA end marker is only looked for while a CDATA section is open
+                        if (isNextOrUnread(']', '>')) {
+                            inCDATA = false;
+                            return ' '; // the CDATA end marker is replaced by one space
+                        } else {
+                            return c; // an ordinary closing bracket
+                        }
+                    } else {
+                        return c;
+                    }
+                case '"':
+                    inQuote = true;
+                    return c;
+                default:
+                    return c; // includes -1 at end of stream
+            }
+
+        }
+        // NOTE(review): close() is not overridden here, so closing this stream does not close the wrapped one - confirm the caller closes the source.
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
index 6e7bfa4..caf580d 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
@@ -53,6 +53,13 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase {
                assertStatementsSize(null, null, null, 7);
        }
 
+       @Test
+       public void testJSONLDCommentStripping() throws Exception {
+               assertExtract("/html/html-jsonld-strip-comments.html"); // fixture whose ld+json script block mixes comments and CDATA markers with the JSON
+               assertModelNotEmpty();
+               assertStatementsSize(null, null, null, 3); // NOTE(review): the null arguments presumably act as wildcards, i.e. 3 statements overall - confirm against the base test class
+       }
+
        @Override
        protected ExtractorFactory<?> getExtractorFactory() {
                return new EmbeddedJSONLDExtractorFactory();

http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/test-resources/src/test/resources/html/html-jsonld-strip-comments.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/html-jsonld-strip-comments.html b/test-resources/src/test/resources/html/html-jsonld-strip-comments.html
new file mode 100644
index 0000000..a75569e
--- /dev/null
+++ b/test-resources/src/test/resources/html/html-jsonld-strip-comments.html
@@ -0,0 +1,51 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html>
+<head>
+    <title>Hello World!</title>
+    <meta name="title" content="Embedded JSONLD extractor"/>
+    <!-- As per spec in http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents -->
+    <script type="application/ld+json">
+    /* first
+    multiline comment
+    # */
+    # for funsies -- although this one won't occur in html
+    //first single line comment!
+    <![CDATA[
+    //second single line comment
+    /* //**second multiline comment* */ //third single line comment
+    [{
+      "@context": "http://json-ld.org/contexts/person.jsonld",
+      "@id": "http://dbpedia.org/resource/Robert_Millar",
+      //the above urls should test that comments inside quotes are *not* stripped
+      "@type": "Person",]]> /*
+       multiline comment
+      inside json */ "name": <![CDATA["Robert\" Millar", //comment
+      #comment
+      "born": "1958-09-13T00:00:00"
+    }]]]> ///some more commenting
+    /* a
+    final
+    multiline
+    comment*/ //a final single line comment
+    </script>
+
+
+</head>
+<h1>Embedded JSONLD Extractor</h1>
+<p>It extracts only the embedded JSON-LD elements.
+</html>

Reply via email to