Repository: any23 Updated Branches: refs/heads/master 5d3d7159e -> 778d05ede
ANY23-328 Strip comments from json-ld content to make parsing more lenient Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/189bf260 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/189bf260 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/189bf260 Branch: refs/heads/master Commit: 189bf260e74436860054469fde8192531cce6f14 Parents: 4131761 Author: Hans <[email protected]> Authored: Sun Feb 11 12:11:32 2018 -0600 Committer: Hans <[email protected]> Committed: Mon Feb 12 13:34:52 2018 -0600 ---------------------------------------------------------------------- .../extractor/html/EmbeddedJSONLDExtractor.java | 5 +- .../any23/extractor/rdf/BaseRDFExtractor.java | 116 +++++++++++++++++++ .../html/EmbeddedJSONLDExtractorTest.java | 7 ++ .../html/html-jsonld-strip-comments.html | 51 ++++++++ 4 files changed, 177 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java index aeffdda..f220d0d 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java @@ -27,6 +27,7 @@ import org.apache.any23.extractor.rdf.JSONLDExtractor; import org.apache.any23.extractor.rdf.JSONLDExtractorFactory; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.SINDICE; +import org.apache.commons.io.IOUtils; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.w3c.dom.Document; @@ -34,6 +35,7 @@ import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -145,8 +147,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor { for (int i = 0; i < attributes.getLength(); i++) { if ("application/ld+json".equalsIgnoreCase(attributes.item(i).getTextContent())) { extractor.run(extractionParameters, extractionContext, - DomUtils.nodeToInputStream(jsonldNode - .getFirstChild()), out); + IOUtils.toInputStream(jsonldNode.getTextContent(), StandardCharsets.UTF_8), out); } } Node nameAttribute = attributes.getNamedItem("name"); http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 8f89f21..e4d16e2 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -41,6 +41,7 @@ import org.slf4j.LoggerFactory; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.PushbackInputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashSet; @@ -145,6 +146,8 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { }, doc); in = new ByteArrayInputStream(doc.toString().getBytes(charset)); + } else if (format.hasFileExtension("jsonld") || format.hasMIMEType("application/ld+json")) { + in = new JsonCommentStripperInputStream(in); } parser.parse(in, iri); @@ -155,4 +158,117 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { } } + + private static class JsonCommentStripperInputStream extends InputStream { + + private int prevChar; + private boolean inQuote; + private boolean inCDATA; + + private final PushbackInputStream wrapped; + + JsonCommentStripperInputStream(InputStream in) { + wrapped = new PushbackInputStream(in, 16); + } + + private boolean isNextOrUnread(int... next) throws IOException { + int i = -1; + for (int test : next) { + int c = wrapped.read(); + if (c != test) { + if (c != -1) { + wrapped.unread(c); + } + while (i >= 0) { + wrapped.unread(next[i--]); + } + return false; + } + i++; + } + return true; + } + + @Override + public int read() throws IOException { + return prevChar = privateRead(); + } + + private int privateRead() throws IOException { + PushbackInputStream stream = wrapped; + int c = stream.read(); + + if (inQuote) { + if (c == '"' && prevChar != '\\') { + inQuote = false; + } + return c; + } + + //we're not in a quote + switch (c) { + case '/': + if (isNextOrUnread('/')) { + //single line comment: read to end of line + for (;;) { + c = stream.read(); + if (c == -1 || c == '\r' || c == '\n') { + return c; + } + } + } else if (isNextOrUnread('*')) { + //multiline comment: read till next "*/" + for (;;) { + c = stream.read(); + if (c == -1) { + return c; + } else if (c == '*') { + c = stream.read(); + if (c == -1) { + return c; + } else if (c == '/') { + //replace entire comment with single space + return ' '; + } + } + } + } else { + return c; + } + case '<': + if (isNextOrUnread('!','[','C','D','A','T','A','[')) { + inCDATA = true; + return ' '; + } else { + return c; + } + case '#': + for (;;) { + c = stream.read(); + if (c == -1 || c == '\r' || c == '\n') { + return c; + } + } + case ']': + if (inCDATA) { + if (isNextOrUnread(']', '>')) { + inCDATA = false; + return ' '; + } else { + return c; + } + } else { + return c; + } + case '"': + inQuote = true; + return c; + default: + return c; + } + + } + + } + } http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java index 6e7bfa4..caf580d 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java @@ -53,6 +53,13 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase { assertStatementsSize(null, null, null, 7); } + @Test + public void testJSONLDCommentStripping() throws Exception { + assertExtract("/html/html-jsonld-strip-comments.html"); + assertModelNotEmpty(); + assertStatementsSize(null, null, null, 3); + } + @Override protected ExtractorFactory<?> getExtractorFactory() { return new EmbeddedJSONLDExtractorFactory(); http://git-wip-us.apache.org/repos/asf/any23/blob/189bf260/test-resources/src/test/resources/html/html-jsonld-strip-comments.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-jsonld-strip-comments.html b/test-resources/src/test/resources/html/html-jsonld-strip-comments.html new file mode 100644 index 0000000..a75569e --- /dev/null +++ b/test-resources/src/test/resources/html/html-jsonld-strip-comments.html @@ -0,0 +1,51 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html> +<head> + <title>Hello World!</title> + <meta name="title" content="Embedded JSONLD extractor"/> + <!-- As per spec in http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents --> + <script type="application/ld+json"> + /* first + multiline comment + # */ + # for funsies -- although this one won't occur in html + //first single line comment! + <![CDATA[ + //second single line comment + /* //**second multiline comment* */ //third single line comment + [{ + "@context": "http://json-ld.org/contexts/person.jsonld", + "@id": "http://dbpedia.org/resource/Robert_Millar", + //the above urls should test that comments inside quotes are *not* stripped + "@type": "Person",]]> /* + multiline comment + inside json */ "name": <![CDATA["Robert\" Millar", //comment + #comment + "born": "1958-09-13T00:00:00" + }]]]> ///some more commenting + /* a + final + multiline + comment*/ //a final single line comment + </script> + + +</head> +<h1>Embedded JSONLD Extractor</h1> +<p>It extracts only the embedded JSON-LD elements. +</html>
