Repository: any23 Updated Branches: refs/heads/master 1867cc66d -> 316b4ec0d
ANY23-338 fixed problem with json comment stripping Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/316b4ec0 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/316b4ec0 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/316b4ec0 Branch: refs/heads/master Commit: 316b4ec0d6285a204789792084caf012c000b196 Parents: 1867cc6 Author: Hans <[email protected]> Authored: Wed Mar 28 14:37:49 2018 -0500 Committer: Hans <[email protected]> Committed: Wed Mar 28 14:37:49 2018 -0500 ---------------------------------------------------------------------- .../apache/any23/extractor/rdf/BaseRDFExtractor.java | 12 ++++++------ .../extractor/html/EmbeddedJSONLDExtractorTest.java | 3 +++ .../test/resources/html/html-jsonld-strip-comments.html | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/316b4ec0/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index e4d16e2..61b58c1 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -161,7 +161,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { private static class JsonCommentStripperInputStream extends InputStream { - private int prevChar; + private boolean inEscape; private boolean inQuote; private boolean inCDATA; @@ -191,16 +191,16 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { @Override public int read() throws IOException { - return prevChar = privateRead(); - } - - private int privateRead() throws IOException { PushbackInputStream stream = wrapped; int c = stream.read(); if (inQuote) { - if (c == '"' && prevChar != '\\') { + if (inEscape) { + inEscape = false; + } else if (c == '"') { inQuote = false; + } else if (c == '\\') { + inEscape = true; } return c; } http://git-wip-us.apache.org/repos/asf/any23/blob/316b4ec0/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java index caf580d..23af441 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java @@ -17,6 +17,8 @@ package org.apache.any23.extractor.html; import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.vocab.FOAF; import org.junit.Test; /** @@ -58,6 +60,7 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase { assertExtract("/html/html-jsonld-strip-comments.html"); assertModelNotEmpty(); assertStatementsSize(null, null, null, 3); + assertContains(RDFUtils.iri(FOAF.NS, "name"), "Robert\\\" Millar\\\\\"\"\\\\"); } @Override http://git-wip-us.apache.org/repos/asf/any23/blob/316b4ec0/test-resources/src/test/resources/html/html-jsonld-strip-comments.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-jsonld-strip-comments.html b/test-resources/src/test/resources/html/html-jsonld-strip-comments.html index a75569e..f12f1cb 100644 --- a/test-resources/src/test/resources/html/html-jsonld-strip-comments.html +++ b/test-resources/src/test/resources/html/html-jsonld-strip-comments.html @@ -34,7 +34,7 @@ //the above urls should test that comments inside quotes are *not* stripped "@type": "Person",]]> /* multiline comment - inside json */ "name": <![CDATA["Robert\" Millar", //comment + inside json */ "name": <![CDATA["Robert\\\" Millar\\\\\"\"\\\\", //comment #comment "born": "1958-09-13T00:00:00" }]]]> ///some more commenting
