Repository: any23 Updated Branches: refs/heads/master 22b3047d5 -> e046f7329
improve JsonCleaningInputStream Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/e046f732 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/e046f732 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/e046f732 Branch: refs/heads/master Commit: e046f7329538b61f17225e64f79c280c4d248aa9 Parents: 22b3047 Author: Hans <[email protected]> Authored: Mon Aug 6 14:31:08 2018 -0500 Committer: Hans <[email protected]> Committed: Mon Aug 6 14:31:08 2018 -0500 ---------------------------------------------------------------------- .../any23/extractor/rdf/BaseRDFExtractor.java | 147 ---------- .../any23/extractor/rdf/JSONLDExtractor.java | 6 +- .../extractor/rdf/JsonCleaningInputStream.java | 271 +++++++++++++++++++ .../test/java/org/apache/any23/Any23Test.java | 2 +- .../extractor/rdf/JSONLDExtractorTest.java | 2 +- 5 files changed, 276 insertions(+), 152 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index ea582cb..796bada 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -46,7 +46,6 @@ import org.slf4j.LoggerFactory; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.PushbackInputStream; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.HashSet; @@ -215,150 +214,4 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { } } - - static class JsonCleaningInputStream extends InputStream { - - private boolean inEscape; - private int quoteChar; - private boolean inCDATA; - private boolean needsComma; - - private final PushbackInputStream wrapped; - - JsonCleaningInputStream(InputStream in) { - wrapped = new PushbackInputStream(in, 16); - } - - private static boolean isNextOrUnread(PushbackInputStream stream, int... next) throws IOException { - int i = -1; - for (int test : next) { - int c = stream.read(); - if (c != test) { - if (c != -1) { - stream.unread(c); - } - while (i >= 0) { - stream.unread(next[i--]); - } - return false; - } - i++; - } - return true; - } - - @Override - public int read() throws IOException { - PushbackInputStream stream = wrapped; - - for (;;) { - int c = stream.read(); - - //other types of comments are handled by enabling fasterxml's - //ALLOW_COMMENTS and ALLOW_YAML_COMMENTS features - if (inCDATA) { - if (c == ']' && isNextOrUnread(stream, ']', '>')) { - inCDATA = false; - continue; - } - } else { - if (c == '<' && isNextOrUnread(stream, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) { - inCDATA = true; - continue; - } - } - - int q = quoteChar; - if (q != 0) { - //we're in a quote - if (inEscape) { - //end escape - inEscape = false; - } else if (c == '\\') { - //begin escape - inEscape = true; - } else if (c == q) { - //end quote - quoteChar = 0; - } - return c; - } - - //we're not in a quote - switch (c) { - case ',': - case ';': - //don't write out comma yet! - needsComma = true; - continue; - case '}': - case ']': - //discard comma at end of object or array - needsComma = false; - return c; - case -1: - case '\r': - case '\n': - return c; - case 0x09: - case 0x0b: - case 0x0c: - case 0x1c: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x20: - return ' '; - case 0xc2: - if (isNextOrUnread(stream, 0xa0)) { - return ' '; - } - break; - case 0xe1: - if (isNextOrUnread(stream, 0x9a, 0x80) - || isNextOrUnread(stream, 0xa0, 0x8e)) { - return ' '; - } - break; - case 0xe2: - int c1 = stream.read(); - if (c1 == 0x80) { - int c2 = stream.read(); - //space separators - if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf - //line and paragraph separators - || c2 == 0xa8 || c2 == 0xa9) { - return ' '; - } - stream.unread(c2); - } else if (c1 == 0x81) { - int c2 = stream.read(); - if (c2 == 0x9f) { - return ' '; - } - stream.unread(c2); - } - stream.unread(c1); - break; - case 0xe3: - if (isNextOrUnread(stream, 0x80, 0x80)) { - return ' '; - } - break; - default: - break; - } - if (needsComma) { - stream.unread(c); - stream.unread(' '); - needsComma = false; - return ','; - } else if (c == '"' || c == '\'') { - quoteChar = c; - } - return c; - } - } - } - } http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java index 71f2459..1806adf 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java @@ -56,15 +56,15 @@ public class JSONLDExtractor extends BaseRDFExtractor { } JSON_FACTORY.enable(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER); - JSON_FACTORY.enable(JsonParser.Feature.ALLOW_COMMENTS); + JSON_FACTORY.disable(JsonParser.Feature.ALLOW_COMMENTS); //handled by JsonCleaningInputStream JSON_FACTORY.disable(JsonParser.Feature.ALLOW_MISSING_VALUES); //handled by JsonCleaningInputStream JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS); JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS); - JSON_FACTORY.enable(JsonParser.Feature.ALLOW_SINGLE_QUOTES); + JSON_FACTORY.disable(JsonParser.Feature.ALLOW_SINGLE_QUOTES); //handled by JsonCleaningInputStream JSON_FACTORY.disable(JsonParser.Feature.ALLOW_TRAILING_COMMA); //handled by JsonCleaningInputStream JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS); JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES); - JSON_FACTORY.enable(JsonParser.Feature.ALLOW_YAML_COMMENTS); + JSON_FACTORY.disable(JsonParser.Feature.ALLOW_YAML_COMMENTS); //handled by JsonCleaningInputStream JSON_FACTORY.enable(JsonParser.Feature.IGNORE_UNDEFINED); JSON_FACTORY.enable(JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION); JSON_FACTORY.disable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION); http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java b/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java new file mode 100644 index 0000000..bda229e --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdf/JsonCleaningInputStream.java @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.rdf; + +import java.io.IOException; +import java.io.InputStream; +import java.io.PushbackInputStream; + +/** + * This class uses several strategies to fix common JSON syntax errors, including: + * <ol> + * <li>Remove CDATA markers</li> + * <li>Remove YAML and C-style comments</li> + * <li>Allow single-quoted strings</li> + * <li>Ignore duplicated commas between elements of objects and arrays</li> + * <li>Remove trailing commas from objects and arrays</li> + * <li>Insert omitted commas after objects and arrays</li> + * <li>Ignore all unicode whitespace characters (assumes UTF-8 encoding)</li> + * <li>Treat semi-colons as commas</li> + * </ol> + * + * @author Hans Brende ([email protected]) + */ +class JsonCleaningInputStream extends InputStream { + + private static final int EOL_COMMENT = 1; + private static final int MULTILINE_COMMENT = 2; + + private static final int NEEDS_COMMA = 1; + private static final int NEEDS_COMMA_AND_NEWLINE = 2; + + private boolean inEscape; + private boolean inCDATA; + private int needsComma; + private int currentState; + + private final PushbackInputStream in; + + JsonCleaningInputStream(InputStream in) { + this.in = new PushbackInputStream(in, 16); + } + + private static void unread(PushbackInputStream in, int c) throws IOException { + if (c != -1) { + in.unread(c); + } + } + + private static boolean isNextOrUnread(PushbackInputStream in, int... next) throws IOException { + int i = -1; + for (int test : next) { + int c = in.read(); + if (c != test) { + unread(in, c); + while (i >= 0) { + in.unread(next[i--]); + } + return false; + } + i++; + } + return true; + } + + @Override + public int read() throws IOException { + PushbackInputStream in = this.in; + + for (;;) { + int c = in.read(); + + if (c == -1) { + return c; + } + + if (inCDATA) { + if (c == ']' && isNextOrUnread(in, ']', '>')) { + inCDATA = false; + continue; + } + } else { + if (c == '<' && isNextOrUnread(in, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) { + inCDATA = true; + continue; + } + } + + int ctx = currentState; + switch (ctx) { + case 0: + break; + case EOL_COMMENT: + if (c == '\r' || c == '\n') { + //end single-line comment + currentState = 0; + if (needsComma != 0) { + needsComma = NEEDS_COMMA_AND_NEWLINE; + continue; + } + return c; + } + continue; + case MULTILINE_COMMENT: + if (c == '\r' || c == '\n') { + if (needsComma != 0) { + needsComma = NEEDS_COMMA_AND_NEWLINE; + continue; + } + return c; + } else if (c == '*' && isNextOrUnread(in, '/')) { + //end multiline comment + currentState = 0; + } + continue; + default: + //we're in a quote + if (inEscape) { + //end escape + inEscape = false; + } else if (c == '\\') { + //begin escape + inEscape = true; + } else if (c == ctx) { + //end quote + currentState = 0; + return '"'; + } + return c; + } + + $whitespace: { + //we're not in a quote + switch (c) { + case '#': + currentState = EOL_COMMENT; + continue; + case '/': + int next = in.read(); + if (next == '/') { + currentState = EOL_COMMENT; + continue; + } else if (next == '*') { + currentState = MULTILINE_COMMENT; + continue; + } + unread(in, next); + break; + case ',': + case ';': + //don't write out comma yet! + needsComma = NEEDS_COMMA; + continue; + case '}': + case ']': + // Only thing that can follow '}' or ']' is: + // '}' or ']' or ',' or EOF + needsComma = NEEDS_COMMA; + return c; + case '\r': + case '\n': + if (needsComma != 0) { + needsComma = NEEDS_COMMA_AND_NEWLINE; + continue; + } + return c; + // UTF-8 whitespace detection + case 0x09: + case 0x0b: + case 0x0c: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x20: + break $whitespace; + case 0xc2: + if (isNextOrUnread(in, 0xa0)) { + break $whitespace; + } + break; + case 0xe1: + if (isNextOrUnread(in, 0x9a, 0x80) + || isNextOrUnread(in, 0xa0, 0x8e)) { + break $whitespace; + } + break; + case 0xe2: + int c1 = in.read(); + if (c1 == 0x80) { + int c2 = in.read(); + //space separators + if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf + //line and paragraph separators + || c2 == 0xa8 || c2 == 0xa9) { + break $whitespace; + } + unread(in, c2); + in.unread(0x80); + } else if (c1 == 0x81) { + int c2 = in.read(); + if (c2 == 0x9f) { + break $whitespace; + } + unread(in, c2); + in.unread(0x81); + } else { + unread(in, c1); + } + break; + case 0xe3: + if (isNextOrUnread(in, 0x80, 0x80)) { + break $whitespace; + } + break; + default: + break; + } + + //here: character is not whitespace + + int nc = needsComma; + if (nc != 0) { + in.unread(c); + if (nc == NEEDS_COMMA) { + in.unread(' '); + } else { + for (int i = NEEDS_COMMA_AND_NEWLINE; i < nc; i++) { + in.unread(' '); + } + in.unread('\n'); + } + needsComma = 0; + return ','; + } else if (c == '"' || c == '\'') { + currentState = c; + return '"'; + } + return c; + } //end $whitespace + + //here: character is whitespace + + int nc = needsComma; + if (nc != 0) { + if (nc != NEEDS_COMMA) { + needsComma = (nc + 1) & 0xFF; + } + continue; + } + + return ' '; + + } + + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/test/java/org/apache/any23/Any23Test.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java index 085db04..d1d3467 100644 --- a/core/src/test/java/org/apache/any23/Any23Test.java +++ b/core/src/test/java/org/apache/any23/Any23Test.java @@ -345,7 +345,7 @@ public class Any23Test extends Any23OnlineTestBase { } finally { compositeTH1.close(); } - logger.info(baos.toString()); + logger.debug(baos.toString()); Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth1.getCount()); http://git-wip-us.apache.org/repos/asf/any23/blob/e046f732/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java index 215b552..f1338b4 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java @@ -71,7 +71,7 @@ public class JSONLDExtractorTest { for (int i = 0; i <= Character.MAX_CODE_POINT; i++) { if (Character.isWhitespace(i) || Character.isSpaceChar(i)) { byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8); - InputStream stream = new BaseRDFExtractor.JsonCleaningInputStream(new ByteArrayInputStream(bytes)); + InputStream stream = new JsonCleaningInputStream(new ByteArrayInputStream(bytes)); if (i == '\r' || i == '\n') { Assert.assertEquals(stream.read(), i); } else {
