Repository: any23 Updated Branches: refs/heads/master 817e744af -> 837f92b91
ANY23-382 don't kill extraction on fatal json parsing errors Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/837f92b9 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/837f92b9 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/837f92b9 Branch: refs/heads/master Commit: 837f92b9167d7944dbc88a965d6e17cf22b375e0 Parents: 817e744 Author: Hans <[email protected]> Authored: Fri Aug 3 16:06:15 2018 -0500 Committer: Hans <[email protected]> Committed: Fri Aug 3 16:06:15 2018 -0500 ---------------------------------------------------------------------- .../any23/extractor/rdf/BaseRDFExtractor.java | 197 +++++-------------- .../any23/extractor/rdf/JSONLDExtractor.java | 27 +++ .../any23/extractor/rdf/RDFParserFactory.java | 2 +- .../html/EmbeddedJSONLDExtractorTest.java | 10 +- .../resources/html/html-jsonld-fatal-error.html | 61 ++++++ 5 files changed, 151 insertions(+), 146 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index c0994bd..0e32efc 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -17,11 +17,14 @@ package org.apache.any23.extractor.rdf; +import com.fasterxml.jackson.core.JsonLocation; +import com.fasterxml.jackson.core.JsonParseException; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionParameters; import org.apache.any23.extractor.ExtractionResult; import 
org.apache.any23.extractor.Extractor; +import org.apache.any23.extractor.IssueReport; import org.apache.any23.extractor.html.JsoupUtils; import org.eclipse.rdf4j.rio.RDFFormat; import org.eclipse.rdf4j.rio.RDFParseException; @@ -197,7 +200,18 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { } catch (RDFHandlerException ex) { throw new IllegalStateException("Unexpected exception.", ex); } catch (RDFParseException ex) { - throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult); + Throwable cause = ex.getCause(); + if (cause instanceof JsonParseException) { + JsonParseException err = (JsonParseException)cause; + JsonLocation loc = err.getLocation(); + if (loc == null) { + extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), -1L, -1L); + } else { + extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), loc.getLineNr(), loc.getColumnNr()); + } + } else { + throw new ExtractionException("Error while parsing RDF document.", ex, extractionResult); + } } } @@ -205,7 +219,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { private static class JsonCleaningInputStream extends InputStream { private boolean inEscape; - private boolean inQuote; + private int quoteChar; private boolean inCDATA; private boolean needsComma; @@ -240,13 +254,37 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { for (;;) { int c = stream.read(); - if (inQuote) { - return readQuoted(c, stream); + //other types of comments are handled by enabling fasterxml's + //ALLOW_COMMENTS and ALLOW_YAML_COMMENTS features + if (inCDATA) { + if (c == ']' && isNextOrUnread(stream, ']', '>')) { + inCDATA = false; + continue; + } + } else { + if (c == '<' && isNextOrUnread(stream, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) { + inCDATA = true; + continue; + } } - //we're not in a quote - c = stripComments(c, stream); + int q = 
quoteChar; + if (q != 0) { + //we're in a quote + if (inEscape) { + //end escape + inEscape = false; + } else if (c == '\\') { + //begin escape + inEscape = true; + } else if (c == q) { + //end quote + quoteChar = 0; + } + return c; + } + //we're not in a quote switch (c) { case ',': case ';': @@ -258,150 +296,21 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { //discard comma at end of object or array needsComma = false; return c; - case -1: - return c; - default: - if (Character.isWhitespace(c)) { - return ' '; - } else if (needsComma) { - stream.unread(c); - stream.unread(' '); - needsComma = false; - return ','; - } else if (c == '"') { - inQuote = true; - } - return c; - } - } - - } - - private int readQuoted(int c, PushbackInputStream stream) throws IOException { - if (inEscape) { - switch (c) { - case 'u': - //TODO: validate that 'u' is followed by 4 hex chars? - case '"': - case '\\': - case '/': - case 'b': - case 'f': - case 'n': - case 'r': - case 't': - case -1: - inEscape = false; - return c; default: - stream.unread(c); - inEscape = false; - return '\\'; - } - } else { - switch (c) { - case '\\': - break; - case '\n': - stream.unread('n'); - break; - case '\r': - stream.unread('r'); - break; - case '\b': - stream.unread('b'); - break; - case '\f': - stream.unread('f'); - break; - case '\t': - stream.unread('t'); - break; - case '"': - inQuote = false; - return c; - case -1: - return c; - default: - if (c < 0x20 || c == 0x7f) { - String hex = Integer.toHexString(c); - int ind = hex.length() - 1; - stream.unread(hex.charAt(ind)); - stream.unread(ind == 0 ? '0' : hex.charAt(--ind)); - stream.unread(ind == 0 ? '0' : hex.charAt(--ind)); - stream.unread(ind == 0 ? 
'0' : hex.charAt(--ind)); - stream.unread('u'); - break; - } else { - return c; - } - } - inEscape = true; - return '\\'; - } - } - - private int stripComments(int c, PushbackInputStream stream) throws IOException { - switch (c) { - case '/': - if (isNextOrUnread(stream, '/')) { - //single line comment: read to end of line - for (;;) { - c = stream.read(); - if (c == -1 || c == '\r' || c == '\n') { - return c; + if (c != -1 && !Character.isWhitespace(c)) { + if (needsComma) { + stream.unread(c); + stream.unread(' '); + needsComma = false; + return ','; + } else if (c == '"' || c == '\'') { + quoteChar = c; } } - } else if (isNextOrUnread(stream,'*')) { - //multiline comment: read till next "*/" - for (;;) { - c = stream.read(); - if (c == -1) { - return c; - } else if (c == '*') { - c = stream.read(); - if (c == -1) { - return c; - } else if (c == '/') { - //replace entire comment with single space - return ' '; - } - } - } - } else { - return c; - } - case '<': - if (isNextOrUnread(stream,'!','[','C','D','A','T','A','[')) { - inCDATA = true; - return ' '; - } else { - return c; - } - case '#': - for (;;) { - c = stream.read(); - if (c == -1 || c == '\r' || c == '\n') { - return c; - } - } - case ']': - if (inCDATA) { - if (isNextOrUnread(stream, ']', '>')) { - inCDATA = false; - return ' '; - } else { - return c; - } - } else { return c; - } - default: - return c; + } } - } - } } http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java index 402e267..71f2459 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java @@ -17,12 +17,16 @@ package 
org.apache.any23.extractor.rdf; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonParser; import com.github.jsonldjava.utils.JsonUtils; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; import org.eclipse.rdf4j.rio.RDFParser; +import java.lang.reflect.Field; + /** * Concrete implementation of {@link org.apache.any23.extractor.Extractor.ContentExtractor} * handling <a href="http://www.w3.org/TR/json-ld/">JSON-LD</a> format. @@ -41,6 +45,29 @@ public class JSONLDExtractor extends BaseRDFExtractor { throw new AssertionError("You have an outdated version of jsonld-java on the classpath. " + "Upgrade to at least version 0.12.0. See: https://issues.apache.org/jira/browse/ANY23-336", th); } + + JsonFactory JSON_FACTORY; + try { + Field field = JsonUtils.class.getDeclaredField("JSON_FACTORY"); + field.setAccessible(true); + JSON_FACTORY = (JsonFactory)field.get(null); + } catch (Exception e) { + throw new AssertionError(e); + } + + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER); + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_COMMENTS); + JSON_FACTORY.disable(JsonParser.Feature.ALLOW_MISSING_VALUES); //handled by JsonCleaningInputStream + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS); + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS); + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_SINGLE_QUOTES); + JSON_FACTORY.disable(JsonParser.Feature.ALLOW_TRAILING_COMMA); //handled by JsonCleaningInputStream + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS); + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES); + JSON_FACTORY.enable(JsonParser.Feature.ALLOW_YAML_COMMENTS); + JSON_FACTORY.enable(JsonParser.Feature.IGNORE_UNDEFINED); + JSON_FACTORY.enable(JsonParser.Feature.INCLUDE_SOURCE_IN_LOCATION); + 
JSON_FACTORY.disable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION); } http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java index 2778621..6b4406a 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java @@ -284,7 +284,7 @@ public class RDFParserFactory { ) { parser.getParserConfig().setNonFatalErrors(stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings())); parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType); - parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true); + parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType); parser.setParseErrorListener(new InternalParseErrorListener(extractionResult)); parser.setValueFactory( http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java index 41a0711..4141bd2 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java @@ -17,6 +17,7 @@ package org.apache.any23.extractor.html; import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.IssueReport; import org.apache.any23.rdf.RDFUtils; import org.apache.any23.vocab.FOAF; import org.junit.Test; @@ -75,13 
+76,20 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase { assertExtract("/html/html-jsonld-unescaped-characters.html"); assertModelNotEmpty(); assertStatementsSize(null, null, null, 375); - assertContains(RDFUtils.iri("http://schema.org/name"), "Weezer & Pixies\\\u0008"); + assertContains(RDFUtils.iri("http://schema.org/name"), "Weezer & Pixies\u0008"); assertContains(RDFUtils.iri("http://schema.org/description"), "#1 MAGIC SHOW IN L.A.\nThe current WINNER of the CW’s Penn & Teller’s FOOL US, Illusionist " + "extraordinaire Ivan Amodei is on a national tour with his show INTIMATE ILLUSIONS." + "\n\nCurrently, on an ei..."); } + @Test + public void testJSONLDFatalError() { + assertExtract("/html/html-jsonld-fatal-error.html",false); + assertIssue(IssueReport.IssueLevel.FATAL, ".*Unexpected character .* was expecting comma to separate Object entries.*"); + assertStatementsSize(null, null, null, 4); + } + @Override protected ExtractorFactory<?> getExtractorFactory() { return new EmbeddedJSONLDExtractorFactory(); http://git-wip-us.apache.org/repos/asf/any23/blob/837f92b9/test-resources/src/test/resources/html/html-jsonld-fatal-error.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-jsonld-fatal-error.html b/test-resources/src/test/resources/html/html-jsonld-fatal-error.html new file mode 100644 index 0000000..1ccb7ab --- /dev/null +++ b/test-resources/src/test/resources/html/html-jsonld-fatal-error.html @@ -0,0 +1,61 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> <!-- Excerpted from: http://osl.ugr.es/JSLUGR/ --> +<html lang="es"> + +<head> + <title>Jornadas de Software Libre de la Universidad de Granada</title> +</head> + +<body id="page-top" data-spy="scroll" data-target=".navbar-fixed-top"> + + + +<script type="application/ld+json"> + { + "@context": "http://schema.org", + "@type": "Organization", + "url": "http://osl.ugr.es", + "contactPoint": [{ + "@type": "ContactPoint", + "email": "[email protected]", + "name": "Jornadas de Software Libre" + "contactType": "Organizing committee", + "url": "http://osl.ugr.es" + }] + } + </script> + +<script type="application/ld+json"> + { + "@context": { + "ical": "http://www.w3.org/2002/12/cal/ical#", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "ical:dtstart": { + "@type": "xsd:dateTime" + } + }, + "ical:summary": "Jornadas de Software Libre", + "ical:location": "Por determinar. Granada, España", + "ical:dtstart": "2017-09-27T08:00Z", + "ical:dtend": "2017-09-28T16:00Z" + } + </script> + +</body> + +</html> \ No newline at end of file
