Repository: any23 Updated Branches: refs/heads/master 837f92b91 -> 0df8cdba6
ANY23-383 allow all unicode space characters in JSON-LD Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0df8cdba Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0df8cdba Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0df8cdba Branch: refs/heads/master Commit: 0df8cdba68fea0c6dcf819759627759c7597f0cb Parents: 837f92b Author: Hans <[email protected]> Authored: Sat Aug 4 00:47:16 2018 -0500 Committer: Hans <[email protected]> Committed: Sat Aug 4 10:06:58 2018 -0500 ---------------------------------------------------------------------- cli/pom.xml | 12 ++++ core/pom.xml | 12 ++++ .../any23/extractor/rdf/BaseRDFExtractor.java | 72 ++++++++++++++++---- .../html/EmbeddedJSONLDExtractorTest.java | 5 ++ .../extractor/rdf/JSONLDExtractorTest.java | 22 +++++- encoding/pom.xml | 12 ++++ mime/pom.xml | 12 ++++ pom.xml | 16 +++++ .../html/html-jsonld-bad-character.html | 43 ++++++++++++ 9 files changed, 193 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/cli/pom.xml ---------------------------------------------------------------------- diff --git a/cli/pom.xml b/cli/pom.xml index 573646e..fdd7dea 100644 --- a/cli/pom.xml +++ b/cli/pom.xml @@ -139,6 +139,18 @@ </exclusion> </exclusions> </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + </dependency> <!-- END: Tika --> <!-- BEGIN: RDF4J --> http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index 12cc6ae..49a1bfc 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -116,6 +116,18 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-compress</artifactId> </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + </dependency> <!-- END: Tika --> <!-- BEGIN: RDF4J --> http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 0e32efc..797d878 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -216,7 +216,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { } - private static class JsonCleaningInputStream extends InputStream { + static class JsonCleaningInputStream extends InputStream { private boolean inEscape; private int quoteChar; @@ -290,25 +290,73 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { case ';': //don't write out comma yet! needsComma = true; - break; + continue; case '}': case ']': //discard comma at end of object or array needsComma = false; return c; - default: - if (c != -1 && !Character.isWhitespace(c)) { - if (needsComma) { - stream.unread(c); - stream.unread(' '); - needsComma = false; - return ','; - } else if (c == '"' || c == '\'') { - quoteChar = c; + case -1: + case '\r': + case '\n': + return c; + case 0x09: + case 0x0b: + case 0x0c: + case 0x1c: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x20: + return ' '; + case 0xc2: + if (isNextOrUnread(stream, 0xa0)) { + return ' '; + } + break; + case 0xe1: + if (isNextOrUnread(stream, 0x9a, 0x80) + || isNextOrUnread(stream, 0xa0, 0x8e)) { + return ' '; + } + break; + case 0xe2: + int c1 = stream.read(); + if (c1 == 0x80) { + int c2 = stream.read(); + //space separators + if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf + //line and paragraph separators + || c2 == 0xa8 || c2 == 0xa9) { + return ' '; } + stream.unread(c2); + } else if (c1 == 0x81) { + int c2 = stream.read(); + if (c2 == 0x9f) { + return ' '; + } + stream.unread(c2); } - return c; + stream.unread(c1); + break; + case 0xe3: + if (isNextOrUnread(stream, 0x80, 0x80)) { + return ' '; + } + break; + default: + break; + } + if (needsComma) { + stream.unread(c); + stream.unread(' '); + needsComma = false; + return ','; + } else if (c == '"' || c == '\'') { + quoteChar = c; } + return c; } } } http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java index 4141bd2..5daedd4 100644 --- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java @@ -90,6 +90,11 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase { assertStatementsSize(null, null, null, 4); } + @Test + public void testJSONLDBadCharacter() throws Exception { + assertExtract("/html/html-jsonld-bad-character.html"); + } + @Override protected ExtractorFactory<?> getExtractorFactory() { return new EmbeddedJSONLDExtractorFactory(); http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java index 1e9aa6f..215b552 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java @@ -16,8 +16,11 @@ */ package org.apache.any23.extractor.rdf; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionException; @@ -29,6 +32,7 @@ import org.apache.any23.writer.RDFXMLWriter; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.eclipse.rdf4j.model.IRI; @@ -61,7 +65,23 @@ public class JSONLDExtractorTest { final IRI uri = RDFUtils.iri("http://host.com/place-example.jsonld"); extract(uri, "/org/apache/any23/extractor/rdf/place-example.jsonld"); } - + + @Test + public void testWhitespaceCleaning() throws Exception { + for (int i = 0; i <= Character.MAX_CODE_POINT; i++) { + if (Character.isWhitespace(i) || Character.isSpaceChar(i)) { + byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8); + InputStream stream = new BaseRDFExtractor.JsonCleaningInputStream(new ByteArrayInputStream(bytes)); + if (i == '\r' || i == '\n') { + Assert.assertEquals(stream.read(), i); + } else { + Assert.assertEquals(stream.read(), ' '); + } + Assert.assertEquals(stream.read(), -1); + } + } + } + public void extract(IRI uri, String filePath) throws IOException, ExtractionException, TripleHandlerException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/encoding/pom.xml ---------------------------------------------------------------------- diff --git a/encoding/pom.xml b/encoding/pom.xml index 873c3de..7916ebc 100644 --- a/encoding/pom.xml +++ b/encoding/pom.xml @@ -136,6 +136,18 @@ <groupId>org.slf4j</groupId> <!-- also replaces httpclient commons-logging dependency --> <artifactId>jcl-over-slf4j</artifactId> </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + </dependency> <!-- END: Tika --> <!-- BEGIN: test dependencies --> http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/mime/pom.xml ---------------------------------------------------------------------- diff --git a/mime/pom.xml b/mime/pom.xml index e4caf5e..c833def 100644 --- a/mime/pom.xml +++ b/mime/pom.xml @@ -165,6 +165,18 @@ <groupId>org.slf4j</groupId> <artifactId>jcl-over-slf4j</artifactId> </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + </dependency> <!-- END: Tika --> http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 50ff0d9..ce2ee5d 100644 --- a/pom.xml +++ b/pom.xml @@ -280,6 +280,7 @@ <tika.version>1.18</tika.version> <openie_2.11.version>4.2.6</openie_2.11.version> <openregex.version>1.1.1</openregex.version> + <jackson.version>2.9.6</jackson.version> <!-- Overridden in profiles to add JDK specific arguments to surefire --> <surefire-extra-args /> @@ -398,6 +399,21 @@ <artifactId>poi-scratchpad</artifactId> <version>${poi.version}</version> </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + <version>${jackson.version}</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + <version>${jackson.version}</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + <version>${jackson.version}</version> + </dependency> <!-- END: Tika --> <!-- BEGIN: RDF4J --> http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/test-resources/src/test/resources/html/html-jsonld-bad-character.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/html/html-jsonld-bad-character.html b/test-resources/src/test/resources/html/html-jsonld-bad-character.html new file mode 100644 index 0000000..659c53c --- /dev/null +++ b/test-resources/src/test/resources/html/html-jsonld-bad-character.html @@ -0,0 +1,43 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<!-- Excerpted from: https://america.france.fr/es --> +<html lang="en"> +<head> + <meta charset="UTF-8"> + <title>France.fr - La Francia inesperada por aquellas y aquellos que la confo</title> +</head> +<body> + +<script type="application/ld+json"> ⨠+{ ⨠+ "@context": "http://schema.org", + â¨â¨"@type": "WebSite", + â¨â¨"name": "FRANCE.FR",â¨â¨ + "alternateName": "Atout France", + â¨"url": "https://www.france.fr", + â¨"potentialAction": {â¨â¨ + "@type": "SearchAction",â¨â¨ + "target": "https://america.france.fr/es/busqueda?q={search_term_string}",⨠+ "query-input": "required name=q", + "sameAs": ["http:\/\/www.atout-france.fr\/","https:\/\/www.diplomatie.gouv.fr\/es\/","http:\/\/media.atout-france.fr\/","http:\/\/www.meeting.france.fr\/"] + } +â¨} +â¨</script> + +</body> +</html> \ No newline at end of file
