Repository: any23
Updated Branches:
  refs/heads/master 837f92b91 -> 0df8cdba6


ANY23-383 allow all unicode space characters in JSON-LD


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/0df8cdba
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/0df8cdba
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/0df8cdba

Branch: refs/heads/master
Commit: 0df8cdba68fea0c6dcf819759627759c7597f0cb
Parents: 837f92b
Author: Hans <[email protected]>
Authored: Sat Aug 4 00:47:16 2018 -0500
Committer: Hans <[email protected]>
Committed: Sat Aug 4 10:06:58 2018 -0500

----------------------------------------------------------------------
 cli/pom.xml                                     | 12 ++++
 core/pom.xml                                    | 12 ++++
 .../any23/extractor/rdf/BaseRDFExtractor.java   | 72 ++++++++++++++++----
 .../html/EmbeddedJSONLDExtractorTest.java       |  5 ++
 .../extractor/rdf/JSONLDExtractorTest.java      | 22 +++++-
 encoding/pom.xml                                | 12 ++++
 mime/pom.xml                                    | 12 ++++
 pom.xml                                         | 16 +++++
 .../html/html-jsonld-bad-character.html         | 43 ++++++++++++
 9 files changed, 193 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/cli/pom.xml
----------------------------------------------------------------------
diff --git a/cli/pom.xml b/cli/pom.xml
index 573646e..fdd7dea 100644
--- a/cli/pom.xml
+++ b/cli/pom.xml
@@ -139,6 +139,18 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
     <!-- BEGIN: RDF4J -->

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index 12cc6ae..49a1bfc 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -116,6 +116,18 @@
         <groupId>org.apache.commons</groupId>
         <artifactId>commons-compress</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
     <!-- BEGIN: RDF4J -->

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index 0e32efc..797d878 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -216,7 +216,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
     }
 
 
-    private static class JsonCleaningInputStream extends InputStream {
+    static class JsonCleaningInputStream extends InputStream {
 
         private boolean inEscape;
         private int quoteChar;
@@ -290,25 +290,73 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
                     case ';':
                         //don't write out comma yet!
                         needsComma = true;
-                        break;
+                        continue;
                     case '}':
                     case ']':
                         //discard comma at end of object or array
                         needsComma = false;
                         return c;
-                    default:
-                        if (c != -1 && !Character.isWhitespace(c)) {
-                            if (needsComma) {
-                                stream.unread(c);
-                                stream.unread(' ');
-                                needsComma = false;
-                                return ',';
-                            } else if (c == '"' || c == '\'') {
-                                quoteChar = c;
+                    case -1:
+                    case '\r':
+                    case '\n':
+                        return c;
+                    case 0x09:
+                    case 0x0b:
+                    case 0x0c:
+                    case 0x1c:
+                    case 0x1d:
+                    case 0x1e:
+                    case 0x1f:
+                    case 0x20:
+                        return ' ';
+                    case 0xc2:
+                        if (isNextOrUnread(stream, 0xa0)) {
+                            return ' ';
+                        }
+                        break;
+                    case 0xe1:
+                        if (isNextOrUnread(stream, 0x9a, 0x80)
+                                || isNextOrUnread(stream, 0xa0, 0x8e)) {
+                            return ' ';
+                        }
+                        break;
+                    case 0xe2:
+                        int c1 = stream.read();
+                        if (c1 == 0x80) {
+                            int c2 = stream.read();
+                            //space separators
+                            if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf
+                                    //line and paragraph separators
+                                    || c2 == 0xa8 || c2 == 0xa9) {
+                                return ' ';
                             }
+                            stream.unread(c2);
+                        } else if (c1 == 0x81) {
+                            int c2 = stream.read();
+                            if (c2 == 0x9f) {
+                                return ' ';
+                            }
+                            stream.unread(c2);
                         }
-                        return c;
+                        stream.unread(c1);
+                        break;
+                    case 0xe3:
+                        if (isNextOrUnread(stream, 0x80, 0x80)) {
+                            return ' ';
+                        }
+                        break;
+                    default:
+                        break;
+                }
+                if (needsComma) {
+                    stream.unread(c);
+                    stream.unread(' ');
+                    needsComma = false;
+                    return ',';
+                } else if (c == '"' || c == '\'') {
+                    quoteChar = c;
                 }
+                return c;
             }
         }
     }

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
index 4141bd2..5daedd4 100644
--- a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
@@ -90,6 +90,11 @@ public class EmbeddedJSONLDExtractorTest extends AbstractExtractorTestCase {
                assertStatementsSize(null, null, null, 4);
        }
 
+       @Test
+       public void testJSONLDBadCharacter() throws Exception {
+               assertExtract("/html/html-jsonld-bad-character.html");
+       }
+
        @Override
        protected ExtractorFactory<?> getExtractorFactory() {
                return new EmbeddedJSONLDExtractorFactory();

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
----------------------------------------------------------------------
diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
index 1e9aa6f..215b552 100644
--- a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
+++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java
@@ -16,8 +16,11 @@
  */
 package org.apache.any23.extractor.rdf;
 
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
@@ -29,6 +32,7 @@ import org.apache.any23.writer.RDFXMLWriter;
 import org.apache.any23.writer.TripleHandler;
 import org.apache.any23.writer.TripleHandlerException;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 import org.eclipse.rdf4j.model.IRI;
@@ -61,7 +65,23 @@ public class JSONLDExtractorTest {
      final IRI uri = RDFUtils.iri("http://host.com/place-example.jsonld");
       extract(uri, "/org/apache/any23/extractor/rdf/place-example.jsonld");
   }
-  
+
+  @Test
+  public void testWhitespaceCleaning() throws Exception {
+    for (int i = 0; i <= Character.MAX_CODE_POINT; i++) {
+      if (Character.isWhitespace(i) || Character.isSpaceChar(i)) {
+        byte[] bytes = new String(Character.toChars(i)).getBytes(StandardCharsets.UTF_8);
+        InputStream stream = new BaseRDFExtractor.JsonCleaningInputStream(new ByteArrayInputStream(bytes));
+        if (i == '\r' || i == '\n') {
+          Assert.assertEquals(stream.read(), i);
+        } else {
+          Assert.assertEquals(stream.read(), ' ');
+        }
+        Assert.assertEquals(stream.read(), -1);
+      }
+    }
+  }
+
   public void extract(IRI uri, String filePath) 
     throws IOException, ExtractionException, TripleHandlerException {
     ByteArrayOutputStream baos = new ByteArrayOutputStream();

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/encoding/pom.xml
----------------------------------------------------------------------
diff --git a/encoding/pom.xml b/encoding/pom.xml
index 873c3de..7916ebc 100644
--- a/encoding/pom.xml
+++ b/encoding/pom.xml
@@ -136,6 +136,18 @@
       <groupId>org.slf4j</groupId> <!-- also replaces httpclient commons-logging dependency -->
       <artifactId>jcl-over-slf4j</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
     <!-- BEGIN: test dependencies -->

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/mime/pom.xml
----------------------------------------------------------------------
diff --git a/mime/pom.xml b/mime/pom.xml
index e4caf5e..c833def 100644
--- a/mime/pom.xml
+++ b/mime/pom.xml
@@ -165,6 +165,18 @@
       <groupId>org.slf4j</groupId>
       <artifactId>jcl-over-slf4j</artifactId>
     </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-databind</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>com.fasterxml.jackson.core</groupId>
+      <artifactId>jackson-annotations</artifactId>
+    </dependency>
     <!-- END: Tika -->
 
 

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 50ff0d9..ce2ee5d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -280,6 +280,7 @@
     <tika.version>1.18</tika.version>
     <openie_2.11.version>4.2.6</openie_2.11.version>
     <openregex.version>1.1.1</openregex.version>
+    <jackson.version>2.9.6</jackson.version>
 
     <!-- Overridden in profiles to add JDK specific arguments to surefire -->
     <surefire-extra-args />
@@ -398,6 +399,21 @@
         <artifactId>poi-scratchpad</artifactId>
         <version>${poi.version}</version>
       </dependency>
+      <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-core</artifactId>
+        <version>${jackson.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-databind</artifactId>
+        <version>${jackson.version}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.fasterxml.jackson.core</groupId>
+        <artifactId>jackson-annotations</artifactId>
+        <version>${jackson.version}</version>
+      </dependency>
       <!-- END: Tika -->
 
       <!-- BEGIN: RDF4J -->

http://git-wip-us.apache.org/repos/asf/any23/blob/0df8cdba/test-resources/src/test/resources/html/html-jsonld-bad-character.html
----------------------------------------------------------------------
diff --git a/test-resources/src/test/resources/html/html-jsonld-bad-character.html b/test-resources/src/test/resources/html/html-jsonld-bad-character.html
new file mode 100644
index 0000000..659c53c
--- /dev/null
+++ b/test-resources/src/test/resources/html/html-jsonld-bad-character.html
@@ -0,0 +1,43 @@
+<!DOCTYPE html>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- Excerpted from: https://america.france.fr/es -->
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>France.fr - La Francia inesperada por aquellas y aquellos que la 
confo</title>
+</head>
+<body>
+
+<script type="application/ld+json"> 

+{ 

+    "@context": "http://schema.org";,
+    

"@type": "WebSite",
+    

"name": "FRANCE.FR",


+    "alternateName": "Atout France",
+    
"url": "https://www.france.fr";,
+    
"potentialAction": {


+        "@type": "SearchAction",


+        "target": 
"https://america.france.fr/es/busqueda?q={search_term_string}",

+        "query-input": "required name=q",
+        "sameAs": 
["http:\/\/www.atout-france.fr\/","https:\/\/www.diplomatie.gouv.fr\/es\/","http:\/\/media.atout-france.fr\/","http:\/\/www.meeting.france.fr\/"]
+    }
+
}
+
</script>
+
+</body>
+</html>
\ No newline at end of file

Reply via email to