ANY23-378 clean commas in JSON-LD

Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/aae21370
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/aae21370
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/aae21370

Branch: refs/heads/master
Commit: aae21370e70715f82f7cc868b9a298f1178d0f80
Parents: a07d1f0
Author: Hans <[email protected]>
Authored: Wed Aug 1 11:25:21 2018 -0500
Committer: Hans <[email protected]>
Committed: Wed Aug 1 11:25:21 2018 -0500

----------------------------------------------------------------------
 .../any23/extractor/rdf/BaseRDFExtractor.java   |  83 +-
 .../html/EmbeddedJSONLDExtractorTest.java       |   7 +
 .../test/resources/html/html-jsonld-commas.html | 794 +++++++++++++++++++
 3 files changed, 859 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/aae21370/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index de717c1..2cb2c2e 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -165,7 +165,7 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
 
                 in = new 
ByteArrayInputStream(doc.toString().getBytes(charset));
             } else if (format.hasFileExtension("jsonld") || 
format.hasMIMEType("application/ld+json")) {
-                in = new JsonCommentStripperInputStream(in);
+                in = new JsonCleaningInputStream(in);
             }
 
             parser.parse(in, iri);
@@ -177,28 +177,29 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
     }
 
 
-    private static class JsonCommentStripperInputStream extends InputStream {
+    private static class JsonCleaningInputStream extends InputStream {
 
         private boolean inEscape;
         private boolean inQuote;
         private boolean inCDATA;
+        private boolean needsComma;
 
         private final PushbackInputStream wrapped;
 
-        JsonCommentStripperInputStream(InputStream in) {
+        JsonCleaningInputStream(InputStream in) {
             wrapped = new PushbackInputStream(in, 16);
         }
 
-        private boolean isNextOrUnread(int... next) throws IOException {
+        private static boolean isNextOrUnread(PushbackInputStream stream, 
int... next) throws IOException {
             int i = -1;
             for (int test : next) {
-                int c = wrapped.read();
+                int c = stream.read();
                 if (c != test) {
                     if (c != -1) {
-                        wrapped.unread(c);
+                        stream.unread(c);
                     }
                     while (i >= 0) {
-                        wrapped.unread(next[i--]);
+                        stream.unread(next[i--]);
                     }
                     return false;
                 }
@@ -210,23 +211,58 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
         @Override
         public int read() throws IOException {
             PushbackInputStream stream = wrapped;
-            int c = stream.read();
-
-            if (inQuote) {
-                if (inEscape) {
-                    inEscape = false;
-                } else if (c == '"') {
-                    inQuote = false;
-                } else if (c == '\\') {
-                    inEscape = true;
+
+            for (;;) {
+                int c = stream.read();
+
+                if (inQuote) {
+                    if (inEscape) {
+                        inEscape = false;
+                    } else if (c == '"') {
+                        inQuote = false;
+                    } else if (c == '\\') {
+                        inEscape = true;
+                    }
+                    return c;
+                }
+
+                //we're not in a quote
+                c = stripComments(c, stream);
+
+                switch (c) {
+                    case ',':
+                    case ';':
+                        //don't write out comma yet!
+                        needsComma = true;
+                        break;
+                    case '}':
+                    case ']':
+                        //discard comma at end of object or array
+                        needsComma = false;
+                        return c;
+                    case -1:
+                        return c;
+                    default:
+                        if (Character.isWhitespace(c)) {
+                            return ' ';
+                        } else if (needsComma) {
+                            stream.unread(c);
+                            stream.unread(' ');
+                            needsComma = false;
+                            return ',';
+                        } else if (c == '"') {
+                            inQuote = true;
+                        }
+                        return c;
                 }
-                return c;
             }
 
-            //we're not in a quote
+        }
+
+        private int stripComments(int c, PushbackInputStream stream) throws 
IOException {
             switch (c) {
                 case '/':
-                    if (isNextOrUnread('/')) {
+                    if (isNextOrUnread(stream, '/')) {
                         //single line comment: read to end of line
                         for (;;) {
                             c = stream.read();
@@ -234,7 +270,7 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
                                 return c;
                             }
                         }
-                    } else if (isNextOrUnread('*')) {
+                    } else if (isNextOrUnread(stream,'*')) {
                         //multiline comment: read till next "*/"
                         for (;;) {
                             c = stream.read();
@@ -254,7 +290,7 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
                         return c;
                     }
                 case '<':
-                    if (isNextOrUnread('!','[','C','D','A','T','A','[')) {
+                    if 
(isNextOrUnread(stream,'!','[','C','D','A','T','A','[')) {
                         inCDATA = true;
                         return ' ';
                     } else {
@@ -269,7 +305,7 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
                     }
                 case ']':
                     if (inCDATA) {
-                        if (isNextOrUnread(']', '>')) {
+                        if (isNextOrUnread(stream, ']', '>')) {
                             inCDATA = false;
                             return ' ';
                         } else {
@@ -278,9 +314,6 @@ public abstract class BaseRDFExtractor implements 
Extractor.ContentExtractor {
                     } else {
                         return c;
                     }
-                case '"':
-                    inQuote = true;
-                    return c;
                 default:
                     return c;
             }

http://git-wip-us.apache.org/repos/asf/any23/blob/aae21370/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
 
b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
index 23af441..30a0773 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractorTest.java
@@ -63,6 +63,13 @@ public class EmbeddedJSONLDExtractorTest extends 
AbstractExtractorTestCase {
                assertContains(RDFUtils.iri(FOAF.NS, "name"), "Robert\\\" 
Millar\\\\\"\"\\\\");
        }
 
+       @Test
+       public void testJSONLDCommaNormalization() {
+               assertExtract("/html/html-jsonld-commas.html");
+               assertModelNotEmpty();
+               assertStatementsSize(null, null, null, 30);
+       }
+
        @Override
        protected ExtractorFactory<?> getExtractorFactory() {
                return new EmbeddedJSONLDExtractorFactory();

Reply via email to