any23 git commit: ANY23-326 fixed rdfa issue with unclosed input & meta tags

lewismc Wed, 24 Jan 2018 20:50:13 -0800

Repository: any23
Updated Branches:
  refs/heads/master 07f7421cd -> eefa208db



ANY23-326 fixed rdfa issue with unclosed input & meta tags


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/eefa208d
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/eefa208d
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/eefa208d

Branch: refs/heads/master
Commit: eefa208db3b4ad176ab3636fb3cc539bc00ea100
Parents: 07f7421
Author: Hans <[email protected]>
Authored: Wed Jan 24 06:26:40 2018 -0600
Committer: Hans <[email protected]>
Committed: Wed Jan 24 19:06:03 2018 -0600

----------------------------------------------------------------------
 .../resources/default-configuration.properties  |   2 +-
 .../apache/any23/extractor/html/JsoupUtils.java | 103 +++++++++++++++++++
 .../any23/extractor/html/TagSoupParser.java     |   1 -
 .../html/TagSoupParsingConfiguration.java       |  71 +------------
 .../any23/extractor/rdf/BaseRDFExtractor.java   |  46 ++++++++-
 5 files changed, 151 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/eefa208d/api/src/main/resources/default-configuration.properties
----------------------------------------------------------------------
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index d1d35de..a8ca0c2 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -79,4 +79,4 @@ any23.extraction.openie.confidence.threshold=0.5
 
 # Use legacy setting to parse html
 # with NekoHTML instead of Jsoup
-any23.tagsoup.legacy=off
\ No newline at end of file
+any23.tagsoup.legacy=off

http://git-wip-us.apache.org/repos/asf/any23/blob/eefa208d/core/src/main/java/org/apache/any23/extractor/html/JsoupUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/JsoupUtils.java b/core/src/main/java/org/apache/any23/extractor/html/JsoupUtils.java
new file mode 100644
index 0000000..3b50221
--- /dev/null
+++ b/core/src/main/java/org/apache/any23/extractor/html/JsoupUtils.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.any23.extractor.html;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+import java.util.Arrays;
+
+/**
+ * @author Hans Brende
+ */
+public class JsoupUtils {
+
+    public static Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
+        //Jsoup doesn't allow null document URIs
+        if (documentIRI == null) {
+            documentIRI = "";
+        }
+
+        //workaround for Jsoup issue #1009
+        if (encoding == null) {
+
+            int c;
+            do {
+                c = input.read();
+            } while (c != -1 && Character.isWhitespace(c));
+
+            if (c != -1) {
+                int capacity = 256;
+                byte[] bytes = new byte[capacity];
+                int length = 0;
+                bytes[length++] = (byte)c;
+
+                if (c == '<') {
+                    c = input.read();
+                    if (c != -1) {
+                        bytes[length++] = (byte)c;
+                        if (c == '?') {
+                            c = input.read();
+
+                            while (c != -1) {
+                                if (length == capacity) {
+                                    capacity *= 2;
+                                    bytes = Arrays.copyOf(bytes, capacity);
+                                }
+                                bytes[length++] = (byte)c;
+
+                                if (c == '>') {
+                                    if (length >= 20 && bytes[length - 2] == '?') {
+                                        String decl = "<" + new String(bytes, 2, length - 4) + ">";
+                                        org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser());
+                                        for (org.jsoup.nodes.Element el : doc.children()) {
+                                            if ("xml".equalsIgnoreCase(el.tagName())) {
+                                                String enc = el.attr("encoding");
+                                                if (enc != null && !enc.isEmpty()) {
+                                                    encoding = enc;
+                                                    break;
+                                                }
+                                            }
+                                        }
+                                    }
+                                    break;
+                                }
+
+                                c = input.read();
+                            }
+                        }
+                    }
+
+                }
+
+                input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input);
+            }
+
+        }
+
+        //Use Parser.htmlParser() to parse javascript correctly
+        return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/eefa208d/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
index 2147520..d96a07b 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
@@ -17,7 +17,6 @@
 
 package org.apache.any23.extractor.html;
 
-import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.any23.validator.DefaultValidator;
 import org.apache.any23.validator.Validator;
 import org.apache.any23.validator.ValidatorException;

http://git-wip-us.apache.org/repos/asf/any23/blob/eefa208d/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
index 77e4524..2aeaac1 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
@@ -19,18 +19,14 @@ package org.apache.any23.extractor.html;
 
 import org.apache.any23.configuration.DefaultConfiguration;
 import org.jsoup.nodes.Attribute;
-import org.jsoup.parser.Parser;
 import org.jsoup.select.NodeTraversor;
 import org.jsoup.select.NodeVisitor;
 import org.w3c.dom.Comment;
 import org.w3c.dom.Document;
 import org.w3c.dom.Text;
 
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.SequenceInputStream;
-import java.util.Arrays;
 
 
 /**
@@ -67,70 +63,8 @@ abstract class TagSoupParsingConfiguration {
 
         @Override
         Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
-            //Jsoup doesn't allow null document URIs
 
-            if (documentIRI == null) {
-                documentIRI = "";
-            }
-
-            //workaround for Jsoup issue #1009
-            if (encoding == null) {
-
-                int c;
-                do {
-                    c = input.read();
-                } while (c != -1 && Character.isWhitespace(c));
-
-                if (c != -1) {
-                    int capacity = 256;
-                    byte[] bytes = new byte[capacity];
-                    int length = 0;
-                    bytes[length++] = (byte)c;
-
-                    if (c == '<') {
-                        c = input.read();
-                        if (c != -1) {
-                            bytes[length++] = (byte)c;
-                            if (c == '?') {
-                                c = input.read();
-
-                                while (c != -1) {
-                                    if (length == capacity) {
-                                        capacity *= 2;
-                                        bytes = Arrays.copyOf(bytes, capacity);
-                                    }
-                                    bytes[length++] = (byte)c;
-
-                                    if (c == '>') {
-                                        if (length >= 20 && bytes[length - 2] == '?') {
-                                            String decl = "<" + new String(bytes, 2, length - 4) + ">";
-                                            org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser());
-                                            for (org.jsoup.nodes.Element el : doc.children()) {
-                                                if ("xml".equalsIgnoreCase(el.tagName())) {
-                                                    String enc = el.attr("encoding");
-                                                    if (enc != null && !enc.isEmpty()) {
-                                                        encoding = enc;
-                                                        break;
-                                                    }
-                                                }
-                                            }
-                                        }
-                                        break;
-                                    }
-
-                                    c = input.read();
-                                }
-                            }
-                        }
-
-                    }
-
-                    input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input);
-                }
-
-            }
-
-            org.jsoup.nodes.Document document = org.jsoup.Jsoup.parse(input, encoding, documentIRI);
+            org.jsoup.nodes.Document document = JsoupUtils.parse(input, documentIRI, encoding);
 
             return convert(document);
         }
@@ -139,7 +73,8 @@ abstract class TagSoupParsingConfiguration {
         private static Document convert(org.jsoup.nodes.Document document) {
             Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();
 
-            for (org.jsoup.nodes.Element rootEl : document.children()) {
+            org.jsoup.nodes.Element rootEl = document.children().first();
+            if (rootEl != null) {
                 NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
             }
 

http://git-wip-us.apache.org/repos/asf/any23/blob/eefa208d/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
index 6b9377e..8f89f21 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java
@@ -22,16 +22,27 @@ import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
-import org.eclipse.rdf4j.rio.RDFHandlerException;
+import org.apache.any23.extractor.html.JsoupUtils;
+import org.eclipse.rdf4j.rio.RDFFormat;
 import org.eclipse.rdf4j.rio.RDFParseException;
 import org.eclipse.rdf4j.rio.RDFParser;
+import org.eclipse.rdf4j.rio.RDFHandlerException;
 import org.eclipse.rdf4j.rio.RioSetting;
 import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
+import org.jsoup.nodes.DataNode;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Entities;
+import org.jsoup.nodes.Node;
+import org.jsoup.select.NodeTraversor;
+import org.jsoup.select.NodeVisitor;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.HashSet;
 
 /**
@@ -105,7 +116,38 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
             parser.getParserConfig().addNonFatalError(BasicParserSettings.NORMALIZE_DATATYPE_VALUES);
             //ByteBuffer seems to represent incorrect content. Need to make sure it is the content
             //of the <script> node and not anything else!
-            parser.parse(in, extractionContext.getDocumentIRI().stringValue());
+            RDFFormat format = parser.getRDFFormat();
+            String iri = extractionContext.getDocumentIRI().stringValue();
+
+            if (format.hasFileExtension("xhtml") || format.hasMIMEType("application/xhtml+xml")) {
+                Charset charset = format.getCharset();
+                if (charset == null) {
+                    charset = StandardCharsets.UTF_8;
+                }
+                Document doc = JsoupUtils.parse(in, iri, null);
+                doc.outputSettings()
+                        .prettyPrint(false)
+                        .syntax(Document.OutputSettings.Syntax.xml)
+                        .escapeMode(Entities.EscapeMode.xhtml)
+                        .charset(charset);
+                //Delete scripts. Json-ld in script tags is extracted first
+                //from tag soup dom, so we should be fine.
+                NodeTraversor.traverse(new NodeVisitor() {
+                    @Override
+                    public void head(Node node, int depth) {
+                        if (node instanceof DataNode) {
+                            ((DataNode) node).setWholeData("");
+                        }
+                    }
+                    @Override
+                    public void tail(Node node, int depth) {
+                    }
+                }, doc);
+
+                in = new ByteArrayInputStream(doc.toString().getBytes(charset));
+            }
+
+            parser.parse(in, iri);
         } catch (RDFHandlerException ex) {
             throw new IllegalStateException("Unexpected exception.", ex);
         } catch (RDFParseException ex) {

any23 git commit: ANY23-326 fixed rdfa issue with unclosed input & meta tags

Reply via email to