[1/2] any23 git commit: ANY23-324 Changed default html parser from NekoHTML to Jsoup. This also indirectly fixes ANY23-317, ANY23-273, ANY23-267, and ANY23-326.

lewismc Tue, 23 Jan 2018 22:12:04 -0800

Repository: any23
Updated Branches:
  refs/heads/master f36c5e162 -> 07f7421cd



ANY23-324 Changed default html parser from NekoHTML to Jsoup. This also 
indirectly fixes ANY23-317, ANY23-273, ANY23-267, and ANY23-326.


Project: http://git-wip-us.apache.org/repos/asf/any23/repo
Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/2c76ada3
Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/2c76ada3
Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/2c76ada3

Branch: refs/heads/master
Commit: 2c76ada3bc812c37a46863e0529363f42339582a
Parents: f36c5e1
Author: Hans <[email protected]>
Authored: Thu Jan 18 15:08:27 2018 -0600
Committer: Hans <[email protected]>
Committed: Sun Jan 21 16:47:34 2018 -0600

----------------------------------------------------------------------
 .../resources/default-configuration.properties  |   4 +
 core/pom.xml                                    |   4 +
 .../extractor/html/EmbeddedJSONLDExtractor.java |   6 +-
 .../any23/extractor/html/HCardExtractor.java    |   3 +-
 .../any23/extractor/html/HTMLMetaExtractor.java |   6 +-
 .../any23/extractor/html/TagSoupParser.java     | 173 ++++++++------
 .../html/TagSoupParsingConfiguration.java       | 224 +++++++++++++++++++
 .../microdata/MicrodataParserTest.java          |   5 +-
 pom.xml                                         |   5 +
 9 files changed, 352 insertions(+), 78 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/api/src/main/resources/default-configuration.properties
----------------------------------------------------------------------
diff --git a/api/src/main/resources/default-configuration.properties 
b/api/src/main/resources/default-configuration.properties
index 4f68586..d1d35de 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -76,3 +76,7 @@ any23.extraction.csv.comment=#
 # A confidence threshold for the OpenIE extractions
 # Any extractions below this value will not be processed.
 any23.extraction.openie.confidence.threshold=0.5
+
+# Use legacy setting to parse html
+# with NekoHTML instead of Jsoup
+any23.tagsoup.legacy=off
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index 554845a..59611d4 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -75,6 +75,10 @@
       <artifactId>nekohtml</artifactId>
     </dependency>
     <dependency>
+      <groupId>org.jsoup</groupId>
+      <artifactId>jsoup</artifactId>
+    </dependency>
+    <dependency>
       <groupId>com.beust</groupId>
       <artifactId>jcommander</artifactId>
     </dependency>

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
 
b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
index 34728e5..1e6efdf 100644
--- 
a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java
@@ -123,8 +123,10 @@ public class EmbeddedJSONLDExtractor implements 
Extractor.TagSoupDOMExtractor {
     List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
     for (Node linkNode : linkNodes) {
       NamedNodeMap attributes = linkNode.getAttributes();
-      String rel = attributes.getNamedItem("rel").getTextContent();
-      String href = attributes.getNamedItem("href").getTextContent();
+      Node relNode = attributes.getNamedItem("rel");
+      String rel = relNode == null ? null : relNode.getTextContent();
+      Node hrefNode = attributes.getNamedItem("href");
+      String href = hrefNode == null ? null : hrefNode.getTextContent();
       if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
         prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
       }

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
index c1160fa..822a8eb 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java
@@ -101,7 +101,8 @@ public class HCardExtractor extends 
EntityBasedMicroformatExtractor {
                 report.notifyIssue(
                         IssueReport.IssueLevel.WARNING,
                         "Current node tries to include an ancestor node.",
-                        nodeLocation[0], nodeLocation[1]
+                        nodeLocation == null ? -1 : nodeLocation[0],
+                        nodeLocation == null ? -1 : nodeLocation[1]
                 );
                 continue;
             }

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java 
b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
index a3c6550..3ca4f50 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java
@@ -139,8 +139,10 @@ public class HTMLMetaExtractor implements 
Extractor.TagSoupDOMExtractor {
         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
         for(Node linkNode : linkNodes) {
             NamedNodeMap attributes = linkNode.getAttributes();
-            String rel = attributes.getNamedItem("rel").getTextContent();
-            String href = attributes.getNamedItem("href").getTextContent();
+            Node relNode = attributes.getNamedItem("rel");
+            String rel = relNode == null ? null : relNode.getTextContent();
+            Node hrefNode = attributes.getNamedItem("href");
+            String href = hrefNode == null ? null : hrefNode.getTextContent();
             if(rel != null && href !=null && RDFUtils.isAbsoluteIRI(href)) {
                 prefixes.put(rel, 
SimpleValueFactory.getInstance().createIRI(href));
             }

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java 
b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
index 9ef72f4..2147520 100644
--- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
+++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java
@@ -17,6 +17,7 @@
 
 package org.apache.any23.extractor.html;
 
+import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.any23.validator.DefaultValidator;
 import org.apache.any23.validator.Validator;
 import org.apache.any23.validator.ValidatorException;
@@ -56,6 +57,7 @@ import java.nio.charset.UnsupportedCharsetException;
  * @author Michele Mostarda ([email protected])
  * @author Davide Palmisano ([email protected])
  */
+
 public class TagSoupParser {
 
     public static final String ELEMENT_LOCATION = "Element-Location";
@@ -69,24 +71,32 @@ public class TagSoupParser {
     private final String documentIRI;
 
     private final String encoding;
-    
+
+    private final TagSoupParsingConfiguration config;
+
     private Document result = null;
 
+
     public TagSoupParser(InputStream input, String documentIRI) {
         this.input = input;
         this.documentIRI = documentIRI;
         this.encoding = null;
+
+        config = TagSoupParsingConfiguration.getDefault();
     }
 
     public TagSoupParser(InputStream input, String documentIRI, String 
encoding) {
-        if(encoding != null && !Charset.isSupported(encoding))
+        if (encoding != null && !Charset.isSupported(encoding))
             throw new UnsupportedCharsetException(String.format("Charset %s is 
not supported", encoding));
 
         this.input = input;
         this.documentIRI = documentIRI;
         this.encoding = encoding;
+
+        config = TagSoupParsingConfiguration.getDefault();
     }
 
+
     /**
      * Returns the DOM of the given document IRI. 
      *
@@ -97,22 +107,10 @@ public class TagSoupParser {
         if (result == null) {
             long startTime = System.currentTimeMillis();
             try {
-                result = parse();
-            } catch (SAXException ex) {
-                // should not happen, it's a tag soup parser
-                throw new RuntimeException("Shouldn not happen, it's a tag 
soup parser", ex);
-            } catch (TransformerException ex) {
-                // should not happen, it's a tag soup parser
-                throw new RuntimeException("Shouldn not happen, it's a tag 
soup parser", ex);
-            } catch (NullPointerException ex) {
-                if 
(ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
-                    throw new RuntimeException("Bug in NekoHTML, try upgrading 
to newer release!", ex);
-                } else {
-                    throw ex;
-                }
+                result = config.parse(input, documentIRI, encoding);
             } finally {
                 long elapsed = System.currentTimeMillis() - startTime;
-                logger.debug("Parsed " + documentIRI + " with NekoHTML, " + 
elapsed + "ms");
+                logger.debug("Parsed " + documentIRI + " with " + 
config.name() + ", " + elapsed + "ms");
             }
         }
         result.setDocumentURI(documentIRI);
@@ -142,70 +140,103 @@ public class TagSoupParser {
         return new DocumentReport( validator.validate(dIRI, document, 
applyFix), document );
     }
 
-    private Document parse() throws IOException, SAXException, 
TransformerException {
-        final DOMParser parser = new DOMParser() {
 
-            private QName currentQName;
-            private Augmentations currentAugmentations;
+    static TagSoupParsingConfiguration legacyConfig() {
+        return NekoHTML.instance;
+    }
+
+    private static class NekoHTML extends TagSoupParsingConfiguration {
+
+        private static final NekoHTML instance = new NekoHTML();
 
-            @Override
-            protected Element createElementNode(QName qName) {
-                final Element created = super.createElementNode(qName);
-                if (qName.equals(currentQName) && currentAugmentations != 
null) {
-                    final ElementLocation elementLocation = 
createElementLocation(
-                        currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
-                    );
-                    created.setUserData(ELEMENT_LOCATION, elementLocation, 
null);
+        @Override
+        Document parse(InputStream input, String documentIRI, String encoding) 
throws IOException {
+            try {
+                return parse(input, encoding);
+            } catch (SAXException ex) {
+                // should not happen, it's a tag soup parser
+                throw new RuntimeException("Should not happen, it's a tag soup 
parser", ex);
+            } catch (TransformerException ex) {
+                // should not happen, it's a tag soup parser
+                throw new RuntimeException("Should not happen, it's a tag soup 
parser", ex);
+            } catch (NullPointerException ex) {
+                if 
(ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
+                    throw new RuntimeException("Bug in NekoHTML, try upgrading 
to newer release!", ex);
+                } else {
+                    throw ex;
                 }
-                return created;
             }
+        }
 
-            @Override
-            public void startElement(QName qName, XMLAttributes xmlAttributes, 
Augmentations augmentations)
-            throws XNIException {
-                super.startElement(qName, xmlAttributes, augmentations);
-                currentQName = qName;
-                currentAugmentations = augmentations;
-            }
+        private Document parse(InputStream input, String encoding) throws 
IOException, SAXException, TransformerException {
+            final DOMParser parser = new DOMParser() {
+
+                private QName currentQName;
+                private Augmentations currentAugmentations;
 
-            private ElementLocation createElementLocation(Object obj) {
-                if(obj == null) return null;
-                String pattern = null;
-                try {
-                    pattern = obj.toString();
-                    if( "synthesized".equals(pattern) ) return null;
-                    final String[] parts = pattern.split(":");
-                    return new ElementLocation(
-                            Integer.parseInt(parts[0]),
-                            Integer.parseInt(parts[1]),
-                            Integer.parseInt(parts[3]),
-                            Integer.parseInt(parts[4])
-
-                    );
-                } catch (Exception e) {
-                    logger.warn(
-                            String.format("Unexpected string format for given 
augmentation: [%s]", pattern),
-                            e
-                    );
-                    return null;
+                @Override
+                protected Element createElementNode(QName qName) {
+                    final Element created = super.createElementNode(qName);
+                    if (qName.equals(currentQName) && currentAugmentations != 
null) {
+                        final ElementLocation elementLocation = 
createElementLocation(
+                                
currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
+                        );
+                        created.setUserData(ELEMENT_LOCATION, elementLocation, 
null);
+                    }
+                    return created;
                 }
-            }
-        };
-        parser.setFeature("http://xml.org/sax/features/namespaces", false);
-        
parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims",
 true);
-        parser.setFeature(AUGMENTATIONS_FEATURE, true);
-        if (this.encoding != null)
-            
parser.setProperty("http://cyberneko.org/html/properties/default-encoding", 
this.encoding);
-
-        /*
-         * NOTE: the SpanCloserInputStream has been added to wrap the stream 
passed to the CyberNeko
-         *       parser. This will ensure the correct handling of inline HTML 
SPAN tags.
-         *       This fix is documented at issue #78.       
-         */
-        parser.parse(new InputSource( new SpanCloserInputStream(input)));
-        return parser.getDocument();
+
+                @Override
+                public void startElement(QName qName, XMLAttributes 
xmlAttributes, Augmentations augmentations)
+                        throws XNIException {
+                    super.startElement(qName, xmlAttributes, augmentations);
+                    currentQName = qName;
+                    currentAugmentations = augmentations;
+                }
+
+                private ElementLocation createElementLocation(Object obj) {
+                    if(obj == null) return null;
+                    String pattern = null;
+                    try {
+                        pattern = obj.toString();
+                        if( "synthesized".equals(pattern) ) return null;
+                        final String[] parts = pattern.split(":");
+                        return new ElementLocation(
+                                Integer.parseInt(parts[0]),
+                                Integer.parseInt(parts[1]),
+                                Integer.parseInt(parts[3]),
+                                Integer.parseInt(parts[4])
+
+                        );
+                    } catch (Exception e) {
+                        logger.warn(
+                                String.format("Unexpected string format for 
given augmentation: [%s]", pattern),
+                                e
+                        );
+                        return null;
+                    }
+                }
+            };
+            parser.setFeature("http://xml.org/sax/features/namespaces", false);
+            
parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims",
 true);
+            parser.setFeature(AUGMENTATIONS_FEATURE, true);
+            if (encoding != null)
+                
parser.setProperty("http://cyberneko.org/html/properties/default-encoding", 
encoding);
+
+            /*
+             * NOTE: the SpanCloserInputStream has been added to wrap the 
stream passed to the CyberNeko
+             *       parser. This will ensure the correct handling of inline 
HTML SPAN tags.
+             *       This fix is documented at issue #78.
+             */
+            parser.parse(new InputSource( new SpanCloserInputStream(input)));
+            return parser.getDocument();
+        }
+
+
     }
 
+
+
     /**
      * Describes a <i>DOM Element</i> location.
      */

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
 
b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
new file mode 100644
index 0000000..1cf2538
--- /dev/null
+++ 
b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java
@@ -0,0 +1,224 @@
+package org.apache.any23.extractor.html;
+
+import org.apache.any23.configuration.DefaultConfiguration;
+import org.jsoup.nodes.Attribute;
+import org.jsoup.parser.Parser;
+import org.jsoup.select.NodeTraversor;
+import org.jsoup.select.NodeVisitor;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.Text;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.SequenceInputStream;
+import java.util.Arrays;
+
+abstract class TagSoupParsingConfiguration {
+
+    static final String LEGACY_PROPERTY = "any23.tagsoup.legacy";
+
+    String name() {
+        return getClass().getSimpleName();
+    }
+
+    abstract Document parse(InputStream input, String documentIRI, String 
encoding) throws IOException;
+
+
+    static TagSoupParsingConfiguration getDefault() {
+        return Default.instance;
+    }
+
+    private static class Default {
+
+        private static final TagSoupParsingConfiguration instance = 
DefaultConfiguration.singleton()
+                .getFlagProperty(LEGACY_PROPERTY) ? 
TagSoupParser.legacyConfig() : JsoupConfig.instance;
+
+    }
+
+
+    private static class JsoupConfig extends TagSoupParsingConfiguration {
+
+        private static final JsoupConfig instance = new JsoupConfig();
+
+
+        @Override
+        Document parse(InputStream input, String documentIRI, String encoding) 
throws IOException {
+            //Jsoup doesn't allow null document URIs
+
+            if (documentIRI == null) {
+                documentIRI = "";
+            }
+
+            //workaround for Jsoup issue #1009
+            if (encoding == null) {
+
+                int c;
+                do {
+                    c = input.read();
+                } while (c != -1 && Character.isWhitespace(c));
+
+                if (c != -1) {
+                    int capacity = 256;
+                    byte[] bytes = new byte[capacity];
+                    int length = 0;
+                    bytes[length++] = (byte)c;
+
+                    if (c == '<') {
+                        c = input.read();
+                        if (c != -1) {
+                            bytes[length++] = (byte)c;
+                            if (c == '?') {
+                                c = input.read();
+
+                                while (c != -1) {
+                                    if (length == capacity) {
+                                        capacity *= 2;
+                                        bytes = Arrays.copyOf(bytes, capacity);
+                                    }
+                                    bytes[length++] = (byte)c;
+
+                                    if (c == '>') {
+                                        if (length >= 20 && bytes[length - 2] 
== '?') {
+                                            String decl = "<" + new 
String(bytes, 2, length - 4) + ">";
+                                            org.jsoup.nodes.Document doc = 
org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser());
+                                            for (org.jsoup.nodes.Element el : 
doc.children()) {
+                                                if 
("xml".equalsIgnoreCase(el.tagName())) {
+                                                    String enc = 
el.attr("encoding");
+                                                    if (enc != null && 
!enc.isEmpty()) {
+                                                        encoding = enc;
+                                                        break;
+                                                    }
+                                                }
+                                            }
+                                        }
+                                        break;
+                                    }
+
+                                    c = input.read();
+                                }
+                            }
+                        }
+
+                    }
+
+                    input = new SequenceInputStream(new 
ByteArrayInputStream(bytes, 0, length), input);
+                }
+
+            }
+
+            org.jsoup.nodes.Document document = org.jsoup.Jsoup.parse(input, 
encoding, documentIRI);
+
+            return convert(document);
+        }
+
+
+        private static Document convert(org.jsoup.nodes.Document document) {
+            Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();
+
+            for (org.jsoup.nodes.Element rootEl : document.children()) {
+                NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
+            }
+
+            return w3cDoc;
+        }
+
+        private static class DocumentConverter implements NodeVisitor {
+
+            private final Document doc;
+            private org.w3c.dom.Element dest;
+
+            DocumentConverter(Document doc) {
+                this.doc = doc;
+            }
+
+            @Override
+            public void head(org.jsoup.nodes.Node source, int depth) {
+                if (source instanceof org.jsoup.nodes.Element) {
+                    org.jsoup.nodes.Element sourceEl = 
(org.jsoup.nodes.Element) source;
+
+                    org.w3c.dom.Element el = 
doc.createElement(sourceEl.tagName());
+                    copyAttributes(sourceEl, el);
+                    if (dest == null) {
+                        doc.appendChild(el);
+                    } else {
+                        dest.appendChild(el);
+                    }
+                    dest = el;
+                } else if (source instanceof org.jsoup.nodes.TextNode) {
+                    org.jsoup.nodes.TextNode sourceText = 
(org.jsoup.nodes.TextNode) source;
+                    Text text = doc.createTextNode(sourceText.getWholeText());
+                    dest.appendChild(text);
+                } else if (source instanceof org.jsoup.nodes.Comment) {
+                    org.jsoup.nodes.Comment sourceComment = 
(org.jsoup.nodes.Comment) source;
+                    Comment comment = 
doc.createComment(sourceComment.getData());
+                    dest.appendChild(comment);
+                } else if (source instanceof org.jsoup.nodes.DataNode) {
+                    org.jsoup.nodes.DataNode sourceData = 
(org.jsoup.nodes.DataNode) source;
+                    Text node = 
doc.createTextNode(stripCDATA(sourceData.getWholeData()));
+                    dest.appendChild(node);
+                }
+            }
+
+            @Override
+            public void tail(org.jsoup.nodes.Node source, int depth) {
+                if (source instanceof org.jsoup.nodes.Element && 
dest.getParentNode() instanceof org.w3c.dom.Element) {
+                    dest = (org.w3c.dom.Element) dest.getParentNode();
+                }
+            }
+
+            private void copyAttributes(org.jsoup.nodes.Node source, 
org.w3c.dom.Element el) {
+                for (Attribute attribute : source.attributes()) {
+                    // valid xml attribute names are: 
^[a-zA-Z_:][-a-zA-Z0-9_:.]
+                    String key = 
attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
+                    if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*"))
+                        el.setAttribute(key, attribute.getValue());
+                }
+            }
+        }
+
+        private static String stripCDATA(String string) {
+            return reduceToContent(string, "<![CDATA[", "]]>");
+        }
+
+        private static String reduceToContent(String string, String 
startMarker, String endMarker) {
+            int i = 0;
+            int startContent = -1;
+            int l1 = startMarker.length();
+
+            int l2;
+            char c;
+            for(l2 = endMarker.length(); i < string.length() - l1 - l2; ++i) {
+                c = string.charAt(i);
+                if (!Character.isWhitespace(c)) {
+                    if (c == startMarker.charAt(0) && 
startMarker.equals(string.substring(i, l1 + i))) {
+                        startContent = i + l1;
+                        break;
+                    }
+
+                    return string;
+                }
+            }
+
+            if (startContent != -1) {
+                for(i = string.length() - 1; i > startContent + l2; --i) {
+                    c = string.charAt(i);
+                    if (!Character.isWhitespace(c)) {
+                        if (c == endMarker.charAt(l2 - 1) && 
endMarker.equals(string.substring(i - l2 + 1, i + 1))) {
+
+                            return string.substring(startContent, i - 2);
+                        }
+
+                        return string;
+                    }
+                }
+
+            }
+            return string;
+        }
+
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
----------------------------------------------------------------------
diff --git 
a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
 
b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
index 4fa237e..c58a92b 100644
--- 
a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
+++ 
b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java
@@ -275,10 +275,11 @@ public class MicrodataParserTest {
         }
 
         for(int i = 0; i < errors.length; i++) {
+            //Jsoup doesn't support element locations
             Assert.assertEquals(
                     "Error while comparing error [" + i + "]",
-                    resultContent.getProperty("error" + i),
-                    errors[i].toJSON()
+                    resultContent.getProperty("error" + i).replaceAll("_row\" 
: -?\\d+", "_row\" : -1").replaceAll("_col\" : -?\\d+", "_col\" : -1"),
+                    errors[i].toJSON().replaceAll("_row\" : -?\\d+", "_row\" : 
-1").replaceAll("_col\" : -?\\d+", "_col\" : -1")
             );
         }
     }

http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index dde7581..0b03914 100644
--- a/pom.xml
+++ b/pom.xml
@@ -364,6 +364,11 @@
         <artifactId>nekohtml</artifactId>
         <version>1.9.20</version>
       </dependency>
+      <dependency>
+        <groupId>org.jsoup</groupId>
+        <artifactId>jsoup</artifactId>
+        <version>1.11.2</version>
+      </dependency>
 
       <!-- BEGIN: Tika -->
       <dependency>

[1/2] any23 git commit: ANY23-324 Changed default html parser from NekoHTML to Jsoup. This also indirectly fixes ANY23-317, ANY23-273, ANY23-267, and ANY23-326.

Reply via email to