Repository: any23 Updated Branches: refs/heads/master f36c5e162 -> 07f7421cd
ANY23-324 Changed default html parser from NekoHTML to Jsoup. This also indirectly fixes ANY23-317, ANY23-273, ANY23-267, and ANY23-326. Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/2c76ada3 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/2c76ada3 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/2c76ada3 Branch: refs/heads/master Commit: 2c76ada3bc812c37a46863e0529363f42339582a Parents: f36c5e1 Author: Hans <[email protected]> Authored: Thu Jan 18 15:08:27 2018 -0600 Committer: Hans <[email protected]> Committed: Sun Jan 21 16:47:34 2018 -0600 ---------------------------------------------------------------------- .../resources/default-configuration.properties | 4 + core/pom.xml | 4 + .../extractor/html/EmbeddedJSONLDExtractor.java | 6 +- .../any23/extractor/html/HCardExtractor.java | 3 +- .../any23/extractor/html/HTMLMetaExtractor.java | 6 +- .../any23/extractor/html/TagSoupParser.java | 173 ++++++++------ .../html/TagSoupParsingConfiguration.java | 224 +++++++++++++++++++ .../microdata/MicrodataParserTest.java | 5 +- pom.xml | 5 + 9 files changed, 352 insertions(+), 78 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/api/src/main/resources/default-configuration.properties ---------------------------------------------------------------------- diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties index 4f68586..d1d35de 100644 --- a/api/src/main/resources/default-configuration.properties +++ b/api/src/main/resources/default-configuration.properties @@ -76,3 +76,7 @@ any23.extraction.csv.comment=# # A confidence threshold for the OpenIE extractions # Any extractions below this value will not be processed. any23.extraction.openie.confidence.threshold=0.5 + +# Use legacy setting to parse html +# with NekoHTML instead of Jsoup +any23.tagsoup.legacy=off \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index 554845a..59611d4 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -75,6 +75,10 @@ <artifactId>nekohtml</artifactId> </dependency> <dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + </dependency> + <dependency> <groupId>com.beust</groupId> <artifactId>jcommander</artifactId> </dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java index 34728e5..1e6efdf 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java @@ -123,8 +123,10 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor { List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK"); for (Node linkNode : linkNodes) { NamedNodeMap attributes = linkNode.getAttributes(); - String rel = attributes.getNamedItem("rel").getTextContent(); - String href = attributes.getNamedItem("href").getTextContent(); + Node relNode = attributes.getNamedItem("rel"); + String rel = relNode == null ? null : relNode.getTextContent(); + Node hrefNode = attributes.getNamedItem("href"); + String href = hrefNode == null ? null : hrefNode.getTextContent(); if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) { prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href)); } http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java index c1160fa..822a8eb 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/HCardExtractor.java @@ -101,7 +101,8 @@ public class HCardExtractor extends EntityBasedMicroformatExtractor { report.notifyIssue( IssueReport.IssueLevel.WARNING, "Current node tries to include an ancestor node.", - nodeLocation[0], nodeLocation[1] + nodeLocation == null ? -1 : nodeLocation[0], + nodeLocation == null ? -1 : nodeLocation[1] ); continue; } http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java index a3c6550..3ca4f50 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/HTMLMetaExtractor.java @@ -139,8 +139,10 @@ public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor { List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK"); for(Node linkNode : linkNodes) { NamedNodeMap attributes = linkNode.getAttributes(); - String rel = attributes.getNamedItem("rel").getTextContent(); - String href = attributes.getNamedItem("href").getTextContent(); + Node relNode = attributes.getNamedItem("rel"); + String rel = relNode == null ? null : relNode.getTextContent(); + Node hrefNode = attributes.getNamedItem("href"); + String href = hrefNode == null ? null : hrefNode.getTextContent(); if(rel != null && href !=null && RDFUtils.isAbsoluteIRI(href)) { prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href)); } http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java index 9ef72f4..2147520 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java +++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java @@ -17,6 +17,7 @@ package org.apache.any23.extractor.html; +import org.apache.any23.configuration.DefaultConfiguration; import org.apache.any23.validator.DefaultValidator; import org.apache.any23.validator.Validator; import org.apache.any23.validator.ValidatorException; @@ -56,6 +57,7 @@ import java.nio.charset.UnsupportedCharsetException; * @author Michele Mostarda ([email protected]) * @author Davide Palmisano ([email protected]) */ + public class TagSoupParser { public static final String ELEMENT_LOCATION = "Element-Location"; @@ -69,24 +71,32 @@ public class TagSoupParser { private final String documentIRI; private final String encoding; - + + private final TagSoupParsingConfiguration config; + private Document result = null; + public TagSoupParser(InputStream input, String documentIRI) { this.input = input; this.documentIRI = documentIRI; this.encoding = null; + + config = TagSoupParsingConfiguration.getDefault(); } public TagSoupParser(InputStream input, String documentIRI, String encoding) { - if(encoding != null && !Charset.isSupported(encoding)) + if (encoding != null && !Charset.isSupported(encoding)) throw new UnsupportedCharsetException(String.format("Charset %s is not supported", encoding)); this.input = input; this.documentIRI = documentIRI; this.encoding = encoding; + + config = TagSoupParsingConfiguration.getDefault(); } + /** * Returns the DOM of the given document IRI. * @@ -97,22 +107,10 @@ public class TagSoupParser { if (result == null) { long startTime = System.currentTimeMillis(); try { - result = parse(); - } catch (SAXException ex) { - // should not happen, it's a tag soup parser - throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex); - } catch (TransformerException ex) { - // should not happen, it's a tag soup parser - throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex); - } catch (NullPointerException ex) { - if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) { - throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex); - } else { - throw ex; - } + result = config.parse(input, documentIRI, encoding); } finally { long elapsed = System.currentTimeMillis() - startTime; - logger.debug("Parsed " + documentIRI + " with NekoHTML, " + elapsed + "ms"); + logger.debug("Parsed " + documentIRI + " with " + config.name() + ", " + elapsed + "ms"); } } result.setDocumentURI(documentIRI); @@ -142,70 +140,103 @@ public class TagSoupParser { return new DocumentReport( validator.validate(dIRI, document, applyFix), document ); } - private Document parse() throws IOException, SAXException, TransformerException { - final DOMParser parser = new DOMParser() { - private QName currentQName; - private Augmentations currentAugmentations; + static TagSoupParsingConfiguration legacyConfig() { + return NekoHTML.instance; + } + + private static class NekoHTML extends TagSoupParsingConfiguration { + + private static final NekoHTML instance = new NekoHTML(); - @Override - protected Element createElementNode(QName qName) { - final Element created = super.createElementNode(qName); - if (qName.equals(currentQName) && currentAugmentations != null) { - final ElementLocation elementLocation = createElementLocation( - currentAugmentations.getItem(AUGMENTATIONS_FEATURE) - ); - created.setUserData(ELEMENT_LOCATION, elementLocation, null); + @Override + Document parse(InputStream input, String documentIRI, String encoding) throws IOException { + try { + return parse(input, encoding); + } catch (SAXException ex) { + // should not happen, it's a tag soup parser + throw new RuntimeException("Should not happen, it's a tag soup parser", ex); + } catch (TransformerException ex) { + // should not happen, it's a tag soup parser + throw new RuntimeException("Should not happen, it's a tag soup parser", ex); + } catch (NullPointerException ex) { + if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) { + throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex); + } else { + throw ex; } - return created; } + } - @Override - public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations) - throws XNIException { - super.startElement(qName, xmlAttributes, augmentations); - currentQName = qName; - currentAugmentations = augmentations; - } + private Document parse(InputStream input, String encoding) throws IOException, SAXException, TransformerException { + final DOMParser parser = new DOMParser() { + + private QName currentQName; + private Augmentations currentAugmentations; - private ElementLocation createElementLocation(Object obj) { - if(obj == null) return null; - String pattern = null; - try { - pattern = obj.toString(); - if( "synthesized".equals(pattern) ) return null; - final String[] parts = pattern.split(":"); - return new ElementLocation( - Integer.parseInt(parts[0]), - Integer.parseInt(parts[1]), - Integer.parseInt(parts[3]), - Integer.parseInt(parts[4]) - - ); - } catch (Exception e) { - logger.warn( - String.format("Unexpected string format for given augmentation: [%s]", pattern), - e - ); - return null; + @Override + protected Element createElementNode(QName qName) { + final Element created = super.createElementNode(qName); + if (qName.equals(currentQName) && currentAugmentations != null) { + final ElementLocation elementLocation = createElementLocation( + currentAugmentations.getItem(AUGMENTATIONS_FEATURE) + ); + created.setUserData(ELEMENT_LOCATION, elementLocation, null); + } + return created; } - } - }; - parser.setFeature("http://xml.org/sax/features/namespaces", false); - parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true); - parser.setFeature(AUGMENTATIONS_FEATURE, true); - if (this.encoding != null) - parser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.encoding); - - /* - * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko - * parser. This will ensure the correct handling of inline HTML SPAN tags. - * This fix is documented at issue #78. - */ - parser.parse(new InputSource( new SpanCloserInputStream(input))); - return parser.getDocument(); + + @Override + public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations) + throws XNIException { + super.startElement(qName, xmlAttributes, augmentations); + currentQName = qName; + currentAugmentations = augmentations; + } + + private ElementLocation createElementLocation(Object obj) { + if(obj == null) return null; + String pattern = null; + try { + pattern = obj.toString(); + if( "synthesized".equals(pattern) ) return null; + final String[] parts = pattern.split(":"); + return new ElementLocation( + Integer.parseInt(parts[0]), + Integer.parseInt(parts[1]), + Integer.parseInt(parts[3]), + Integer.parseInt(parts[4]) + + ); + } catch (Exception e) { + logger.warn( + String.format("Unexpected string format for given augmentation: [%s]", pattern), + e + ); + return null; + } + } + }; + parser.setFeature("http://xml.org/sax/features/namespaces", false); + parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true); + parser.setFeature(AUGMENTATIONS_FEATURE, true); + if (encoding != null) + parser.setProperty("http://cyberneko.org/html/properties/default-encoding", encoding); + + /* + * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko + * parser. This will ensure the correct handling of inline HTML SPAN tags. + * This fix is documented at issue #78. + */ + parser.parse(new InputSource( new SpanCloserInputStream(input))); + return parser.getDocument(); + } + + } + + /** * Describes a <i>DOM Element</i> location. */ http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java new file mode 100644 index 0000000..1cf2538 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java @@ -0,0 +1,224 @@ +package org.apache.any23.extractor.html; + +import org.apache.any23.configuration.DefaultConfiguration; +import org.jsoup.nodes.Attribute; +import org.jsoup.parser.Parser; +import org.jsoup.select.NodeTraversor; +import org.jsoup.select.NodeVisitor; +import org.w3c.dom.Comment; +import org.w3c.dom.Document; +import org.w3c.dom.Text; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.SequenceInputStream; +import java.util.Arrays; + +abstract class TagSoupParsingConfiguration { + + static final String LEGACY_PROPERTY = "any23.tagsoup.legacy"; + + String name() { + return getClass().getSimpleName(); + } + + abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException; + + + static TagSoupParsingConfiguration getDefault() { + return Default.instance; + } + + private static class Default { + + private static final TagSoupParsingConfiguration instance = DefaultConfiguration.singleton() + .getFlagProperty(LEGACY_PROPERTY) ? TagSoupParser.legacyConfig() : JsoupConfig.instance; + + } + + + private static class JsoupConfig extends TagSoupParsingConfiguration { + + private static final JsoupConfig instance = new JsoupConfig(); + + + @Override + Document parse(InputStream input, String documentIRI, String encoding) throws IOException { + //Jsoup doesn't allow null document URIs + + if (documentIRI == null) { + documentIRI = ""; + } + + //workaround for Jsoup issue #1009 + if (encoding == null) { + + int c; + do { + c = input.read(); + } while (c != -1 && Character.isWhitespace(c)); + + if (c != -1) { + int capacity = 256; + byte[] bytes = new byte[capacity]; + int length = 0; + bytes[length++] = (byte)c; + + if (c == '<') { + c = input.read(); + if (c != -1) { + bytes[length++] = (byte)c; + if (c == '?') { + c = input.read(); + + while (c != -1) { + if (length == capacity) { + capacity *= 2; + bytes = Arrays.copyOf(bytes, capacity); + } + bytes[length++] = (byte)c; + + if (c == '>') { + if (length >= 20 && bytes[length - 2] == '?') { + String decl = "<" + new String(bytes, 2, length - 4) + ">"; + org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser()); + for (org.jsoup.nodes.Element el : doc.children()) { + if ("xml".equalsIgnoreCase(el.tagName())) { + String enc = el.attr("encoding"); + if (enc != null && !enc.isEmpty()) { + encoding = enc; + break; + } + } + } + } + break; + } + + c = input.read(); + } + } + } + + } + + input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input); + } + + } + + org.jsoup.nodes.Document document = org.jsoup.Jsoup.parse(input, encoding, documentIRI); + + return convert(document); + } + + + private static Document convert(org.jsoup.nodes.Document document) { + Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl(); + + for (org.jsoup.nodes.Element rootEl : document.children()) { + NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl); + } + + return w3cDoc; + } + + private static class DocumentConverter implements NodeVisitor { + + private final Document doc; + private org.w3c.dom.Element dest; + + DocumentConverter(Document doc) { + this.doc = doc; + } + + @Override + public void head(org.jsoup.nodes.Node source, int depth) { + if (source instanceof org.jsoup.nodes.Element) { + org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source; + + org.w3c.dom.Element el = doc.createElement(sourceEl.tagName()); + copyAttributes(sourceEl, el); + if (dest == null) { + doc.appendChild(el); + } else { + dest.appendChild(el); + } + dest = el; + } else if (source instanceof org.jsoup.nodes.TextNode) { + org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source; + Text text = doc.createTextNode(sourceText.getWholeText()); + dest.appendChild(text); + } else if (source instanceof org.jsoup.nodes.Comment) { + org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source; + Comment comment = doc.createComment(sourceComment.getData()); + dest.appendChild(comment); + } else if (source instanceof org.jsoup.nodes.DataNode) { + org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source; + Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData())); + dest.appendChild(node); + } + } + + @Override + public void tail(org.jsoup.nodes.Node source, int depth) { + if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof org.w3c.dom.Element) { + dest = (org.w3c.dom.Element) dest.getParentNode(); + } + } + + private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) { + for (Attribute attribute : source.attributes()) { + // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.] + String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", ""); + if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*")) + el.setAttribute(key, attribute.getValue()); + } + } + } + + private static String stripCDATA(String string) { + return reduceToContent(string, "<![CDATA[", "]]>"); + } + + private static String reduceToContent(String string, String startMarker, String endMarker) { + int i = 0; + int startContent = -1; + int l1 = startMarker.length(); + + int l2; + char c; + for(l2 = endMarker.length(); i < string.length() - l1 - l2; ++i) { + c = string.charAt(i); + if (!Character.isWhitespace(c)) { + if (c == startMarker.charAt(0) && startMarker.equals(string.substring(i, l1 + i))) { + startContent = i + l1; + break; + } + + return string; + } + } + + if (startContent != -1) { + for(i = string.length() - 1; i > startContent + l2; --i) { + c = string.charAt(i); + if (!Character.isWhitespace(c)) { + if (c == endMarker.charAt(l2 - 1) && endMarker.equals(string.substring(i - l2 + 1, i + 1))) { + + return string.substring(startContent, i - 2); + } + + return string; + } + } + + } + return string; + } + + } + + +} http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java index 4fa237e..c58a92b 100644 --- a/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java +++ b/core/src/test/java/org/apache/any23/extractor/microdata/MicrodataParserTest.java @@ -275,10 +275,11 @@ public class MicrodataParserTest { } for(int i = 0; i < errors.length; i++) { + //Jsoup doesn't support element locations Assert.assertEquals( "Error while comparing error [" + i + "]", - resultContent.getProperty("error" + i), - errors[i].toJSON() + resultContent.getProperty("error" + i).replaceAll("_row\" : -?\\d+", "_row\" : -1").replaceAll("_col\" : -?\\d+", "_col\" : -1"), + errors[i].toJSON().replaceAll("_row\" : -?\\d+", "_row\" : -1").replaceAll("_col\" : -?\\d+", "_col\" : -1") ); } } http://git-wip-us.apache.org/repos/asf/any23/blob/2c76ada3/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index dde7581..0b03914 100644 --- a/pom.xml +++ b/pom.xml @@ -364,6 +364,11 @@ <artifactId>nekohtml</artifactId> <version>1.9.20</version> </dependency> + <dependency> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.11.2</version> + </dependency> <!-- BEGIN: Tika --> <dependency>
