Repository: any23
Updated Branches:
  refs/heads/master 99f3f0ad9 -> eb5bd0939
ANY23-356 Removed nekohtml dependency Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/3c8ee56f Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/3c8ee56f Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/3c8ee56f Branch: refs/heads/master Commit: 3c8ee56f55c97833d1f6a7a78c78ec15f8b5afd3 Parents: 1d5e0ec Author: Hans <[email protected]> Authored: Mon Jul 2 15:26:15 2018 -0500 Committer: Hans <[email protected]> Committed: Mon Jul 2 18:24:38 2018 -0500 ---------------------------------------------------------------------- .../resources/default-configuration.properties | 4 - cli/pom.xml | 4 - core/pom.xml | 10 +- .../any23/extractor/html/TagSoupParser.java | 115 +------------------ .../html/TagSoupParsingConfiguration.java | 12 +- plugins/html-scraper/pom.xml | 10 +- pom.xml | 6 +- 7 files changed, 13 insertions(+), 148 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/api/src/main/resources/default-configuration.properties ---------------------------------------------------------------------- diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties index a8ca0c2..4f68586 100644 --- a/api/src/main/resources/default-configuration.properties +++ b/api/src/main/resources/default-configuration.properties @@ -76,7 +76,3 @@ any23.extraction.csv.comment=# # A confidence threshold for the OpenIE extractions # Any extractions below this value will not be processed. 
any23.extraction.openie.confidence.threshold=0.5 - -# Use legacy setting to parse html -# with NekoHTML instead of Jsoup -any23.tagsoup.legacy=off http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/cli/pom.xml ---------------------------------------------------------------------- diff --git a/cli/pom.xml b/cli/pom.xml index 0cae013..0f04c62 100644 --- a/cli/pom.xml +++ b/cli/pom.xml @@ -110,10 +110,6 @@ <artifactId>commons-codec</artifactId> </dependency> <dependency> - <groupId>net.sourceforge.nekohtml</groupId> - <artifactId>nekohtml</artifactId> - </dependency> - <dependency> <groupId>com.beust</groupId> <artifactId>jcommander</artifactId> </dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index e492fb6..377a5ee 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -232,11 +232,6 @@ <artifactId>commons-lang</artifactId> </dependency> - <dependency> - <groupId>net.sourceforge.nekohtml</groupId> - <artifactId>nekohtml</artifactId> - </dependency> - <dependency> <!-- used by Tika --> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> @@ -283,6 +278,11 @@ </dependency> <!-- END: POI --> + <dependency> + <groupId>xerces</groupId> + <artifactId>xercesImpl</artifactId> + </dependency> + <!-- BEGIN: Test Dependencies --> <dependency> <groupId>junit</groupId> http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java index d96a07b..4f54018 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java +++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java @@ -20,19 +20,10 @@ package 
org.apache.any23.extractor.html; import org.apache.any23.validator.DefaultValidator; import org.apache.any23.validator.Validator; import org.apache.any23.validator.ValidatorException; -import org.apache.xerces.xni.Augmentations; -import org.apache.xerces.xni.QName; -import org.apache.xerces.xni.XMLAttributes; -import org.apache.xerces.xni.XNIException; -import org.cyberneko.html.parsers.DOMParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; -import javax.xml.transform.TransformerException; import java.io.IOException; import java.io.InputStream; import java.net.URI; @@ -42,13 +33,12 @@ import java.nio.charset.UnsupportedCharsetException; /** * <p>Parses an {@link java.io.InputStream} - * into an <i>HTML DOM</i> tree using a <i>TagSoup</i> parser. + * into an <i>HTML DOM</i> tree. * </p> * <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace * aware, and all element names will be upper case, while attributes - * will be lower case. This is because the - * <a href="http://nekohtml.sourceforge.net/">NekoHTML</a> based <i>TagSoup</i> parser - * by default uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a> + * will be lower case. This is because the HTML parser + * uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a> * implementation, which doesn't support namespaces and forces uppercase element names. 
This works * with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p> * @@ -61,8 +51,6 @@ public class TagSoupParser { public static final String ELEMENT_LOCATION = "Element-Location"; - private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations"; - private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class); private final InputStream input; @@ -139,103 +127,6 @@ public class TagSoupParser { return new DocumentReport( validator.validate(dIRI, document, applyFix), document ); } - - static TagSoupParsingConfiguration legacyConfig() { - return NekoHTML.instance; - } - - private static class NekoHTML extends TagSoupParsingConfiguration { - - private static final NekoHTML instance = new NekoHTML(); - - @Override - Document parse(InputStream input, String documentIRI, String encoding) throws IOException { - try { - return parse(input, encoding); - } catch (SAXException ex) { - // should not happen, it's a tag soup parser - throw new RuntimeException("Should not happen, it's a tag soup parser", ex); - } catch (TransformerException ex) { - // should not happen, it's a tag soup parser - throw new RuntimeException("Should not happen, it's a tag soup parser", ex); - } catch (NullPointerException ex) { - if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) { - throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex); - } else { - throw ex; - } - } - } - - private Document parse(InputStream input, String encoding) throws IOException, SAXException, TransformerException { - final DOMParser parser = new DOMParser() { - - private QName currentQName; - private Augmentations currentAugmentations; - - @Override - protected Element createElementNode(QName qName) { - final Element created = super.createElementNode(qName); - if (qName.equals(currentQName) && currentAugmentations != null) { - final ElementLocation elementLocation = createElementLocation( - 
currentAugmentations.getItem(AUGMENTATIONS_FEATURE) - ); - created.setUserData(ELEMENT_LOCATION, elementLocation, null); - } - return created; - } - - @Override - public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations) - throws XNIException { - super.startElement(qName, xmlAttributes, augmentations); - currentQName = qName; - currentAugmentations = augmentations; - } - - private ElementLocation createElementLocation(Object obj) { - if(obj == null) return null; - String pattern = null; - try { - pattern = obj.toString(); - if( "synthesized".equals(pattern) ) return null; - final String[] parts = pattern.split(":"); - return new ElementLocation( - Integer.parseInt(parts[0]), - Integer.parseInt(parts[1]), - Integer.parseInt(parts[3]), - Integer.parseInt(parts[4]) - - ); - } catch (Exception e) { - logger.warn( - String.format("Unexpected string format for given augmentation: [%s]", pattern), - e - ); - return null; - } - } - }; - parser.setFeature("http://xml.org/sax/features/namespaces", false); - parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true); - parser.setFeature(AUGMENTATIONS_FEATURE, true); - if (encoding != null) - parser.setProperty("http://cyberneko.org/html/properties/default-encoding", encoding); - - /* - * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko - * parser. This will ensure the correct handling of inline HTML SPAN tags. - * This fix is documented at issue #78. - */ - parser.parse(new InputSource( new SpanCloserInputStream(input))); - return parser.getDocument(); - } - - - } - - - /** * Describes a <i>DOM Element</i> location. 
*/ http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java index 2aeaac1..018a333 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java +++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParsingConfiguration.java @@ -17,7 +17,6 @@ package org.apache.any23.extractor.html; -import org.apache.any23.configuration.DefaultConfiguration; import org.jsoup.nodes.Attribute; import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; @@ -35,8 +34,6 @@ import java.io.InputStream; */ abstract class TagSoupParsingConfiguration { - static final String LEGACY_PROPERTY = "any23.tagsoup.legacy"; - String name() { return getClass().getSimpleName(); } @@ -45,14 +42,7 @@ abstract class TagSoupParsingConfiguration { static TagSoupParsingConfiguration getDefault() { - return Default.instance; - } - - private static class Default { - - private static final TagSoupParsingConfiguration instance = DefaultConfiguration.singleton() - .getFlagProperty(LEGACY_PROPERTY) ? 
TagSoupParser.legacyConfig() : JsoupConfig.instance; - + return JsoupConfig.instance; } http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/plugins/html-scraper/pom.xml ---------------------------------------------------------------------- diff --git a/plugins/html-scraper/pom.xml b/plugins/html-scraper/pom.xml index e24f6b6..b651d73 100644 --- a/plugins/html-scraper/pom.xml +++ b/plugins/html-scraper/pom.xml @@ -51,19 +51,11 @@ <dependency> <groupId>net.sourceforge.nekohtml</groupId> <artifactId>nekohtml</artifactId> - <scope>provided</scope> + <version>1.9.22</version> </dependency> <dependency> <groupId>xerces</groupId> <artifactId>xercesImpl</artifactId> - <version>2.12.0</version> - <scope>provided</scope> - <exclusions> - <exclusion> - <groupId>xml-apis</groupId> - <artifactId>xml-apis</artifactId> - </exclusion> - </exclusions> </dependency> <dependency> <groupId>de.l3s.boilerpipe</groupId> http://git-wip-us.apache.org/repos/asf/any23/blob/3c8ee56f/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 8d3d408..1e57b2c 100644 --- a/pom.xml +++ b/pom.xml @@ -382,9 +382,9 @@ <version>1.17</version> </dependency> <dependency> - <groupId>net.sourceforge.nekohtml</groupId> - <artifactId>nekohtml</artifactId> - <version>1.9.20</version> + <groupId>xerces</groupId> + <artifactId>xercesImpl</artifactId> + <version>2.12.0</version> </dependency> <dependency> <groupId>org.jsoup</groupId>
