ANY23-340 Removes doctypes to allow extraction of additional RDFa 1.1 triples
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/60d6f616 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/60d6f616 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/60d6f616 Branch: refs/heads/master Commit: 60d6f61644e307def7e6b5e193af2e2d46421b5d Parents: a1b72b7 Author: Hans <[email protected]> Authored: Fri Mar 30 15:03:17 2018 -0500 Committer: Hans <[email protected]> Committed: Fri Mar 30 15:03:17 2018 -0500 ---------------------------------------------------------------------- .../any23/extractor/rdf/BaseRDFExtractor.java | 21 +- .../extractor/rdfa/RDFa11ExtractorTest.java | 10 + .../test/resources/html/BBC_News_Scotland.html | 3780 ++++++++++++++++++ 3 files changed, 3802 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/60d6f616/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 61b58c1..1882ed9 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -29,12 +29,14 @@ import org.eclipse.rdf4j.rio.RDFParser; import org.eclipse.rdf4j.rio.RDFHandlerException; import org.eclipse.rdf4j.rio.RioSetting; import org.eclipse.rdf4j.rio.helpers.BasicParserSettings; +import org.jsoup.nodes.Comment; import org.jsoup.nodes.DataNode; import org.jsoup.nodes.Document; +import org.jsoup.nodes.DocumentType; import org.jsoup.nodes.Entities; import org.jsoup.nodes.Node; +import org.jsoup.select.NodeFilter; import org.jsoup.select.NodeTraversor; -import org.jsoup.select.NodeVisitor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; 
@@ -131,17 +133,18 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { .syntax(Document.OutputSettings.Syntax.xml) .escapeMode(Entities.EscapeMode.xhtml) .charset(charset); - //Delete scripts. Json-ld in script tags is extracted first - //from tag soup dom, so we should be fine. - NodeTraversor.traverse(new NodeVisitor() { + // Delete scripts, comments, and doctypes + // See https://issues.apache.org/jira/browse/ANY23-317 + // and https://issues.apache.org/jira/browse/ANY23-340 + NodeTraversor.filter(new NodeFilter() { @Override - public void head(Node node, int depth) { - if (node instanceof DataNode) { - ((DataNode) node).setWholeData(""); - } + public FilterResult head(Node node, int depth) { + return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType + ? FilterResult.REMOVE : FilterResult.CONTINUE; } @Override - public void tail(Node node, int depth) { + public FilterResult tail(Node node, int depth) { + return FilterResult.CONTINUE; } }, doc); http://git-wip-us.apache.org/repos/asf/any23/blob/60d6f616/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java index 0599aaf..c0767c9 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFa11ExtractorTest.java @@ -61,6 +61,16 @@ public class RDFa11ExtractorTest extends AbstractRDFaExtractorTestCase { } @Test + public void testBBCNewsScotland() { + assertExtract("/html/BBC_News_Scotland.html"); + assertModelNotEmpty(); + assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#navigation"), 1); + assertStatementsSize(null, 
RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#search"), 1); + assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#contentinfo"), 1); + assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"), RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#presentation"), 8); + } + + @Test public void testIssue326() { assertExtract("/html/rdfa/rdfa-issue326-and-267.html"); }
