This is an automated email from the ASF dual-hosted git repository. hansbrende pushed a commit to branch ANY23-442 in repository https://gitbox.apache.org/repos/asf/any23.git
commit 69e2306bd1876d33c4c3c7c676d6d69ee238c153 Author: Hans <[email protected]> AuthorDate: Sat Sep 14 20:14:21 2019 -0500 ANY23-442 move HTML preprocessing from BaseRDFExtractor to semargl extractors --- .../any23/extractor/rdf/BaseRDFExtractor.java | 109 +--------------- .../any23/extractor/rdfa/BaseRDFaExtractor.java | 139 +++++++++++++++++++++ .../any23/extractor/rdfa/RDFa11Extractor.java | 3 +- .../apache/any23/extractor/rdfa/RDFaExtractor.java | 3 +- core/src/test/java/org/apache/any23/Any23Test.java | 10 +- 5 files changed, 152 insertions(+), 112 deletions(-) diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index c11aa8c..0915940 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -102,9 +102,6 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { stopAtFirstError = b; } - private static final Pattern invalidXMLCharacters = Pattern.compile( - "[^\u0009\r\n\u0020-\uD7FF\uE000-\uFFFD\ud800\udc00-\udbff\udfff]"); - @Override public void run( ExtractionParameters extractionParameters, @@ -116,114 +113,12 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { final RDFParser parser = getParser(extractionContext, extractionResult); RDFFormat format = parser.getRDFFormat(); - String iri = extractionContext.getDocumentIRI().stringValue(); - - if (format.hasFileExtension("xhtml") || format.hasMIMEType("application/xhtml+xml")) { - Charset charset = format.getCharset(); - if (charset == null) { - charset = StandardCharsets.UTF_8; - } - Document doc = JsoupUtils.parse(in, iri, null); - doc.outputSettings() - .prettyPrint(false) - .syntax(Document.OutputSettings.Syntax.xml) - .escapeMode(Entities.EscapeMode.xhtml) - .charset(charset); - // Delete scripts, comments, and doctypes - // See https://issues.apache.org/jira/browse/ANY23-317 - // and https://issues.apache.org/jira/browse/ANY23-340 - NodeTraversor.filter(new NodeFilter() { - final HashSet<String> tmpAttributeKeys = new HashSet<>(); - - @Override - public FilterResult head(Node node, int depth) { - if (node instanceof Element) { - HashSet<String> attributeKeys = tmpAttributeKeys; - for (Iterator<Attribute> it = node.attributes().iterator(); it.hasNext(); ) { - // fix for ANY23-350: valid xml attribute names are ^[a-zA-Z_:][-a-zA-Z0-9_:.] - Attribute attr = it.next(); - String oldKey = attr.getKey(); - String newKey = oldKey.replaceAll("[^-a-zA-Z0-9_:.]", ""); - - // fix for ANY23-347: strip non-reserved xml namespaces - // See https://www.w3.org/TR/xml-names/#sec-namespaces - // "All other prefixes beginning with the three-letter sequence x, m, l, - // in any case combination, are reserved. This means that: - // * users SHOULD NOT use them except as defined by later specifications - // * processors MUST NOT treat them as fatal errors." - int prefixlen = newKey.lastIndexOf(':') + 1; - String prefix = newKey.substring(0, prefixlen).toLowerCase(); - newKey = (prefix.startsWith("xml") ? prefix : "") + newKey.substring(prefixlen); - - if (newKey.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") - //the namespace name for "xmlns" MUST NOT be declared - //the namespace name for "xml" need not be declared - && !newKey.startsWith("xmlns:xml") - // fix for ANY23-380: disallow duplicate attribute keys - && attributeKeys.add(newKey)) { - //avoid indexOf() operation if possible - if (!newKey.equals(oldKey)) { - attr.setKey(newKey); - } - } else { - it.remove(); - } - } - attributeKeys.clear(); - - String tagName = ((Element)node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", ""); - tagName = tagName.substring(tagName.lastIndexOf(':') + 1); - ((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div"); - - // fix for ANY23-389 - resolve_base: - if ("base".equalsIgnoreCase(tagName) && node.hasAttr("href")) { - String href = node.attr("href"); - String absHref; - try { - ParsedIRI parsedHref = ParsedIRI.create(href.trim()); - if (parsedHref.isAbsolute()) { - absHref = parsedHref.toString(); - } else { - parsedHref = ParsedIRI.create(iri.trim()).resolve(parsedHref); - if (parsedHref.isAbsolute()) { - absHref = parsedHref.toString(); - } else { - // shouldn't happen unless document IRI wasn't absolute - // ignore and let underlying RDFa parser report the issue - break resolve_base; - } - } - } catch (RuntimeException e) { - // can't parse href as a relative or absolute IRI: - // ignore and let underlying RDFa parser report the issue - break resolve_base; - } - if (!absHref.equals(href)) { - node.attr("href", absHref); - } - } - - return FilterResult.CONTINUE; - } - return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType - ? FilterResult.REMOVE : FilterResult.CONTINUE; - } - @Override - public FilterResult tail(Node node, int depth) { - return FilterResult.CONTINUE; - } - }, doc); - - // fix for ANY23-379: remove invalid xml characters from document - String finalOutput = invalidXMLCharacters.matcher(doc.toString()).replaceAll(""); - in = new ByteArrayInputStream(finalOutput.getBytes(charset)); - } else if (format.hasFileExtension("jsonld") || format.hasMIMEType("application/ld+json")) { + if (format.hasFileExtension("jsonld") || format.hasMIMEType("application/ld+json")) { in = new JsonCleaningInputStream(in); } - parser.parse(in, iri); + parser.parse(in, extractionContext.getDocumentIRI().stringValue()); } catch (Exception ex) { // ANY23-420: jsonld-java can sometimes throw IllegalArgumentException, // so don't limit catch block to RDFParseExceptions diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/BaseRDFaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/BaseRDFaExtractor.java new file mode 100644 index 0000000..def3a37 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/BaseRDFaExtractor.java @@ -0,0 +1,139 @@ +package org.apache.any23.extractor.rdfa; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.html.JsoupUtils; +import org.apache.any23.extractor.rdf.BaseRDFExtractor; +import org.eclipse.rdf4j.common.net.ParsedIRI; +import org.jsoup.nodes.*; +import org.jsoup.select.NodeFilter; +import org.jsoup.select.NodeTraversor; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.Iterator; +import java.util.regex.Pattern; + +abstract class BaseRDFaExtractor extends BaseRDFExtractor { + + + private static final Pattern invalidXMLCharacters = Pattern.compile( + "[^\u0009\r\n\u0020-\uD7FF\uE000-\uFFFD\ud800\udc00-\udbff\udfff]"); + + private static final Charset charset = StandardCharsets.UTF_8; + + BaseRDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) { + super(verifyDataType, stopAtFirstError); + } + + @Override + public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in, ExtractionResult extractionResult) throws IOException, ExtractionException { + + String iri = extractionContext.getDocumentIRI().stringValue(); + + Document doc = JsoupUtils.parse(in, iri, null); + doc.outputSettings() + .prettyPrint(false) + .syntax(Document.OutputSettings.Syntax.xml) + .escapeMode(Entities.EscapeMode.xhtml) + .charset(charset); + // Delete scripts, comments, and doctypes + // See https://issues.apache.org/jira/browse/ANY23-317 + // and https://issues.apache.org/jira/browse/ANY23-340 + NodeTraversor.filter(new NodeFilter() { + final HashSet<String> tmpAttributeKeys = new HashSet<>(); + + @Override + public FilterResult head(Node node, int depth) { + if (node instanceof Element) { + HashSet<String> attributeKeys = tmpAttributeKeys; + for (Iterator<Attribute> it = node.attributes().iterator(); it.hasNext(); ) { + // fix for ANY23-350: valid xml attribute names are ^[a-zA-Z_:][-a-zA-Z0-9_:.] + Attribute attr = it.next(); + String oldKey = attr.getKey(); + String newKey = oldKey.replaceAll("[^-a-zA-Z0-9_:.]", ""); + + // fix for ANY23-347: strip non-reserved xml namespaces + // See https://www.w3.org/TR/xml-names/#sec-namespaces + // "All other prefixes beginning with the three-letter sequence x, m, l, + // in any case combination, are reserved. This means that: + // * users SHOULD NOT use them except as defined by later specifications + // * processors MUST NOT treat them as fatal errors." + int prefixlen = newKey.lastIndexOf(':') + 1; + String prefix = newKey.substring(0, prefixlen).toLowerCase(); + newKey = (prefix.startsWith("xml") ? prefix : "") + newKey.substring(prefixlen); + + if (newKey.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") + //the namespace name for "xmlns" MUST NOT be declared + //the namespace name for "xml" need not be declared + && !newKey.startsWith("xmlns:xml") + // fix for ANY23-380: disallow duplicate attribute keys + && attributeKeys.add(newKey)) { + //avoid indexOf() operation if possible + if (!newKey.equals(oldKey)) { + attr.setKey(newKey); + } + } else { + it.remove(); + } + } + attributeKeys.clear(); + + String tagName = ((Element)node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", ""); + tagName = tagName.substring(tagName.lastIndexOf(':') + 1); + ((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div"); + + // fix for ANY23-389 + resolve_base: + if ("base".equalsIgnoreCase(tagName) && node.hasAttr("href")) { + String href = node.attr("href"); + String absHref; + try { + ParsedIRI parsedHref = ParsedIRI.create(href.trim()); + if (parsedHref.isAbsolute()) { + absHref = parsedHref.toString(); + } else { + parsedHref = ParsedIRI.create(iri.trim()).resolve(parsedHref); + if (parsedHref.isAbsolute()) { + absHref = parsedHref.toString(); + } else { + // shouldn't happen unless document IRI wasn't absolute + // ignore and let underlying RDFa parser report the issue + break resolve_base; + } + } + } catch (RuntimeException e) { + // can't parse href as a relative or absolute IRI: + // ignore and let underlying RDFa parser report the issue + break resolve_base; + } + if (!absHref.equals(href)) { + node.attr("href", absHref); + } + } + + return FilterResult.CONTINUE; + } + return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType + ? FilterResult.REMOVE : FilterResult.CONTINUE; + } + @Override + public FilterResult tail(Node node, int depth) { + return FilterResult.CONTINUE; + } + }, doc); + + // fix for ANY23-379: remove invalid xml characters from document + String finalOutput = invalidXMLCharacters.matcher(doc.toString()).replaceAll(""); + + in = new ByteArrayInputStream(finalOutput.getBytes(charset)); + + super.run(extractionParameters, extractionContext, in, extractionResult); + } +} diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java index 037a362..f61bd35 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java @@ -20,7 +20,6 @@ package org.apache.any23.extractor.rdfa; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; -import org.apache.any23.extractor.rdf.BaseRDFExtractor; import org.apache.any23.extractor.rdf.RDFParserFactory; import org.eclipse.rdf4j.rio.RDFParser; @@ -30,7 +29,7 @@ import org.eclipse.rdf4j.rio.RDFParser; * * @author Michele Mostarda ([email protected]) */ -public class RDFa11Extractor extends BaseRDFExtractor { +public class RDFa11Extractor extends BaseRDFaExtractor { public RDFa11Extractor(boolean verifyDataType, boolean stopAtFirstError) { super(verifyDataType, stopAtFirstError); diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java index 615b16f..8608491 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java @@ -20,7 +20,6 @@ package org.apache.any23.extractor.rdfa; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; -import org.apache.any23.extractor.rdf.BaseRDFExtractor; import org.apache.any23.extractor.rdf.RDFParserFactory; import org.eclipse.rdf4j.rio.RDFParser; @@ -30,7 +29,7 @@ import org.eclipse.rdf4j.rio.RDFParser; * * @author Michele Mostarda ([email protected]) */ -public class RDFaExtractor extends BaseRDFExtractor { +public class RDFaExtractor extends BaseRDFaExtractor { public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) { super(verifyDataType, stopAtFirstError); diff --git a/core/src/test/java/org/apache/any23/Any23Test.java b/core/src/test/java/org/apache/any23/Any23Test.java index 0d91d8b..c9b8814 100644 --- a/core/src/test/java/org/apache/any23/Any23Test.java +++ b/core/src/test/java/org/apache/any23/Any23Test.java @@ -19,6 +19,7 @@ package org.apache.any23; import org.apache.any23.extractor.ExtractorGroup; import org.apache.any23.extractor.rdf.NTriplesExtractorFactory; +import org.apache.http.conn.ConnectTimeoutException; import org.junit.Assert; import org.apache.any23.configuration.Configuration; import org.apache.any23.configuration.DefaultConfiguration; @@ -49,6 +50,7 @@ import org.apache.any23.writer.RepositoryWriter; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; import org.apache.commons.io.IOUtils; +import org.junit.AssumptionViolatedException; import org.junit.Test; import org.eclipse.rdf4j.model.Statement; import org.eclipse.rdf4j.repository.Repository; @@ -304,7 +306,13 @@ public class Any23Test extends Any23OnlineTestBase { "https://dev.w3.org/html5/rdfa/"); ByteArrayOutputStream out = new ByteArrayOutputStream(); TripleHandler handler = new NTriplesWriter(out); - runner.extract(source, handler); + try { + runner.extract(source, handler); + } catch (ConnectTimeoutException e) { + // This page is down as of 2019.09.14 + logger.error("Connection to " + source.getDocumentIRI() + " timed out; skipping test", e); + throw new AssumptionViolatedException(e.getMessage()); + } String n3 = out.toString("UTF-8"); logger.debug("N3 " + n3); Assert.assertTrue(n3.length() > 0);
