This is an automated email from the ASF dual-hosted git repository. hansbrende pushed a commit to branch ANY23-443 in repository https://gitbox.apache.org/repos/asf/any23.git
commit 50cfb2fd7f3112e27c44ab5850117bacda22a679 Author: Hans <[email protected]> AuthorDate: Sun Sep 15 20:40:26 2019 -0500 ANY23-443 improve speed & stability of RDFa extractors --- .../any23/extractor/rdf/BaseRDFExtractor.java | 2 +- .../any23/extractor/rdfa/BaseRDFaExtractor.java | 161 +++++---------------- .../apache/any23/extractor/rdfa/JsoupScanner.java | 159 ++++++++++++++++++++ .../any23/extractor/rdfa/RDFa11Extractor.java | 7 +- .../apache/any23/extractor/rdfa/RDFaExtractor.java | 7 +- .../apache/any23/extractor/rdfa/SemarglSink.java | 79 ++++++++++ 6 files changed, 286 insertions(+), 129 deletions(-) diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 153fda5..25d105e 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -120,7 +120,7 @@ public abstract class BaseRDFExtractor implements Extractor.ContentExtractor { } } - private static String toString(Throwable th) { + protected static String toString(Throwable th) { StringWriter writer = new StringWriter(); try (PrintWriter pw = new PrintWriter(writer)) { th.printStackTrace(pw); diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/BaseRDFaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/BaseRDFaExtractor.java index 654e093..c183499 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/BaseRDFaExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/BaseRDFaExtractor.java @@ -4,143 +4,56 @@ import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionParameters; import org.apache.any23.extractor.ExtractionResult; -import org.apache.any23.extractor.html.JsoupUtils; +import org.apache.any23.extractor.IssueReport; import org.apache.any23.extractor.rdf.BaseRDFExtractor; -import org.eclipse.rdf4j.common.net.ParsedIRI; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Comment; -import org.jsoup.nodes.DataNode; +import org.apache.any23.rdf.Any23ValueFactoryWrapper; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings; +import org.jsoup.Jsoup; import org.jsoup.nodes.Document; -import org.jsoup.nodes.DocumentType; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Entities; -import org.jsoup.nodes.Node; -import org.jsoup.select.NodeFilter; -import org.jsoup.select.NodeTraversor; +import org.jsoup.parser.ParseSettings; +import org.jsoup.parser.Parser; +import org.semarglproject.rdf.rdfa.RdfaParser; +import org.semarglproject.rdf4j.rdf.rdfa.SemarglParserSettings; +import org.semarglproject.sink.XmlSink; +import org.semarglproject.source.StreamProcessor; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.HashSet; -import java.util.Iterator; -import java.util.regex.Pattern; abstract class BaseRDFaExtractor extends BaseRDFExtractor { + private final short version; - private static final Pattern invalidXMLCharacters = Pattern.compile( - "[^\u0009\r\n\u0020-\uD7FF\uE000-\uFFFD\ud800\udc00-\udbff\udfff]"); - - private static final Charset charset = StandardCharsets.UTF_8; - - BaseRDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) { - super(verifyDataType, stopAtFirstError); + BaseRDFaExtractor(short version) { + super(false, false); + this.version = version; } @Override public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in, ExtractionResult extractionResult) throws IOException, ExtractionException { - String iri = extractionContext.getDocumentIRI().stringValue(); - - Document doc = JsoupUtils.parse(in, iri, null); - doc.outputSettings() - .prettyPrint(false) - .syntax(Document.OutputSettings.Syntax.xml) - .escapeMode(Entities.EscapeMode.xhtml) - .charset(charset); - // Delete scripts, comments, and doctypes - // See https://issues.apache.org/jira/browse/ANY23-317 - // and https://issues.apache.org/jira/browse/ANY23-340 - NodeTraversor.filter(new NodeFilter() { - final HashSet<String> tmpAttributeKeys = new HashSet<>(); - - @Override - public FilterResult head(Node node, int depth) { - if (node instanceof Element) { - HashSet<String> attributeKeys = tmpAttributeKeys; - for (Iterator<Attribute> it = node.attributes().iterator(); it.hasNext(); ) { - // fix for ANY23-350: valid xml attribute names are ^[a-zA-Z_:][-a-zA-Z0-9_:.] - Attribute attr = it.next(); - String oldKey = attr.getKey(); - String newKey = oldKey.replaceAll("[^-a-zA-Z0-9_:.]", ""); - - // fix for ANY23-347: strip non-reserved xml namespaces - // See https://www.w3.org/TR/xml-names/#sec-namespaces - // "All other prefixes beginning with the three-letter sequence x, m, l, - // in any case combination, are reserved. This means that: - // * users SHOULD NOT use them except as defined by later specifications - // * processors MUST NOT treat them as fatal errors." - int prefixlen = newKey.lastIndexOf(':') + 1; - String prefix = newKey.substring(0, prefixlen).toLowerCase(); - newKey = (prefix.startsWith("xml") ? prefix : "") + newKey.substring(prefixlen); - - if (newKey.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") - //the namespace name for "xmlns" MUST NOT be declared - //the namespace name for "xml" need not be declared - && !newKey.startsWith("xmlns:xml") - // fix for ANY23-380: disallow duplicate attribute keys - && attributeKeys.add(newKey)) { - //avoid indexOf() operation if possible - if (!newKey.equals(oldKey)) { - attr.setKey(newKey); - } - } else { - it.remove(); - } - } - attributeKeys.clear(); - - String tagName = ((Element)node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", ""); - tagName = tagName.substring(tagName.lastIndexOf(':') + 1); - ((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div"); - - // fix for ANY23-389 - resolve_base: - if ("base".equalsIgnoreCase(tagName) && node.hasAttr("href")) { - String href = node.attr("href"); - String absHref; - try { - ParsedIRI parsedHref = ParsedIRI.create(href.trim()); - if (parsedHref.isAbsolute()) { - absHref = parsedHref.toString(); - } else { - parsedHref = ParsedIRI.create(iri.trim()).resolve(parsedHref); - if (parsedHref.isAbsolute()) { - absHref = parsedHref.toString(); - } else { - // shouldn't happen unless document IRI wasn't absolute - // ignore and let underlying RDFa parser report the issue - break resolve_base; - } - } - } catch (RuntimeException e) { - // can't parse href as a relative or absolute IRI: - // ignore and let underlying RDFa parser report the issue - break resolve_base; - } - if (!absHref.equals(href)) { - node.attr("href", absHref); - } - } - - return FilterResult.CONTINUE; - } - return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType - ? FilterResult.REMOVE : FilterResult.CONTINUE; - } - @Override - public FilterResult tail(Node node, int depth) { - return FilterResult.CONTINUE; - } - }, doc); - - // fix for ANY23-379: remove invalid xml characters from document - String finalOutput = invalidXMLCharacters.matcher(doc.toString()).replaceAll(""); - - in = new ByteArrayInputStream(finalOutput.getBytes(charset)); - - super.run(extractionParameters, extractionContext, in, extractionResult); + SemarglSink rdfaSink = new SemarglSink(extractionResult, new Any23ValueFactoryWrapper( + SimpleValueFactory.getInstance(), + extractionResult, + extractionContext.getDefaultLanguage() + )); + + XmlSink xmlSink = RdfaParser.connect(rdfaSink); + xmlSink.setProperty(StreamProcessor.PROCESSOR_GRAPH_HANDLER_PROPERTY, rdfaSink); + xmlSink.setProperty(RdfaParser.RDFA_VERSION_PROPERTY, version); + xmlSink.setProperty(RdfaParser.ENABLE_VOCAB_EXPANSION, RDFaParserSettings.VOCAB_EXPANSION_ENABLED.getDefaultValue()); + xmlSink.setProperty(RdfaParser.ENABLE_PROCESSOR_GRAPH, SemarglParserSettings.PROCESSOR_GRAPH_ENABLED.getDefaultValue()); + + String baseUri = extractionContext.getDocumentIRI().stringValue(); + xmlSink.setBaseUri(baseUri); + Document doc = Jsoup.parse(in, null, baseUri, Parser.htmlParser().settings(ParseSettings.preserveCase)); + try { + xmlSink.startDocument(); + doc.traverse(new JsoupScanner(xmlSink)); + xmlSink.endDocument(); + } catch (Exception e) { + extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, toString(e), -1, -1); + } } } diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/JsoupScanner.java b/core/src/main/java/org/apache/any23/extractor/rdfa/JsoupScanner.java new file mode 100644 index 0000000..7fec69c --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/JsoupScanner.java @@ -0,0 +1,159 @@ +package org.apache.any23.extractor.rdfa; + +import org.jsoup.nodes.CDataNode; +import org.jsoup.nodes.Comment; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; +import org.semarglproject.sink.XmlSink; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import org.xml.sax.helpers.NamespaceSupport; + +import java.util.ArrayList; + +class JsoupScanner implements NodeVisitor { + + private final NamespaceSupport ns = new NamespaceSupport(); + private final AttributesImpl attrs = new AttributesImpl(); + private final String[] nameParts = new String[3]; + + private final XmlSink handler; + + JsoupScanner(XmlSink handler) { + this.handler = handler; + } + + private static String orEmpty(String str) { + return str == null ? "" : str; + } + +// private static String orNull(String str) { +// return "".equals(str) ? null : str; +// } + + private void startElement(Element e) throws SAXException { + ns.pushContext(); + + attrs.clear(); + final ArrayList<String> remainingAttrs = new ArrayList<>(); + for (org.jsoup.nodes.Attribute attr : e.attributes()) { + String name = attr.getKey(); + String value = attr.getValue(); + if (name.startsWith("xmlns")) { + if (name.length() == 5) { + ns.declarePrefix("", value); + handler.startPrefixMapping("", value); + continue; + } else if (name.charAt(5) == ':') { + String localName = name.substring(6); + ns.declarePrefix(localName, value); + handler.startPrefixMapping(localName, value); + continue; + } + } + + remainingAttrs.add(name); + remainingAttrs.add(value); + } + + for (int i = 0, len = remainingAttrs.size(); i < len; i += 2) { + String name = remainingAttrs.get(i); + String value = remainingAttrs.get(i + 1); + String[] parts = ns.processName(name, nameParts, true); + if (parts != null) { + attrs.addAttribute(orEmpty(parts[0]), orEmpty(parts[1]), parts[2], "CDATA", value); + } + } + + String qName = e.tagName(); + + String[] parts = ns.processName(qName, nameParts, false); + if (parts == null) { + handler.startElement("", "", qName, attrs); + } else { + handler.startElement(orEmpty(parts[0]), orEmpty(parts[1]), parts[2], attrs); + } + + } + + private void endElement(Element e) throws SAXException { + + String qName = e.tagName(); + String[] parts = ns.processName(qName, nameParts, false); + if (parts == null) { + handler.endElement("", "", qName); + } else { + handler.endElement(orEmpty(parts[0]), orEmpty(parts[1]), parts[2]); + } + + for (org.jsoup.nodes.Attribute attr : e.attributes()) { + String name = attr.getKey(); + if (name.startsWith("xmlns")) { + if (name.length() == 5) { + handler.endPrefixMapping(""); + } else if (name.charAt(5) == ':') { + String localName = name.substring(6); + handler.endPrefixMapping(localName); + } + } + } + + ns.popContext(); + } + + private void handleText(String str) throws SAXException { + handler.characters(str.toCharArray(), 0, str.length()); + } + + private void handleComment(String str) throws SAXException { + handler.comment(str.toCharArray(), 0, str.length()); + } + + + @Override + public void head(Node node, int depth) { + try { + if (node instanceof Element) { + startElement((Element) node); + } else if (node instanceof CDataNode) { + handler.startCDATA(); + handleText(((CDataNode) node).text()); + } else if (node instanceof TextNode) { + handleText(((TextNode) node).text()); + // TODO support document types +// } else if (node instanceof DocumentType) { +// DocumentType dt = (DocumentType)node; +// handler.startDTD(dt.attr("name"), orNull(dt.attr("publicId")), orNull(dt.attr("systemId"))); + } else if (node instanceof Comment) { + handleComment(((Comment) node).getData()); + } + } catch (SAXException e) { + sneakyThrow(e); + } + } + + @Override + public void tail(Node node, int depth) { + try { + if (node instanceof Element) { + endElement((Element) node); + } else if (node instanceof CDataNode) { + handler.endCDATA(); +// } else if (node instanceof DocumentType) { +// handler.endDTD(); + } + } catch (SAXException e) { + sneakyThrow(e); + } + } + + + + @SuppressWarnings("unchecked") + private static <E extends Throwable> void sneakyThrow(Throwable e) throws E { + throw (E)e; + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java index f61bd35..ae6c5ae 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFa11Extractor.java @@ -22,6 +22,7 @@ import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.rdf.RDFParserFactory; import org.eclipse.rdf4j.rio.RDFParser; +import org.semarglproject.vocab.RDFa; /** * {@link org.apache.any23.extractor.Extractor} implementation for @@ -31,12 +32,13 @@ import org.eclipse.rdf4j.rio.RDFParser; */ public class RDFa11Extractor extends BaseRDFaExtractor { + @Deprecated public RDFa11Extractor(boolean verifyDataType, boolean stopAtFirstError) { - super(verifyDataType, stopAtFirstError); + this(); } public RDFa11Extractor() { - this(false, false); + super(RDFa.VERSION_11); } @Override @@ -45,6 +47,7 @@ public class RDFa11Extractor extends BaseRDFaExtractor { } @Override + @Deprecated protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) { return RDFParserFactory.getInstance().getRDFa11Parser( isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java index 8608491..1d8eda6 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/RDFaExtractor.java @@ -22,6 +22,7 @@ import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.rdf.RDFParserFactory; import org.eclipse.rdf4j.rio.RDFParser; +import org.semarglproject.vocab.RDFa; /** * {@link org.apache.any23.extractor.Extractor} implementation for @@ -31,12 +32,13 @@ import org.eclipse.rdf4j.rio.RDFParser; */ public class RDFaExtractor extends BaseRDFaExtractor { + @Deprecated public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) { - super(verifyDataType, stopAtFirstError); + this(); } public RDFaExtractor() { - this(false, false); + super(RDFa.VERSION_10); } @Override @@ -45,6 +47,7 @@ public class RDFaExtractor extends BaseRDFaExtractor { } @Override + @Deprecated protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) { return RDFParserFactory.getInstance().getRDFa10Parser( isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/SemarglSink.java b/core/src/main/java/org/apache/any23/extractor/rdfa/SemarglSink.java new file mode 100644 index 0000000..3e043f1 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/SemarglSink.java @@ -0,0 +1,79 @@ +package org.apache.any23.extractor.rdfa; + +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.IssueReport; +import org.eclipse.rdf4j.model.Resource; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.ValueFactory; + +final class SemarglSink implements org.semarglproject.sink.TripleSink, org.semarglproject.rdf.ProcessorGraphHandler { + + private static final String BNODE_PREFIX = org.semarglproject.vocab.RDF.BNODE_PREFIX; + + private final ExtractionResult handler; + private final ValueFactory valueFactory; + + SemarglSink(ExtractionResult handler, ValueFactory valueFactory) { + this.handler = handler; + this.valueFactory = valueFactory; + } + + private Resource createResource(String arg) { + if (arg.startsWith(BNODE_PREFIX)) { + return valueFactory.createBNode(arg.substring(BNODE_PREFIX.length())); + } + return valueFactory.createIRI(arg); + } + + private void writeTriple(String s, String p, Value o) { + handler.writeTriple(createResource(s), valueFactory.createIRI(p), o); + } + + @Override + public final void addNonLiteral(String s, String p, String o) { + writeTriple(s, p, createResource(o)); + } + + @Override + public final void addPlainLiteral(String s, String p, String o, String lang) { + writeTriple(s, p, lang == null ? valueFactory.createLiteral(o) : valueFactory.createLiteral(o, lang)); + } + + @Override + public final void addTypedLiteral(String s, String p, String o, String type) { + writeTriple(s, p, valueFactory.createLiteral(o, valueFactory.createIRI(type))); + } + + @Override + public void startStream() { + + } + + @Override + public void endStream() { + } + + @Override + public boolean setProperty(String key, Object value) { + return false; + } + + @Override + public void setBaseUri(String baseUri) { + } + + @Override + public void info(String infoClass, String message) { + + } + + @Override + public void warning(String warningClass, String message) { + handler.notifyIssue(IssueReport.IssueLevel.WARNING, message, -1, -1); + } + + @Override + public void error(String errorClass, String message) { + handler.notifyIssue(IssueReport.IssueLevel.ERROR, message, -1, -1); + } +}
