This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4259 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 97a0fb3ffd768bb82d8fbb37d4a240a8d0d5e6de Author: tallison <[email protected]> AuthorDate: Thu May 23 16:47:23 2024 -0400 TIKA-4259 -- refactor xml parser convenience methods out of ParseContext --- CHANGES.txt | 3 + .../java/org/apache/tika/parser/ParseContext.java | 170 --------------------- .../java/org/apache/tika/utils/XMLReaderUtils.java | 59 +++++++ .../org/apache/tika/parser/mock/MockParser.java | 3 +- .../org/apache/tika/parser/pdf/XFAExtractor.java | 3 +- 5 files changed, 66 insertions(+), 172 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index cc4575ff5..1f3f4cec3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -5,6 +5,9 @@ Release 3.0.0-BETA2 - ??? * Updated PST parser to use standard Message metadata keys and improved handling of embedded files (TIKA-4248). + * Convenience methods for XML readers were moved from ParseContext to + XMLReaderUtils (TIKA-4259). + Other Changes * Add optional PST parser based on libpst/readpst (TIKA-4250). diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 531f1daa0..691bf7f9f 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -16,26 +16,9 @@ */ package org.apache.tika.parser; -import java.io.InputStream; import java.io.Serializable; import java.util.HashMap; import java.util.Map; -import javax.xml.XMLConstants; -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; -import javax.xml.stream.XMLInputFactory; -import javax.xml.transform.Transformer; - -import org.xml.sax.SAXNotRecognizedException; -import org.xml.sax.SAXNotSupportedException; -import org.xml.sax.XMLReader; -import org.xml.sax.helpers.DefaultHandler; - -import 
org.apache.tika.exception.TikaException; -import org.apache.tika.utils.XMLReaderUtils; /** * Parse context. Used to pass context information to Tika parsers. @@ -99,157 +82,4 @@ public class ParseContext implements Serializable { return defaultValue; } } - - /** - * Returns the XMLReader specified in this parsing context. If a reader - * is not explicitly specified, then one is created using the specified - * or the default SAX parser. - * - * @return XMLReader - * @throws TikaException - * @see #getSAXParser() - * @since Apache Tika 1.13 - */ - public XMLReader getXMLReader() throws TikaException { - XMLReader reader = get(XMLReader.class); - if (reader != null) { - return reader; - } - return XMLReaderUtils.getXMLReader(); - } - - /** - * Returns the SAX parser specified in this parsing context. If a parser - * is not explicitly specified, then one is created using the specified - * or the default SAX parser factory. Consider using - * {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)} - * for more efficient reuse of SAXParsers. - * - * @return SAX parser - * @throws TikaException if a SAX parser could not be created - * @see #getSAXParserFactory() - * @since Apache Tika 0.8 - */ - public SAXParser getSAXParser() throws TikaException { - SAXParser parser = get(SAXParser.class); - if (parser != null) { - return parser; - } else { - return XMLReaderUtils.getSAXParser(); - } - } - - /** - * Returns the SAX parser factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware, not validating, and to use - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. 
- * - * @return SAX parser factory - * @since Apache Tika 0.8 - */ - public SAXParserFactory getSAXParserFactory() { - SAXParserFactory factory = get(SAXParserFactory.class); - if (factory == null) { - factory = SAXParserFactory.newInstance(); - factory.setNamespaceAware(true); - factory.setValidating(false); - try { - factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); - } catch (ParserConfigurationException | SAXNotSupportedException e) { - //swallow - } catch (SAXNotRecognizedException e) { - // TIKA-271: Some XML parsers do not support the - // secure-processing feature, even though it's required by - // JAXP in Java 5. Ignoring the exception is fine here, as - // deployments without this feature are inherently vulnerable - // to XML denial-of-service attacks. - } - } - return factory; - } - - /** - * Returns the DOM builder factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security - * features. - * - * @return DOM parser factory - * @since Apache Tika 1.13 - */ - private DocumentBuilderFactory getDocumentBuilderFactory() { - //borrowed from Apache POI - DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class); - if (documentBuilderFactory != null) { - return documentBuilderFactory; - } else { - return XMLReaderUtils.getDocumentBuilderFactory(); - } - } - - /** - * Returns the DOM builder specified in this parsing context. - * If a builder is not explicitly specified, then a builder - * instance is created and returned. The builder instance is - * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER}, - * and it sets the ErrorHandler to <code>null</code>. - * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)} - * instead for more efficient reuse of document builders. 
- * - * @return DOM Builder - * @since Apache Tika 1.13 - */ - public DocumentBuilder getDocumentBuilder() throws TikaException { - DocumentBuilder documentBuilder = get(DocumentBuilder.class); - if (documentBuilder != null) { - return documentBuilder; - } else { - return XMLReaderUtils.getDocumentBuilder(); - } - } - - /** - * Returns the StAX input factory specified in this parsing context. - * If a factory is not explicitly specified, then a default factory - * instance is created and returned. The default factory instance is - * configured to be namespace-aware and to apply reasonable security - * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}. - * - * @return StAX input factory - * @since Apache Tika 1.13 - */ - public XMLInputFactory getXMLInputFactory() { - XMLInputFactory factory = get(XMLInputFactory.class); - if (factory != null) { - return factory; - } - return XMLReaderUtils.getXMLInputFactory(); - } - - - /** - * Returns the transformer specified in this parsing context. - * <p> - * If a transformer is not explicitly specified, then a default transformer - * instance is created and returned. The default transformer instance is - * configured to to use - * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. 
- * - * @return Transformer - * @throws TikaException when the transformer can not be created - * @since Apache Tika 1.17 - */ - public Transformer getTransformer() throws TikaException { - - Transformer transformer = get(Transformer.class); - if (transformer != null) { - return transformer; - } - - return XMLReaderUtils.getTransformer(); - } - } diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java index 262ebfef9..87e4eec9b 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java @@ -1101,4 +1101,63 @@ public class XMLReaderUtils implements Serializable { trySetXercesSecurityManager(saxParser); } } + + /** + * Returns the DOM builder specified in this parsing context. + * If a builder is not explicitly specified, then a builder + * instance is created and returned. The builder instance is + * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER}, + * and it sets the ErrorHandler to <code>null</code>. + * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)} + * instead for more efficient reuse of document builders. + * + * @return DOM Builder + */ + public static DocumentBuilder getDocumentBuilder(ParseContext context) throws TikaException { + DocumentBuilder documentBuilder = context.get(DocumentBuilder.class); + if (documentBuilder != null) { + return documentBuilder; + } else { + return XMLReaderUtils.getDocumentBuilder(); + } + } + + /** + * Returns the StAX input factory specified in this parsing context. + * If a factory is not explicitly specified, then a default factory + * instance is created and returned. The default factory instance is + * configured to be namespace-aware and to apply reasonable security + * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}. 
+ * + * @return StAX input factory + */ + public static XMLInputFactory getXMLInputFactory(ParseContext context) { + XMLInputFactory factory = context.get(XMLInputFactory.class); + if (factory != null) { + return factory; + } + return XMLReaderUtils.getXMLInputFactory(); + } + + + /** + * Returns the transformer specified in this parsing context. + * <p> + * If a transformer is not explicitly specified, then a default transformer + * instance is created and returned. The default transformer instance is + * configured to use + * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}. + * + * @return Transformer + * @throws TikaException when the transformer can not be created + */ + public static Transformer getTransformer(ParseContext context) throws TikaException { + + Transformer transformer = context.get(Transformer.class); + if (transformer != null) { + return transformer; + } + + return XMLReaderUtils.getTransformer(); + } } diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java index de464bca5..92666819c 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java @@ -64,6 +64,7 @@ import org.apache.tika.parser.ParseRecord; import org.apache.tika.parser.Parser; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.XMLReaderUtils; /** * This class enables mocking of parser behavior for use in testing @@ -120,7 +121,7 @@ public class MockParser implements Parser { } Document doc = null; try { - DocumentBuilder docBuilder = context.getDocumentBuilder(); + DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder(context); doc = docBuilder.parse(new CloseShieldInputStream(stream)); } catch (SAXException e) { //to distinguish between SAX on read vs SAX while writing diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java index a79e942e8..f22a4ddd3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java @@ -32,6 +32,7 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.XMLReaderUtils; /** * This class offers an initial capability to @@ -88,7 +89,7 @@ class XFAExtractor { // //As a final step, dump the merged fields and the values. - XMLStreamReader reader = context.getXMLInputFactory().createXMLStreamReader(xfaIs); + XMLStreamReader reader = XMLReaderUtils.getXMLInputFactory(context).createXMLStreamReader(xfaIs); while (reader.hasNext()) { switch (reader.next()) { case XMLStreamConstants.START_ELEMENT:
