This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 035682cdd TIKA-4259 (#1775)
035682cdd is described below
commit 035682cdd9e993cd441f005f62a3b36f410c50b6
Author: Tim Allison <[email protected]>
AuthorDate: Fri May 24 09:27:17 2024 -0400
TIKA-4259 (#1775)
* TIKA-4259 -- refactor xml parser convenience methods out of ParseContext
---
CHANGES.txt | 3 +
.../java/org/apache/tika/parser/ParseContext.java | 170 ---------------------
.../java/org/apache/tika/utils/XMLReaderUtils.java | 59 +++++++
.../org/apache/tika/parser/mock/MockParser.java | 3 +-
.../org/apache/tika/parser/pdf/XFAExtractor.java | 3 +-
5 files changed, 66 insertions(+), 172 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index cc4575ff5..1f3f4cec3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,9 @@ Release 3.0.0-BETA2 - ???
* Updated PST parser to use standard Message metadata keys and improved
handling of embedded files (TIKA-4248).
+ * Convenience methods for XML readers were moved from ParseContext to
+ XMLReaderUtils (TIKA-4259).
+
Other Changes
* Add optional PST parser based on libpst/readpst (TIKA-4250).
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 531f1daa0..691bf7f9f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -16,26 +16,9 @@
*/
package org.apache.tika.parser;
-import java.io.InputStream;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
-import javax.xml.XMLConstants;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.transform.Transformer;
-
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
-import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.utils.XMLReaderUtils;
/**
* Parse context. Used to pass context information to Tika parsers.
@@ -99,157 +82,4 @@ public class ParseContext implements Serializable {
return defaultValue;
}
}
-
- /**
- * Returns the XMLReader specified in this parsing context. If a reader
- * is not explicitly specified, then one is created using the specified
- * or the default SAX parser.
- *
- * @return XMLReader
- * @throws TikaException
- * @see #getSAXParser()
- * @since Apache Tika 1.13
- */
- public XMLReader getXMLReader() throws TikaException {
- XMLReader reader = get(XMLReader.class);
- if (reader != null) {
- return reader;
- }
- return XMLReaderUtils.getXMLReader();
- }
-
- /**
- * Returns the SAX parser specified in this parsing context. If a parser
- * is not explicitly specified, then one is created using the specified
- * or the default SAX parser factory. Consider using
- * {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler,
ParseContext)}
- * for more efficient reuse of SAXParsers.
- *
- * @return SAX parser
- * @throws TikaException if a SAX parser could not be created
- * @see #getSAXParserFactory()
- * @since Apache Tika 0.8
- */
- public SAXParser getSAXParser() throws TikaException {
- SAXParser parser = get(SAXParser.class);
- if (parser != null) {
- return parser;
- } else {
- return XMLReaderUtils.getSAXParser();
- }
- }
-
- /**
- * Returns the SAX parser factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware, not validating, and to use
- * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
- *
- * @return SAX parser factory
- * @since Apache Tika 0.8
- */
- public SAXParserFactory getSAXParserFactory() {
- SAXParserFactory factory = get(SAXParserFactory.class);
- if (factory == null) {
- factory = SAXParserFactory.newInstance();
- factory.setNamespaceAware(true);
- factory.setValidating(false);
- try {
- factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING,
true);
- } catch (ParserConfigurationException | SAXNotSupportedException
e) {
- //swallow
- } catch (SAXNotRecognizedException e) {
- // TIKA-271: Some XML parsers do not support the
- // secure-processing feature, even though it's required by
- // JAXP in Java 5. Ignoring the exception is fine here, as
- // deployments without this feature are inherently vulnerable
- // to XML denial-of-service attacks.
- }
- }
- return factory;
- }
-
- /**
- * Returns the DOM builder factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware and to apply reasonable security
- * features.
- *
- * @return DOM parser factory
- * @since Apache Tika 1.13
- */
- private DocumentBuilderFactory getDocumentBuilderFactory() {
- //borrowed from Apache POI
- DocumentBuilderFactory documentBuilderFactory =
get(DocumentBuilderFactory.class);
- if (documentBuilderFactory != null) {
- return documentBuilderFactory;
- } else {
- return XMLReaderUtils.getDocumentBuilderFactory();
- }
- }
-
- /**
- * Returns the DOM builder specified in this parsing context.
- * If a builder is not explicitly specified, then a builder
- * instance is created and returned. The builder instance is
- * configured to apply an {@link
XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
- * and it sets the ErrorHandler to <code>null</code>.
- * Consider using {@link XMLReaderUtils#buildDOM(InputStream,
ParseContext)}
- * instead for more efficient reuse of document builders.
- *
- * @return DOM Builder
- * @since Apache Tika 1.13
- */
- public DocumentBuilder getDocumentBuilder() throws TikaException {
- DocumentBuilder documentBuilder = get(DocumentBuilder.class);
- if (documentBuilder != null) {
- return documentBuilder;
- } else {
- return XMLReaderUtils.getDocumentBuilder();
- }
- }
-
- /**
- * Returns the StAX input factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware and to apply reasonable security
- * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}.
- *
- * @return StAX input factory
- * @since Apache Tika 1.13
- */
- public XMLInputFactory getXMLInputFactory() {
- XMLInputFactory factory = get(XMLInputFactory.class);
- if (factory != null) {
- return factory;
- }
- return XMLReaderUtils.getXMLInputFactory();
- }
-
-
- /**
- * Returns the transformer specified in this parsing context.
- * <p>
- * If a transformer is not explicitly specified, then a default transformer
- * instance is created and returned. The default transformer instance is
- * configured to to use
- * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
- *
- * @return Transformer
- * @throws TikaException when the transformer can not be created
- * @since Apache Tika 1.17
- */
- public Transformer getTransformer() throws TikaException {
-
- Transformer transformer = get(Transformer.class);
- if (transformer != null) {
- return transformer;
- }
-
- return XMLReaderUtils.getTransformer();
- }
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 262ebfef9..87e4eec9b 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -1101,4 +1101,63 @@ public class XMLReaderUtils implements Serializable {
trySetXercesSecurityManager(saxParser);
}
}
+
+ /**
+ * Returns the DOM builder specified in this parsing context.
+ * If a builder is not explicitly specified, then a builder
+ * instance is created and returned. The builder instance is
+ * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
+ * and it sets the ErrorHandler to <code>null</code>.
+ * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)}
+ * instead for more efficient reuse of document builders.
+ *
+ * @return DOM Builder
+ */
+ public static DocumentBuilder getDocumentBuilder(ParseContext context)
throws TikaException {
+ DocumentBuilder documentBuilder = context.get(DocumentBuilder.class);
+ if (documentBuilder != null) {
+ return documentBuilder;
+ } else {
+ return XMLReaderUtils.getDocumentBuilder();
+ }
+ }
+
+ /**
+ * Returns the StAX input factory specified in this parsing context.
+ * If a factory is not explicitly specified, then a default factory
+ * instance is created and returned. The default factory instance is
+ * configured to be namespace-aware and to apply reasonable security
+ * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}.
+ *
+ * @return StAX input factory
+ */
+ public static XMLInputFactory getXMLInputFactory(ParseContext context) {
+ XMLInputFactory factory = context.get(XMLInputFactory.class);
+ if (factory != null) {
+ return factory;
+ }
+ return XMLReaderUtils.getXMLInputFactory();
+ }
+
+
+ /**
+ * Returns the transformer specified in this parsing context.
+ * <p>
+ * If a transformer is not explicitly specified, then a default transformer
+ * instance is created and returned. The default transformer instance is
+ * configured to use
+ * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+ *
+ * @return Transformer
+ * @throws TikaException when the transformer can not be created
+ */
+ public static Transformer getTransformer(ParseContext context) throws
TikaException {
+
+ Transformer transformer = context.get(Transformer.class);
+ if (transformer != null) {
+ return transformer;
+ }
+
+ return XMLReaderUtils.getTransformer();
+ }
}
diff --git
a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
index de464bca5..92666819c 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
@@ -64,6 +64,7 @@ import org.apache.tika.parser.ParseRecord;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
/**
* This class enables mocking of parser behavior for use in testing
@@ -120,7 +121,7 @@ public class MockParser implements Parser {
}
Document doc = null;
try {
- DocumentBuilder docBuilder = context.getDocumentBuilder();
+ DocumentBuilder docBuilder =
XMLReaderUtils.getDocumentBuilder(context);
doc = docBuilder.parse(new CloseShieldInputStream(stream));
} catch (SAXException e) {
//to distinguish between SAX on read vs SAX while writing
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
index a79e942e8..f22a4ddd3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
@@ -32,6 +32,7 @@ import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
/**
* This class offers an initial capability to
@@ -88,7 +89,7 @@ class XFAExtractor {
//
//As a final step, dump the merged fields and the values.
- XMLStreamReader reader =
context.getXMLInputFactory().createXMLStreamReader(xfaIs);
+ XMLStreamReader reader =
XMLReaderUtils.getXMLInputFactory(context).createXMLStreamReader(xfaIs);
while (reader.hasNext()) {
switch (reader.next()) {
case XMLStreamConstants.START_ELEMENT: