This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4259
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 97a0fb3ffd768bb82d8fbb37d4a240a8d0d5e6de
Author: tallison <[email protected]>
AuthorDate: Thu May 23 16:47:23 2024 -0400

    TIKA-4259 -- refactor xml parser convenience methods out of ParseContext
---
 CHANGES.txt                                        |   3 +
 .../java/org/apache/tika/parser/ParseContext.java  | 170 ---------------------
 .../java/org/apache/tika/utils/XMLReaderUtils.java |  59 +++++++
 .../org/apache/tika/parser/mock/MockParser.java    |   3 +-
 .../org/apache/tika/parser/pdf/XFAExtractor.java   |   3 +-
 5 files changed, 66 insertions(+), 172 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index cc4575ff5..1f3f4cec3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -5,6 +5,9 @@ Release 3.0.0-BETA2 - ???
    * Updated PST parser to use standard Message metadata keys and improved
      handling of embedded files (TIKA-4248).
 
+   * Convenience methods for XML readers were moved from ParseContext to
+     XMLReaderUtils (TIKA-4259).
+
    Other Changes
 
    * Add optional PST parser based on libpst/readpst (TIKA-4250).
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 531f1daa0..691bf7f9f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -16,26 +16,9 @@
  */
 package org.apache.tika.parser;
 
-import java.io.InputStream;
 import java.io.Serializable;
 import java.util.HashMap;
 import java.util.Map;
-import javax.xml.XMLConstants;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.transform.Transformer;
-
-import org.xml.sax.SAXNotRecognizedException;
-import org.xml.sax.SAXNotSupportedException;
-import org.xml.sax.XMLReader;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.utils.XMLReaderUtils;
 
 /**
  * Parse context. Used to pass context information to Tika parsers.
@@ -99,157 +82,4 @@ public class ParseContext implements Serializable {
             return defaultValue;
         }
     }
-
-    /**
-     * Returns the XMLReader specified in this parsing context. If a reader
-     * is not explicitly specified, then one is created using the specified
-     * or the default SAX parser.
-     *
-     * @return XMLReader
-     * @throws TikaException
-     * @see #getSAXParser()
-     * @since Apache Tika 1.13
-     */
-    public XMLReader getXMLReader() throws TikaException {
-        XMLReader reader = get(XMLReader.class);
-        if (reader != null) {
-            return reader;
-        }
-        return XMLReaderUtils.getXMLReader();
-    }
-
-    /**
-     * Returns the SAX parser specified in this parsing context. If a parser
-     * is not explicitly specified, then one is created using the specified
-     * or the default SAX parser factory. Consider using
-     * {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)}
-     * for more efficient reuse of SAXParsers.
-     *
-     * @return SAX parser
-     * @throws TikaException if a SAX parser could not be created
-     * @see #getSAXParserFactory()
-     * @since Apache Tika 0.8
-     */
-    public SAXParser getSAXParser() throws TikaException {
-        SAXParser parser = get(SAXParser.class);
-        if (parser != null) {
-            return parser;
-        } else {
-            return XMLReaderUtils.getSAXParser();
-        }
-    }
-
-    /**
-     * Returns the SAX parser factory specified in this parsing context.
-     * If a factory is not explicitly specified, then a default factory
-     * instance is created and returned. The default factory instance is
-     * configured to be namespace-aware, not validating, and to use
-     * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
-     *
-     * @return SAX parser factory
-     * @since Apache Tika 0.8
-     */
-    public SAXParserFactory getSAXParserFactory() {
-        SAXParserFactory factory = get(SAXParserFactory.class);
-        if (factory == null) {
-            factory = SAXParserFactory.newInstance();
-            factory.setNamespaceAware(true);
-            factory.setValidating(false);
-            try {
-                factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
-            } catch (ParserConfigurationException | SAXNotSupportedException e) {
-                //swallow
-            } catch (SAXNotRecognizedException e) {
-                // TIKA-271: Some XML parsers do not support the
-                // secure-processing feature, even though it's required by
-                // JAXP in Java 5. Ignoring the exception is fine here, as
-                // deployments without this feature are inherently vulnerable
-                // to XML denial-of-service attacks.
-            }
-        }
-        return factory;
-    }
-
-    /**
-     * Returns the DOM builder factory specified in this parsing context.
-     * If a factory is not explicitly specified, then a default factory
-     * instance is created and returned. The default factory instance is
-     * configured to be namespace-aware and to apply reasonable security
-     * features.
-     *
-     * @return DOM parser factory
-     * @since Apache Tika 1.13
-     */
-    private DocumentBuilderFactory getDocumentBuilderFactory() {
-        //borrowed from Apache POI
-        DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class);
-        if (documentBuilderFactory != null) {
-            return documentBuilderFactory;
-        } else {
-            return XMLReaderUtils.getDocumentBuilderFactory();
-        }
-    }
-
-    /**
-     * Returns the DOM builder specified in this parsing context.
-     * If a builder is not explicitly specified, then a builder
-     * instance is created and returned. The builder instance is
-     * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
-     * and it sets the ErrorHandler to <code>null</code>.
-     * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)}
-     * instead for more efficient reuse of document builders.
-     *
-     * @return DOM Builder
-     * @since Apache Tika 1.13
-     */
-    public DocumentBuilder getDocumentBuilder() throws TikaException {
-        DocumentBuilder documentBuilder = get(DocumentBuilder.class);
-        if (documentBuilder != null) {
-            return documentBuilder;
-        } else {
-            return XMLReaderUtils.getDocumentBuilder();
-        }
-    }
-
-    /**
-     * Returns the StAX input factory specified in this parsing context.
-     * If a factory is not explicitly specified, then a default factory
-     * instance is created and returned. The default factory instance is
-     * configured to be namespace-aware and to apply reasonable security
-     * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}.
-     *
-     * @return StAX input factory
-     * @since Apache Tika 1.13
-     */
-    public XMLInputFactory getXMLInputFactory() {
-        XMLInputFactory factory = get(XMLInputFactory.class);
-        if (factory != null) {
-            return factory;
-        }
-        return XMLReaderUtils.getXMLInputFactory();
-    }
-
-
-    /**
-     * Returns the transformer specified in this parsing context.
-     * <p>
-     * If a transformer is not explicitly specified, then a default transformer
-     * instance is created and returned. The default transformer instance is
-     * configured to to use
-     * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
-     *
-     * @return Transformer
-     * @throws TikaException when the transformer can not be created
-     * @since Apache Tika 1.17
-     */
-    public Transformer getTransformer() throws TikaException {
-
-        Transformer transformer = get(Transformer.class);
-        if (transformer != null) {
-            return transformer;
-        }
-
-        return XMLReaderUtils.getTransformer();
-    }
-
 }
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 262ebfef9..87e4eec9b 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -1101,4 +1101,63 @@ public class XMLReaderUtils implements Serializable {
             trySetXercesSecurityManager(saxParser);
         }
     }
+
+    /**
+     * Returns the DOM builder specified in this parsing context.
+     * If a builder is not explicitly specified, then a builder
+     * instance is created and returned. The builder instance is
+     * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
+     * and it sets the ErrorHandler to <code>null</code>.
+     * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)}
+     * instead for more efficient reuse of document builders.
+     *
+     * @return DOM Builder
+     */
+    public static DocumentBuilder getDocumentBuilder(ParseContext context) throws TikaException {
+        DocumentBuilder documentBuilder = context.get(DocumentBuilder.class);
+        if (documentBuilder != null) {
+            return documentBuilder;
+        } else {
+            return XMLReaderUtils.getDocumentBuilder();
+        }
+    }
+
+    /**
+     * Returns the StAX input factory specified in this parsing context.
+     * If a factory is not explicitly specified, then a default factory
+     * instance is created and returned. The default factory instance is
+     * configured to be namespace-aware and to apply reasonable security
+     * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}.
+     *
+     * @return StAX input factory
+     */
+    public static XMLInputFactory getXMLInputFactory(ParseContext context) {
+        XMLInputFactory factory = context.get(XMLInputFactory.class);
+        if (factory != null) {
+            return factory;
+        }
+        return XMLReaderUtils.getXMLInputFactory();
+    }
+
+
+    /**
+     * Returns the transformer specified in this parsing context.
+     * <p>
+     * If a transformer is not explicitly specified, then a default transformer
+     * instance is created and returned. The default transformer instance is
+     * configured to use
+     * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+     *
+     * @return Transformer
+     * @throws TikaException when the transformer can not be created
+     */
+    public static Transformer getTransformer(ParseContext context) throws TikaException {
+
+        Transformer transformer = context.get(Transformer.class);
+        if (transformer != null) {
+            return transformer;
+        }
+
+        return XMLReaderUtils.getTransformer();
+    }
 }
diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
index de464bca5..92666819c 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
@@ -64,6 +64,7 @@ import org.apache.tika.parser.ParseRecord;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 /**
  * This class enables mocking of parser behavior for use in testing
@@ -120,7 +121,7 @@ public class MockParser implements Parser {
         }
         Document doc = null;
         try {
-            DocumentBuilder docBuilder = context.getDocumentBuilder();
+            DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder(context);
             doc = docBuilder.parse(new CloseShieldInputStream(stream));
         } catch (SAXException e) {
             //to distinguish between SAX on read vs SAX while writing
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
index a79e942e8..f22a4ddd3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
@@ -32,6 +32,7 @@ import org.xml.sax.helpers.AttributesImpl;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 /**
  * This class offers an initial capability to
@@ -88,7 +89,7 @@ class XFAExtractor {
         //
         //As a final step, dump the merged fields and the values.
 
-        XMLStreamReader reader = context.getXMLInputFactory().createXMLStreamReader(xfaIs);
+        XMLStreamReader reader = XMLReaderUtils.getXMLInputFactory(context).createXMLStreamReader(xfaIs);
         while (reader.hasNext()) {
             switch (reader.next()) {
                 case XMLStreamConstants.START_ELEMENT:

Reply via email to