Author: jukka
Date: Fri Sep 11 20:36:27 2009
New Revision: 814019
URL: http://svn.apache.org/viewvc?rev=814019&view=rev
Log:
TIKA-275: Parse context
Make the XMLParser behaviour easier to customize via the parsing context.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=814019&r1=814018&r2=814019&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/XMLParser.java
Fri Sep 11 20:36:27 2009
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -36,9 +36,24 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
+import org.xml.sax.SAXNotSupportedException;
/**
- * XML parser
+ * XML parser.
+ * <p>
+ * This class uses the following parsing context entries:
+ * <dl>
+ * <dt>javax.xml.parsers.SAXParser</dt>
+ * <dd>
+ * The SAX parser ({@link SAXParser} instance) to be used for parsing
+ * the XML input documents. Optional.
+ * </dd>
+ * <dt>javax.xml.parsers.SAXParserFactory</dt>
+ * <dd>
+ * The SAX parser factory ({@link SAXParserFactory} instance) used to
+ * create a SAX parser if one has not been explicitly specified. Optional.
+ * </dd>
+ * </dl>
*/
public class XMLParser implements Parser {
@@ -55,26 +70,10 @@
xhtml.startDocument();
xhtml.startElement("p");
- try {
- SAXParserFactory factory = SAXParserFactory.newInstance();
- factory.setNamespaceAware(true);
- try {
- factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
- } catch (SAXNotRecognizedException e) {
- // TIKA-271: Some XML parsers do not support the secure-processing
- // feature, even though it's required by JAXP in Java 5. Ignoring
- // the exception is fine here, deployments without this feature
- // are inherently vulnerable to XML denial-of-service attacks.
- }
-
- SAXParser parser = factory.newSAXParser();
- parser.parse(
- new CloseShieldInputStream(stream),
- new OfflineContentHandler(
- getContentHandler(handler, metadata)));
- } catch (ParserConfigurationException e) {
- throw new TikaException("XML parser configuration error", e);
- }
+ getSAXParser(context).parse(
+ new CloseShieldInputStream(stream),
+ new OfflineContentHandler(
+ getContentHandler(handler, metadata)));
xhtml.endElement("p");
xhtml.endDocument();
@@ -95,4 +94,71 @@
return new TextContentHandler(handler);
}
+ /**
+ * Returns the SAX parser specified in the parsing context. If a parser
+ * is not explicitly specified, then one is created using the specified
+ * or the default SAX parser factory.
+ * <p>
+ * The context is looked up using the class name of {@link SAXParser}
+ * as the key. An entry that is missing or is not a {@link SAXParser}
+ * instance is treated as if no parser had been specified.
+ *
+ * @see #getSAXParserFactory(Map)
+ * @param context parsing context
+ * @return SAX parser
+ * @throws TikaException if a SAX parser could not be created
+ */
+ private SAXParser getSAXParser(Map<String, Object> context)
+ throws TikaException {
+ Object parser = context.get(SAXParser.class.getName());
+ if (parser instanceof SAXParser) {
+ return (SAXParser) parser;
+ } else {
+ try {
+ return getSAXParserFactory(context).newSAXParser();
+ } catch (ParserConfigurationException e) {
+ throw new TikaException("Unable to configure a SAX parser", e);
+ } catch (SAXException e) {
+ throw new TikaException("Unable to create a SAX parser", e);
+ }
+ }
+ }
+
+ /**
+ * Returns the SAX parser factory specified in the parsing context.
+ * If a factory is not explicitly specified, then a default factory
+ * instance is created and returned.
+ * <p>
+ * The context is looked up using the class name of
+ * {@link SAXParserFactory} as the key. An entry that is missing or is
+ * not a {@link SAXParserFactory} instance results in the default
+ * factory being used.
+ *
+ * @see #getDefaultSAXParserFactory()
+ * @param context parsing context
+ * @return SAX parser factory
+ */
+ private SAXParserFactory getSAXParserFactory(Map<String, Object> context) {
+ Object factory = context.get(SAXParserFactory.class.getName());
+ if (factory instanceof SAXParserFactory) {
+ return (SAXParserFactory) factory;
+ } else {
+ return getDefaultSAXParserFactory();
+ }
+ }
+
+ /**
+ * Creates and returns a default SAX parser factory. The factory is
+ * configured to be namespace-aware and, on a best-effort basis, to use
+ * secure XML processing: if enabling the secure-processing feature
+ * fails, the failure is ignored and the factory is returned without it.
+ *
+ * @see XMLConstants#FEATURE_SECURE_PROCESSING
+ * @return default SAX parser factory
+ */
+ private SAXParserFactory getDefaultSAXParserFactory() {
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ factory.setNamespaceAware(true);
+ try {
+ factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+ } catch (ParserConfigurationException e) { // deliberately ignored; presumably for the same reason as TIKA-271 below
+ } catch (SAXNotSupportedException e) { // deliberately ignored; presumably for the same reason as TIKA-271 below
+ } catch (SAXNotRecognizedException e) {
+ // TIKA-271: Some XML parsers do not support the secure-processing
+ // feature, even though it's required by JAXP in Java 5. Ignoring
+ // the exception is fine here, as deployments without this feature
+ // are inherently vulnerable to XML denial-of-service attacks.
+ }
+ return factory;
+ }
+
}