Author: jukka
Date: Tue Jun 2 01:36:44 2009
New Revision: 780897
URL: http://svn.apache.org/viewvc?rev=780897&view=rev
Log:
TIKA-238: Better handling of delegating parser implementations
Introduce the DelegatingParser concept and integrate it into the Tika
configuration mechanism and the AutoDetectParser class.
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=780897&r1=780896&r2=780897&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Tue Jun 2 01:36:44 2009
@@ -30,6 +30,8 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
+import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.Parser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -46,7 +48,7 @@
"/org/apache/tika/tika-config.xml";
private final Map<String, Parser> parsers = new HashMap<String, Parser>();
-
+
private static MimeTypes mimeTypes;
public TikaConfig(String file)
@@ -66,14 +68,29 @@
public TikaConfig(InputStream stream)
throws TikaException, IOException, SAXException {
- this(getBuilder().parse(stream));
+ this(stream, null);
+ }
+
+ public TikaConfig(InputStream stream, Parser delegate)
+ throws TikaException, IOException, SAXException {
+ this(getBuilder().parse(stream), delegate);
}
public TikaConfig(Document document) throws TikaException, IOException {
- this(document.getDocumentElement());
+ this(document, null);
+ }
+
+ public TikaConfig(Document document, Parser delegate)
+ throws TikaException, IOException {
+ this(document.getDocumentElement(), delegate);
}
public TikaConfig(Element element) throws TikaException, IOException {
+ this(element, null);
+ }
+
+ public TikaConfig(Element element, Parser delegate)
+ throws TikaException, IOException {
Element mtr = getChild(element, "mimeTypeRepository");
if (mtr != null) {
mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
@@ -84,7 +101,13 @@
Element node = (Element) nodes.item(i);
String name = node.getAttribute("class");
try {
- Parser parser = (Parser) Class.forName(name).newInstance();
+ Class<?> parserClass = Class.forName(name);
+ Parser parser = (Parser) parserClass.newInstance();
+
+ if (delegate != null && parser instanceof DelegatingParser) {
+ ((DelegatingParser) parser).setDelegate(delegate);
+ }
+
NodeList mimes = node.getElementsByTagName("mime");
for (int j = 0; j < mimes.getLength(); j++) {
parsers.put(getText(mimes.item(j)).trim(), parser);
@@ -139,10 +162,15 @@
* @throws TikaException if the default configuration is not available
*/
public static TikaConfig getDefaultConfig() throws TikaException {
+ return getDefaultConfig(new EmptyParser());
+ }
+
+ public static TikaConfig getDefaultConfig(Parser delegate)
+ throws TikaException {
try {
InputStream stream =
TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION);
- return new TikaConfig(stream);
+ return new TikaConfig(stream, delegate);
} catch (IOException e) {
throw new TikaException("Unable to read default configuration", e);
} catch (SAXException e) {
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=780897&r1=780896&r2=780897&view=diff
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
(original)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
Tue Jun 2 01:36:44 2009
@@ -40,7 +40,7 @@
*/
public AutoDetectParser() {
try {
- setConfig(TikaConfig.getDefaultConfig());
+ setConfig(TikaConfig.getDefaultConfig(this));
} catch (TikaException e) {
// FIXME: This should never happen
throw new RuntimeException(e);
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java?rev=780897&view=auto
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
(added)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
Tue Jun 2 01:36:44 2009
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Base class for parser implementations that want to delegate parts of the
+ * task of parsing an input document to another parser. The default base
+ * class implementation simply delegates the entire parsing task to a dummy
+ * {@link EmptyParser} instance, but subclasses can implement more complex
+ * processing rules and a more complete delegate parser can be specified
+ * through the {@link #setDelegate(Parser)} method.
+ * <p>
+ * The Tika configuration mechanism also contains a way to automatically
+ * set the delegate parser of all configured delegating parser
+ * implementations. This feature is most notably used by the
+ * {@link AutoDetectParser} class to make it the recursive target of all
+ * delegated parsing tasks.
+ *
+ * @since Apache Tika 0.4
+ */
+public class DelegatingParser implements Parser {
+
+ /**
+ * The parser to which parsing is delegated; starts as an EmptyParser.
+ */
+ private transient Parser delegate = new EmptyParser();
+
+ /**
+ * Returns the delegate parser instance.
+ *
+ * @return the current delegate parser, never {@code null}
+ */
+ public Parser getDelegate() {
+ return delegate;
+ }
+
+ /**
+ * Sets the delegate parser instance.
+ *
+ * @param delegate delegate parser; null is rejected with a NullPointerException
+ */
+ public void setDelegate(Parser delegate) {
+ if (delegate == null) {
+ throw new NullPointerException(
+ "Delegate parser of " + this + " can not be null");
+ } else {
+ this.delegate = delegate;
+ }
+ }
+
+ /**
+ * Parses the given document using the current delegate parser.
+ * Subclasses should override this method with more complex delegation
+ * rules based on the structure of the input document. The default
+ * implementation passes all three arguments to the delegate unchanged
+ * and lets any exception thrown by the delegate propagate to the caller.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws SAXException, IOException, TikaException {
+ delegate.parse(stream, handler, metadata);
+ }
+
+}