Author: jukka
Date: Tue Jun  2 01:36:44 2009
New Revision: 780897

URL: http://svn.apache.org/viewvc?rev=780897&view=rev
Log:
TIKA-238: Better handling of delegating parser implementations

Introduce the DelegatingParser concept and integrate it into the Tika
configuration mechanism and the AutoDetectParser class.

Added:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
Modified:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=780897&r1=780896&r2=780897&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
 Tue Jun  2 01:36:44 2009
@@ -30,6 +30,8 @@
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.mime.MimeTypesFactory;
+import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.Parser;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -46,7 +48,7 @@
         "/org/apache/tika/tika-config.xml";
 
     private final Map<String, Parser> parsers = new HashMap<String, Parser>();
-    
+
     private static MimeTypes mimeTypes;
 
     public TikaConfig(String file)
@@ -66,14 +68,29 @@
 
     public TikaConfig(InputStream stream)
             throws TikaException, IOException, SAXException {
-        this(getBuilder().parse(stream));
+        this(stream, null);
+    }
+
+    public TikaConfig(InputStream stream, Parser delegate)
+            throws TikaException, IOException, SAXException {
+        this(getBuilder().parse(stream), delegate);
     }
 
     public TikaConfig(Document document) throws TikaException, IOException {
-        this(document.getDocumentElement());
+        this(document, null);
+    }
+
+    public TikaConfig(Document document, Parser delegate)
+            throws TikaException, IOException {
+        this(document.getDocumentElement(), delegate);
     }
 
     public TikaConfig(Element element) throws TikaException, IOException {
+        this(element, null);
+    }
+
+    public TikaConfig(Element element, Parser delegate)
+            throws TikaException, IOException {
         Element mtr = getChild(element, "mimeTypeRepository");
         if (mtr != null) {
             mimeTypes = MimeTypesFactory.create(mtr.getAttribute("resource"));
@@ -84,7 +101,13 @@
             Element node = (Element) nodes.item(i);
             String name = node.getAttribute("class");
             try {
-                Parser parser = (Parser) Class.forName(name).newInstance();
+                Class<?> parserClass = Class.forName(name);
+                Parser parser = (Parser) parserClass.newInstance();
+
+                if (delegate != null && parser instanceof DelegatingParser) {
+                    ((DelegatingParser) parser).setDelegate(delegate);
+                }
+
                 NodeList mimes = node.getElementsByTagName("mime");
                 for (int j = 0; j < mimes.getLength(); j++) {
                     parsers.put(getText(mimes.item(j)).trim(), parser);
@@ -139,10 +162,15 @@
      * @throws TikaException if the default configuration is not available
      */
     public static TikaConfig getDefaultConfig() throws TikaException {
+        return getDefaultConfig(new EmptyParser());
+    }
+
+    public static TikaConfig getDefaultConfig(Parser delegate)
+            throws TikaException {
         try {
             InputStream stream =
                 TikaConfig.class.getResourceAsStream(DEFAULT_CONFIG_LOCATION);
-            return new TikaConfig(stream);
+            return new TikaConfig(stream, delegate);
         } catch (IOException e) {
             throw new TikaException("Unable to read default configuration", e);
         } catch (SAXException e) {

Modified: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=780897&r1=780896&r2=780897&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
 (original)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
 Tue Jun  2 01:36:44 2009
@@ -40,7 +40,7 @@
      */
     public AutoDetectParser() {
         try {
-            setConfig(TikaConfig.getDefaultConfig());
+            setConfig(TikaConfig.getDefaultConfig(this));
         } catch (TikaException e) {
             // FIXME: This should never happen
             throw new RuntimeException(e);

Added: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java?rev=780897&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
 (added)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
 Tue Jun  2 01:36:44 2009
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Base class for parser implementations that want to delegate parts of the
+ * task of parsing an input document to another parser. The default base
+ * class implementation simply delegates the entire parsing task to a dummy
+ * {@link EmptyParser} instance, but subclasses can implement more complex
+ * processing rules and a more complete delegate parser can be specified
+ * through the {@link #setDelegate(Parser)} method.
+ * <p>
+ * The Tika configuration mechanism also contains a way to automatically
+ * set the delegate parser of all configured delegating parser
+ * implementations. This feature is most notably used by the
+ * {@link AutoDetectParser} class to make it the recursive target of all
+ * delegated parsing tasks.
+ *
+ * @since Apache Tika 0.4
+ */
+public class DelegatingParser implements Parser {
+
+    /**
+     * The parser to which parts of the parsing tasks are delegated.
+     */
+    private transient Parser delegate = new EmptyParser();
+
+    /**
+     * Returns the delegate parser instance.
+     *
+     * @return delegate parser
+     */
+    public Parser getDelegate() {
+        return delegate;
+    }
+
+    /**
+     * Sets the delegate parser instance.
+     *
+     * @param delegate delegate parser
+     */
+    public void setDelegate(Parser delegate) {
+        if (delegate == null) {
+            throw new NullPointerException(
+                    "Delegate parser of " + this + " can not be null");
+        } else {
+            this.delegate = delegate;
+        }
+    }
+
+    /**
+     * Parses the given document using the specified delegate parser.
+     * Subclasses should override this method with more complex delegation
+     * rules based on the structure of the input document. The default
+     * implementation simply delegates the entire parsing task to the
+     * specified delegate parser.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws SAXException, IOException, TikaException {
+        delegate.parse(stream, handler, metadata);
+    }
+
+}


Reply via email to