Author: jukka
Date: Fri Apr 11 07:29:33 2008
New Revision: 647181
URL: http://svn.apache.org/viewvc?rev=647181&view=rev
Log:
TIKA-139: Add a composite parser
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=647181&r1=647180&r2=647181&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Fri Apr 11 07:29:33 2008
@@ -46,6 +46,9 @@
19. TIKA-113 - Metadata (such as title) should not be part of content
(Jukka Zitting)
+20. TIKA-139 - Add a composite parser (Jukka Zitting)
+
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java?rev=647181&r1=647180&r2=647181&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/config/TikaConfig.java
Fri Apr 11 07:29:33 2008
@@ -107,7 +107,11 @@
public Parser getParser(String mimeType) {
return parsers.get(mimeType);
}
-
+
+ public Map<String, Parser> getParsers() {
+ return parsers; // NOTE(review): hands out the same map object that getParser(String) reads, not a copy
+ }
+
public MimeTypes getMimeRepository(){
return mimeTypes;
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=647181&r1=647180&r2=647181&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
Fri Apr 11 07:29:33 2008
@@ -30,9 +30,9 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-public class AutoDetectParser extends AbstractParser {
+public class AutoDetectParser extends CompositeParser {
- private TikaConfig config;
+ private MimeTypes types;
/**
* Creates an auto-detecting parser instance using the default Tika
@@ -40,7 +40,7 @@
*/
public AutoDetectParser() {
try {
- config = TikaConfig.getDefaultConfig();
+ setConfig(TikaConfig.getDefaultConfig());
} catch (TikaException e) {
// FIXME: This should never happen
throw new RuntimeException(e);
@@ -48,15 +48,20 @@
}
public AutoDetectParser(TikaConfig config) {
- this.config = config;
+ setConfig(config);
}
- public TikaConfig getConfig() {
- return config;
+ public void setConfig(TikaConfig config) {
+ setParsers(config.getParsers());
+ setMimeTypes(config.getMimeRepository());
}
- public void setConfig(TikaConfig config) {
- this.config = config;
+ public MimeTypes getMimeTypes() {
+ return types;
+ }
+
+ public void setMimeTypes(MimeTypes types) {
+ this.types = types;
}
public void parse(
@@ -71,17 +76,8 @@
MimeType type = getMimeType(stream, metadata);
metadata.set(Metadata.CONTENT_TYPE, type.getName());
- // Get the parser configured for the detected MIME type
- Parser parser = config.getParser(type.getName());
- if (parser == null) {
- parser = config.getParser(MimeTypes.DEFAULT);
- }
- if (parser == null) {
- throw new TikaException("No parsers available: " + type.getName());
- }
-
// Parse the document
- parser.parse(stream, handler, metadata);
+ super.parse(stream, handler, metadata);
}
/**
@@ -99,8 +95,6 @@
*/
private MimeType getMimeType(InputStream stream, Metadata metadata)
throws IOException {
- MimeTypes types = config.getMimeRepository();
-
// Get type based on magic prefix
stream.mark(types.getMinLength());
try {
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=647181&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/CompositeParser.java
Fri Apr 11 07:29:33 2008
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Composite parser that delegates parsing tasks to a component parser
+ * selected by the value of the {@link Metadata#CONTENT_TYPE} property of
+ * the document metadata. When no component parser is registered under
+ * that value, the fallback parser is used instead; unless replaced via
+ * {@link #setFallback(Parser)}, the fallback is an {@link EmptyParser}
+ * instance.
+ * <p>
+ * Both the parser map and the fallback are plain mutable fields with no
+ * synchronization in this class.
+ */
+public class CompositeParser implements Parser {
+
+ /**
+ * Component parsers, keyed by the media type they are registered for.
+ * Starts out as an empty {@link HashMap} and is replaced wholesale by
+ * {@link #setParsers(Map)}.
+ */
+ private Map<String, Parser> parsers = new HashMap<String, Parser>();
+
+ /**
+ * The fallback parser, used when the parser map has no entry for the
+ * document's content type. Initially an {@link EmptyParser} instance.
+ */
+ private Parser fallback = new EmptyParser();
+
+ /**
+ * Returns the component parsers.
+ * <p>
+ * The map held by this parser is returned directly, not a copy, so
+ * changes made through the returned reference are visible to later
+ * lookups.
+ *
+ * @return component parsers, keyed by media type
+ */
+ public Map<String, Parser> getParsers() {
+ return parsers;
+ }
+
+ /**
+ * Sets the component parsers.
+ * <p>
+ * The given map is stored by reference and replaces the previous map;
+ * no copy is made. NOTE(review): the argument is not checked for
+ * <code>null</code> here, and {@link #getParser(Metadata)} dereferences
+ * the stored map, so a <code>null</code> map would fail at lookup time.
+ *
+ * @param parsers component parsers, keyed by media type
+ */
+ public void setParsers(Map<String, Parser> parsers) {
+ this.parsers = parsers;
+ }
+
+ /**
+ * Returns the fallback parser.
+ *
+ * @return fallback parser, as last set (initially an
+ * {@link EmptyParser} instance)
+ */
+ public Parser getFallback() {
+ return fallback;
+ }
+
+ /**
+ * Sets the fallback parser.
+ * <p>
+ * NOTE(review): the argument is not checked for <code>null</code>. With
+ * a <code>null</code> fallback, {@link #getParser(Metadata)} returns
+ * <code>null</code> for unmatched content types and the parse methods
+ * of this class would then dereference it.
+ *
+ * @param fallback fallback parser
+ */
+ public void setFallback(Parser fallback) {
+ this.fallback = fallback;
+ }
+
+ /**
+ * Returns the parser that best matches the given metadata. By default
+ * the value of the {@link Metadata#CONTENT_TYPE} property is used
+ * as-is as the key into the component parser map, and the fallback
+ * parser is returned when that lookup yields <code>null</code>.
+ * <p>
+ * Subclasses can override this method to provide more accurate
+ * parser resolution.
+ *
+ * @param metadata document metadata; must not be <code>null</code>,
+ * since it is dereferenced to read the content type
+ * @return the parser registered for the content type, or the current
+ * fallback parser when there is none
+ */
+ protected Parser getParser(Metadata metadata) {
+ Parser parser = parsers.get(metadata.get(Metadata.CONTENT_TYPE));
+ if (parser == null) {
+ parser = fallback;
+ }
+ return parser;
+ }
+
+ /**
+ * Delegates the call to the matching component parser, as resolved by
+ * {@link #getParser(Metadata)}. Nothing else is done here; stream and
+ * metadata are passed through unchanged.
+ *
+ * @param stream document stream, handed to the selected parser
+ * @param metadata document metadata, used to select the parser and
+ * then handed to it
+ * @throws IOException propagated from the selected parser
+ * @throws TikaException propagated from the selected parser
+ */
+ public void parse(InputStream stream, Metadata metadata)
+ throws IOException, TikaException {
+ getParser(metadata).parse(stream, metadata);
+ }
+
+ /**
+ * Delegates the call to the matching component parser, as resolved by
+ * {@link #getParser(Metadata)}. Nothing else is done here; stream,
+ * handler and metadata are passed through unchanged.
+ *
+ * @param stream document stream, handed to the selected parser
+ * @param handler content handler, handed to the selected parser
+ * @param metadata document metadata, used to select the parser and
+ * then handed to it
+ * @throws IOException propagated from the selected parser
+ * @throws SAXException propagated from the selected parser
+ * @throws TikaException propagated from the selected parser
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ getParser(metadata).parse(stream, handler, metadata);
+ }
+
+}