Author: jukka
Date: Mon Oct 1 00:21:21 2007
New Revision: 580874
URL: http://svn.apache.org/viewvc?rev=580874&view=rev
Log:
TIKA-33 - Stateless parsers
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Mon Oct 1 00:21:21 2007
@@ -50,6 +50,7 @@
23. TIKA-31 - protected Parser.parse(InputStream stream,
Iterable<Content> contents) (jukka & K. Bennett)
-24. TIKA-36 A convenience method for getting a document's content's text
+24. TIKA-36 - A convenience method for getting a document's content's text
would be helpful (K. Bennett & mattmann)
+25. TIKA-33 - Stateless parsers (jukka)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/Parser.java Mon Oct 1 00:21:21 2007
@@ -34,22 +34,10 @@
private static final Logger logger = Logger.getLogger(Parser.class);
- private InputStream is;
-
private String mimeType;
private String namespace;
- private Map<String, Content> contents;
-
- private String contentStr;
-
- private boolean parsed = false;
-
- public void setInputStream(InputStream is) {
- this.is = is;
- }
-
/**
* Get document mime type
*/
@@ -72,87 +60,39 @@
this.namespace = namespace;
}
- /**
- * Get the string content of the document
- */
- public String getStrContent() {
- getContents();
- return contentStr;
- }
-
- /**
- * Get a content object, this object is configured from the TikaConfig Xml.
- * It could be a document metadata, XPath selection, regex selection or
- * fulltext
- */
- public Content getContent(String name) {
- return getContents().get(name);
- }
-
- /**
- * Returns the text associated with the Content named 'name',
- * or null if such a Content does not exist.
- *
- * @param name name of Content the caller wants the value of
- * @return the found Content's value, or null if not found
- */
- public String getContentValue(String name) {
- Content content = getContent(name);
-
- return content != null
- ? content.getValue()
- : null;
- }
-
- /**
- * Get a List of contents objects, this objects are configured from the
- * TikaConfig Xml file. It could be a document metadata, XPath selection,
- * regex selection or fulltext
- */
- public Map<String, Content> getContents() {
- if (!parsed) {
- try {
- try {
- contentStr = parse(is, contents.values());
- } finally {
- is.close();
- }
-
- for (Content content : contents.values()) {
- if ("fulltext".equalsIgnoreCase(content.getTextSelect())) {
- content.setValue(contentStr);
- } else if ("summary".equalsIgnoreCase(content.getTextSelect())) {
- int length = Math.min(contentStr.length(), 500);
- String summary = contentStr.substring(0, length);
- content.setValue(summary);
- } else if (content.getRegexSelect() != null) {
- String regex = content.getRegexSelect();
- try {
- List<String> values =
- RegexUtils.extract(contentStr, regex);
- if (values.size() > 0) {
- content.setValue(values.get(0));
- content.setValues(
- values.toArray(new String[values.size()]));
- }
- } catch (MalformedPatternException e) {
- logger.error(
- "Invalid regular expression: " + regex, e);
+ public String getContents(InputStream stream, Map<String, Content> contents) {
+ try {
+ String contentStr = parse(stream, contents.values());
+
+ for (Content content : contents.values()) {
+ if ("fulltext".equalsIgnoreCase(content.getTextSelect())) {
+ content.setValue(contentStr);
+ } else if ("summary".equalsIgnoreCase(content.getTextSelect())) {
+ int length = Math.min(contentStr.length(), 500);
+ String summary = contentStr.substring(0, length);
+ content.setValue(summary);
+ } else if (content.getRegexSelect() != null) {
+ String regex = content.getRegexSelect();
+ try {
+ List<String> values =
+ RegexUtils.extract(contentStr, regex);
+ if (values.size() > 0) {
+ content.setValue(values.get(0));
+ content.setValues(
+ values.toArray(new String[values.size()]));
}
+ } catch (MalformedPatternException e) {
+ logger.error(
+ "Invalid regular expression: " + regex, e);
}
}
- } catch (Exception e) {
- logger.error("Parse error: " + e.getMessage(), e);
- contentStr = "";
- } finally {
- parsed = true;
}
- }
- return contents;
- }
- public void setContents(Map<String, Content> contents) {
- this.contents = contents;
+ return contentStr;
+ } catch (Exception e) {
+ logger.error("Parse error: " + e.getMessage(), e);
+ return "";
+ }
}
protected abstract String parse(
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserFactory.java Mon Oct 1 00:21:21 2007
@@ -16,11 +16,8 @@
*/
package org.apache.tika.parser;
-import java.io.InputStream;
-
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.config.ParserConfig;
import org.apache.tika.exception.TikaException;
@@ -33,80 +30,19 @@
static Logger logger = Logger.getRootLogger();
-
-
- public static Parser getParser(
- InputStream inputStream, String mimeType, TikaConfig tc)
- throws TikaException {
-
- // Verify that all passed parameters are (probably) valid.
-
- if (StringUtils.isBlank(mimeType)) {
- throw new TikaException("Mime type not specified.");
- }
-
- if (inputStream == null) {
- throw new TikaException("Input stream is null.");
- }
-
- if (tc == null) {
- throw new TikaException("Configuration object is null.");
- }
-
- ParserConfig pc = getParserConfig(mimeType, tc);
- if (pc == null) {
- throw new TikaException(
- "Could not find parser config for mime type "
- + mimeType + ".");
- }
-
- String className = pc.getParserClass();
- Parser parser = null;
-
+ public static Parser getParser(ParserConfig config) throws TikaException {
+ String className = config.getParserClass();
if (StringUtils.isBlank(className)) {
throw new TikaException(
"Parser class name missing from ParserConfig.");
}
-
try {
- logger.info("Loading parser class = " + className
- + " MimeType = " + mimeType);
-
- Class<?> parserClass = Class.forName(className);
- parser = (Parser) parserClass.newInstance();
- parser.setMimeType(mimeType);
- parser.setContents(pc.getContents());
- parser.setInputStream(inputStream);
-
- } catch (ClassNotFoundException e) {
- logger.error(e.getMessage());
- throw new TikaException(e.getMessage());
- } catch (InstantiationException e) {
- logger.error(e.getMessage());
- throw new TikaException(e.getMessage());
- } catch (IllegalAccessException e) {
- logger.error(e.getMessage());
+ logger.info("Loading parser class = " + className);
+ return (Parser) Class.forName(className).newInstance();
+ } catch (Exception e) {
+ logger.error("Unable to instantiate parser: " + className, e);
throw new TikaException(e.getMessage());
}
-
- return parser;
}
-
- private static ParserConfig getParserConfig(String mimeType, TikaConfig tc)
- throws TikaException {
-
- ParserConfig pc = tc.getParserConfig(mimeType);
-
- if (pc == null) {
- String message =
- "Could not find parser configuration for mime type "
- + mimeType + ".";
-
- logger.error(message);
- throw new TikaException(message);
- }
-
- return pc;
- }
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java Mon Oct 1 00:21:21 2007
@@ -24,6 +24,7 @@
import java.net.URL;
// TIKA imports
+import org.apache.tika.config.ParserConfig;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.TikaMimeKeys;
@@ -41,54 +42,18 @@
* receive input from a stream opened from the specified URL. NB: Close the
* input stream when it is no longer needed!
*
- * @param inputStream
- * stream containing document data to parse
- * @param config
- * @param mimeType
- * the document's MIME type
- * @return a parser appropriate to this MIME type and ready to read input
- * from the specified document
- * @throws TikaException
- * @throws IOException
- */
- public static Parser getParser(InputStream inputStream, TikaConfig config,
- String mimeType) throws TikaException, IOException {
-
- if (inputStream == null) {
- throw new TikaException("Document input stream not provided.");
- }
-
- return ParserFactory.getParser(inputStream, mimeType, config);
- }
-
- // Note that we cannot provide a method that takes an InputStream
- // but not a MIME type, since we will not have a resource
- // name from which to derive it.
-
- /**
- * Returns a parser that can handle the specified MIME type, and is set to
- * receive input from a stream opened from the specified URL. NB: Close the
- * input stream when it is no longer needed!
- *
- * @param documentUrl
- * URL pointing to the document to parse
* @param config
* @param mimeType
* the document's MIME type
- * @return a parser appropriate to this MIME type and ready to read input
- * from the specified document
+ * @return a parser appropriate to this MIME type
* @throws TikaException
- * @throws IOException
*/
- public static Parser getParser(URL documentUrl, TikaConfig config,
- String mimeType) throws TikaException, IOException {
-
- if (documentUrl == null) {
- throw new TikaException("Document URL not provided.");
- }
-
- return ParserFactory.getParser(documentUrl.openStream(), mimeType,
- config);
+ public static Parser getParser(TikaConfig config, String mimeType)
+ throws TikaException {
+ ParserConfig pc = config.getParserConfig(mimeType);
+ Parser parser = ParserFactory.getParser(pc);
+ parser.setMimeType(mimeType);
+ return parser;
}
/**
@@ -103,48 +68,12 @@
* @return a parser appropriate to this MIME type and ready to read input
* from the specified document
* @throws TikaException
- * @throws IOException
*/
public static Parser getParser(URL documentUrl, TikaConfig config)
- throws TikaException, IOException {
-
- String mimetype = config.getMimeRepository().getMimeType(documentUrl)
- .getName();
- return getParser(documentUrl, config, mimetype);
- }
-
- /**
- * Returns a parser that can handle the specified MIME type, and is set to
- * receive input from a stream opened from the specified URL. NB: Close the
- * input stream when it is no longer needed!
- *
- * @param documentFile
- * File object pointing to the document to parse
- * @param config
- * @param mimeType
- * the document's MIME type
- * @return a parser appropriate to this MIME type and ready to read input
- * from the specified document
- * @throws TikaException
- * @throws IOException
- */
- public static Parser getParser(File documentFile, TikaConfig config,
- String mimeType) throws TikaException, IOException {
-
- if (documentFile == null) {
- throw new TikaException("Document file not provided.");
- }
-
- if (!documentFile.canRead()) {
- throw new TikaException(
- "Document file does not exist or is not readable.");
- }
-
- FileInputStream inputStream = new FileInputStream(documentFile);
- // TODO: Do we want to wrap a BufferedInputStream, or does the
- // file's buffering suffice?
-
- return ParserFactory.getParser(inputStream, mimeType, config);
+ throws TikaException {
+ String mimetype =
+ config.getMimeRepository().getMimeType(documentUrl).getName();
+ return getParser(config, mimetype);
}
/**
@@ -158,14 +87,12 @@
* @return a parser appropriate to this MIME type and ready to read input
* from the specified document
* @throws TikaException
- * @throws IOException
*/
public static Parser getParser(File documentFile, TikaConfig config)
- throws TikaException, IOException {
-
- String mimetype = config.getMimeRepository().getMimeType(documentFile)
- .getName();
- return getParser(documentFile, config, mimetype);
+ throws TikaException {
+ String mimetype =
+ config.getMimeRepository().getMimeType(documentFile).getName();
+ return getParser(config, mimetype);
}
/**
@@ -180,12 +107,13 @@
* @throws TikaException
* @throws IOException
*/
- public static String getStringContent(InputStream inputStream,
- TikaConfig config, String mimeType) throws TikaException,
- IOException {
-
- Parser parser = getParser(inputStream, config, mimeType);
- return getStringContent(parser);
+ public static String getStringContent(
+ InputStream inputStream, TikaConfig config, String mimeType)
+ throws TikaException, IOException {
+ ParserConfig pc = config.getParserConfig(mimeType);
+ Parser parser = ParserFactory.getParser(pc);
+ parser.setMimeType(mimeType);
+ return parser.getContents(inputStream, pc.getContents());
}
/**
@@ -200,9 +128,9 @@
*/
public static String getStringContent(URL documentUrl, TikaConfig config)
throws TikaException, IOException {
-
- Parser parser = getParser(documentUrl, config);
- return getStringContent(parser);
+ String mime =
+ config.getMimeRepository().getMimeType(documentUrl).getName();
+ return getStringContent(documentUrl, config, mime);
}
/**
@@ -217,11 +145,15 @@
* @throws TikaException
* @throws IOException
*/
- public static String getStringContent(URL documentUrl, TikaConfig config,
- String mimeType) throws TikaException, IOException {
-
- Parser parser = getParser(documentUrl, config, mimeType);
- return getStringContent(parser);
+ public static String getStringContent(
+ URL documentUrl, TikaConfig config, String mimeType)
+ throws TikaException, IOException {
+ InputStream stream = documentUrl.openStream();
+ try {
+ return getStringContent(stream, config, mimeType);
+ } finally {
+ stream.close();
+ }
}
/**
@@ -236,11 +168,15 @@
* @throws TikaException
* @throws IOException
*/
- public static String getStringContent(File documentFile, TikaConfig config,
- String mimeType) throws TikaException, IOException {
-
- Parser parser = getParser(documentFile, config, mimeType);
- return getStringContent(parser);
+ public static String getStringContent(
+ File documentFile, TikaConfig config, String mimeType)
+ throws TikaException, IOException {
+ InputStream stream = new FileInputStream(documentFile);
+ try {
+ return getStringContent(stream, config, mimeType);
+ } finally {
+ stream.close();
+ }
}
/**
@@ -255,12 +191,9 @@
*/
public static String getStringContent(File documentFile, TikaConfig config)
throws TikaException, IOException {
-
- Parser parser = getParser(documentFile, config);
- return getStringContent(parser);
+ String mime =
+ config.getMimeRepository().getMimeType(documentFile).getName();
+ return getStringContent(documentFile, config, mime);
}
- private static String getStringContent(Parser parser) throws IOException {
- return parser.getStrContent();
- }
}
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=580874&r1=580873&r2=580874&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Mon Oct 1 00:21:21 2007
@@ -17,12 +17,17 @@
package org.apache.tika;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
import org.apache.tika.config.Content;
+import org.apache.tika.config.ParserConfig;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.log.TikaLogger;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserFactory;
import org.apache.tika.utils.ParseUtils;
import org.apache.tika.utils.Utils;
import org.jdom.JDOMException;
@@ -69,12 +74,7 @@
File file = getTestFile("testPDF.pdf");
String s1 = ParseUtils.getStringContent(file, tc);
String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
-
- Parser parser = ParseUtils.getParser(file, tc);
- String s3 = parser.getStrContent();
-
assertEquals(s1, s2);
- assertEquals(s1, s3);
}
public void testTXTExtraction() throws Exception {
@@ -135,25 +135,33 @@
String s2 = ParseUtils.getStringContent(file, tc, "text/html");
assertEquals(s1, s2);
- Parser parser = ParseUtils.getParser(file, tc);
+ ParserConfig config = tc.getParserConfig("text/html");
+ Parser parser = ParserFactory.getParser(config);
assertNotNull(parser);
assertEquals("org.apache.tika.parser.html.HtmlParser",
parser.getClass().getName());
+ parser.setMimeType("text/html");
-
- Content content = parser.getContent("title");
- assertNotNull(content);
- assertEquals("Title : Test Indexation Html", content.getValue());
+ Map<String, Content> contents = config.getContents();
+ assertNotNull(contents);
+ InputStream stream = new FileInputStream(file);
+ try {
+ parser.getContents(stream, contents);
+ } finally {
+ stream.close();
+ }
+ assertEquals(
+ "Title : Test Indexation Html",
+ contents.get("title").getValue());
assertEquals("text/html", parser.getMimeType());
- final String text = Utils.toString(parser.getContents());
+ final String text = Utils.toString(contents);
final String expected = "Test Indexation Html";
- assertTrue("text contains '" + expected + "'",
- text.contains(expected));
+ assertTrue("text contains '" + expected + "'", text.contains(expected));
}
private File getTestFile(String filename) {
- return new File(testFilesBaseDir, filename);
+ return new File(testFilesBaseDir, filename);
}
}