Author: jukka
Date: Sun Oct 7 04:43:42 2007
New Revision: 582612
URL: http://svn.apache.org/viewvc?rev=582612&view=rev
Log:
TIKA-43 - Parser interface
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
(with props)
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
(with props)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=582612&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
Sun Oct 7 04:43:42 2007
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Decorator base class for the {@link Parser} interface. This class
+ * simply delegates all parsing calls to an underlying decorated parser
+ * instance. Subclasses can provide extra decoration by overriding the
+ * parse method.
+ */
+public class ParserDecorator implements Parser {
+
+ /**
+ * The decorated parser instance; all parse calls are forwarded to it.
+ */
+ private final Parser parser;
+
+ /**
+ * Creates a decorator for the given parser.
+ *
+ * @param parser the parser instance to be decorated
+ */
+ public ParserDecorator(Parser parser) {
+ this.parser = parser;
+ }
+
+ /**
+ * Passes the stream and contents to the decorated parser and returns
+ * its result unchanged. Subclasses should override this method (and use
+ * <code>super.parse()</code> to invoke the decorated parser) to decorate.
+ */
+ public String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ return parser.parse(stream, contents);
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=582612&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
Sun Oct 7 04:43:42 2007
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.apache.oro.text.regex.MalformedPatternException;
+import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.RegexUtils;
+
+/**
+ * Parser decorator that post-processes the results from a decorated parser.
+ * The post-processing takes care of filling in any "fulltext", "summary", and
+ * regexp {@link Content} objects with the full text content returned by
+ * the decorated parser. The post-processing also catches and logs any
+ * exceptions thrown by the decorated parser.
+ */
+public class ParserPostProcessor extends ParserDecorator {
+
+ /**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ Logger.getLogger(ParserPostProcessor.class);
+
+ /**
+ * Creates a post-processing decorator for the given parser.
+ *
+ * @param parser the parser to be decorated
+ */
+ public ParserPostProcessor(Parser parser) {
+ super(parser);
+ }
+
+ /**
+ * Runs the decorated parser, then fills "fulltext", "summary" (at most 500
+ * characters) and regexp contents; logs any exception and returns "".
+ */
+ public String parse(InputStream stream, Iterable<Content> contents)
+ throws IOException, TikaException {
+ try {
+ String contentStr = super.parse(stream, contents);
+
+ for (Content content : contents) {
+ if ("fulltext".equalsIgnoreCase(content.getTextSelect())) {
+ content.setValue(contentStr);
+ } else if ("summary".equalsIgnoreCase(content.getTextSelect())) {
+ int length = Math.min(contentStr.length(), 500);
+ String summary = contentStr.substring(0, length);
+ content.setValue(summary);
+ } else if (content.getRegexSelect() != null) {
+ String regex = content.getRegexSelect();
+ try {
+ List<String> values =
+ RegexUtils.extract(contentStr, regex);
+ if (values.size() > 0) {
+ content.setValue(values.get(0));
+ content.setValues(
+ values.toArray(new String[values.size()]));
+ }
+ } catch (MalformedPatternException e) {
+ logger.error(
+ "Invalid regular expression: " + regex, e);
+ }
+ }
+ }
+
+ return contentStr;
+ } catch (Exception e) {
+ logger.error("Parse error: " + e.getMessage(), e);
+ return "";
+ }
+ }
+
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
------------------------------------------------------------------------------
svn:eol-style = native