Author: jukka
Date: Sun Oct  7 04:43:42 2007
New Revision: 582612

URL: http://svn.apache.org/viewvc?rev=582612&view=rev
Log:
TIKA-43 - Parser interface

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java   (with props)
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java   (with props)

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java?rev=582612&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java Sun Oct  7 04:43:42 2007
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
+
+/**
+ * Base class for decorators of the {@link Parser} interface. Every parse
+ * request received by this class is passed on unchanged to the wrapped
+ * parser instance; subclasses add behaviour by overriding the parse
+ * method.
+ */
+public class ParserDecorator implements Parser {
+
+    /**
+     * The wrapped parser that receives all delegated calls.
+     */
+    private final Parser delegate;
+
+    /**
+     * Wraps the given parser in a new decorator.
+     *
+     * @param parser the parser instance to be decorated
+     */
+    public ParserDecorator(Parser parser) {
+        delegate = parser;
+    }
+
+    /**
+     * Passes the call straight through to the decorated parser. Subclasses
+     * that want to add decoration should override this method and call
+     * <code>super.parse()</code> to reach the decorated parser.
+     *
+     * @param stream the stream handed to the decorated parser
+     * @param contents the content objects handed to the decorated parser
+     * @return whatever the decorated parser returns
+     * @throws IOException if the decorated parser throws it
+     * @throws TikaException if the decorated parser throws it
+     */
+    public String parse(InputStream stream, Iterable<Content> contents)
+            throws IOException, TikaException {
+        final String result = delegate.parse(stream, contents);
+        return result;
+    }
+
+}

Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserDecorator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java?rev=582612&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java Sun Oct  7 04:43:42 2007
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.log4j.Logger;
+import org.apache.oro.text.regex.MalformedPatternException;
+import org.apache.tika.config.Content;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.RegexUtils;
+
+/**
+ * Parser decorator that post-processes the results from a decorated parser.
+ * The post-processing takes care of filling in any "fulltext", "summary", and
+ * regexp {@link Content} objects with the full text content returned by
+ * the decorated parser. The post-processing also catches and logs any
+ * exceptions thrown by the decorated parser.
+ */
+public class ParserPostProcessor extends ParserDecorator {
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        Logger.getLogger(ParserPostProcessor.class);
+
+    /**
+     * Maximum number of characters copied into a "summary" content object.
+     */
+    private static final int SUMMARY_LENGTH = 500;
+
+    /**
+     * Creates a post-processing decorator for the given parser.
+     *
+     * @param parser the parser to be decorated
+     */
+    public ParserPostProcessor(Parser parser) {
+        super(parser);
+    }
+
+    /**
+     * Forwards the call to the decorated parser and then fills in the
+     * given content objects from the text it returned:
+     * <ul>
+     *   <li>text select "fulltext" (case-insensitive): the value is set
+     *       to the complete text</li>
+     *   <li>text select "summary" (case-insensitive): the value is set
+     *       to at most the first 500 characters of the text</li>
+     *   <li>otherwise, a non-null regex select: the strings returned by
+     *       <code>RegexUtils.extract</code> are stored, the first one as
+     *       the value and all of them as the values, unless the result
+     *       is empty</li>
+     * </ul>
+     * An invalid regular expression is logged and the affected content
+     * object is left untouched. Any other exception raised while parsing
+     * or post-processing is logged and replaced by an empty return value
+     * instead of being propagated to the caller.
+     *
+     * @param stream the stream handed to the decorated parser
+     * @param contents the content objects to fill in
+     * @return the text returned by the decorated parser, or an empty
+     *         string if an exception was caught
+     */
+    public String parse(InputStream stream, Iterable<Content> contents)
+            throws IOException, TikaException {
+        try {
+            String contentStr = super.parse(stream, contents);
+
+            for (Content content : contents) {
+                if ("fulltext".equalsIgnoreCase(content.getTextSelect())) {
+                    content.setValue(contentStr);
+                } else if ("summary".equalsIgnoreCase(
+                        content.getTextSelect())) {
+                    content.setValue(summarize(contentStr));
+                } else if (content.getRegexSelect() != null) {
+                    String regex = content.getRegexSelect();
+                    try {
+                        List<String> values =
+                            RegexUtils.extract(contentStr, regex);
+                        if (!values.isEmpty()) {
+                            content.setValue(values.get(0));
+                            content.setValues(
+                                    values.toArray(new String[values.size()]));
+                        }
+                    } catch (MalformedPatternException e) {
+                        // One bad pattern must not stop the remaining
+                        // content objects from being processed.
+                        logger.error(
+                                "Invalid regular expression: " + regex, e);
+                    }
+                }
+            }
+
+            return contentStr;
+        } catch (Exception e) {
+            // Deliberately broad: this decorator is documented to catch
+            // and log any exception instead of passing it to the caller.
+            logger.error("Parse error: " + e.getMessage(), e);
+            return "";
+        }
+    }
+
+    /**
+     * Returns the leading part of the given text, at most
+     * {@link #SUMMARY_LENGTH} characters long. If the cut would fall
+     * between the two halves of a surrogate pair, the summary ends one
+     * character earlier so that it never ends in an unpaired surrogate
+     * created by the truncation.
+     *
+     * @param text the full text to summarize
+     * @return the summary prefix of the text
+     */
+    private static String summarize(String text) {
+        int length = Math.min(text.length(), SUMMARY_LENGTH);
+        if (length > 0 && length < text.length()
+                && Character.isHighSurrogate(text.charAt(length - 1))
+                && Character.isLowSurrogate(text.charAt(length))) {
+            length--;
+        }
+        return text.substring(0, length);
+    }
+
+}

Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to