Author: jukka
Date: Thu Sep  4 12:34:08 2008
New Revision: 692207

URL: http://svn.apache.org/viewvc?rev=692207&view=rev
Log:
TIKA-149: Parser for zip files 

Moved the ZipParser class to o.a.t.parser.pkg and extracted an abstract 
PackageParser base class to better support other package formats.

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/   (props 
changed)
      - copied from r692181, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
Removed:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/
Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
    incubator/tika/trunk/src/main/resources/tika-config.xml

Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/
------------------------------------------------------------------------------
    svn:mergeinfo = 

Added: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=692207&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
 (added)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
 Thu Sep  4 12:34:08 2008
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Abstract base class for parsers that deal with package formats.
+ * Subclasses can call the
+ * {@link #parseEntry(InputStream, XHTMLContentHandler, Metadata)}
+ * method to parse the given package entry using the configured
+ * entry parser. The entries will be written to the XHTML event stream
+ * as &lt;div class="package-entry"&gt; elements that contain the
+ * (optional) entry name as a &lt;h1&gt; element and the full
+ * structured body content of the parsed entry.
+ */
+public abstract class PackageParser extends AbstractParser {
+
+    /**
+     * The parser instance used to parse package entries.
+     */
+    private Parser parser;
+
+    /**
+     * Returns the parser instance used to parse package entries.
+     *
+     * @return entry parser
+     */
+    public Parser getParser() {
+        Parser parser = this.parser;
+        if (parser == null) {
+            parser = new AutoDetectParser();
+        }
+        return parser;
+    }
+
+    /**
+     * Sets the parser instance used to parse package entries.
+     *
+     * @param parser entry parser
+     */
+    public void setParser(Parser parser) {
+        this.parser = parser;
+    }
+
+    /**
+     * Parses the given entry using the underlying parser instance.
+     * It is not an error if the entry can not be parsed, in that case
+     * just the entry name (if given) is emitted.
+     *
+     * @param stream entry stream
+     * @param xhtml XHTML event handler
+     * @param metadata entry metadata
+     * @throws IOException if an IO error occurs
+     * @throws SAXException if a SAX error occurs
+     */
+    protected void parseEntry(
+            InputStream stream, XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException, SAXException {
+        xhtml.startElement("div", "class", "package-entry");
+
+        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (name != null) {
+            xhtml.element("h1", name);
+            xhtml.characters("\n");
+        }
+
+        try {
+            getParser().parse(
+                    new CloseShieldInputStream(stream),
+                    new BodyContentHandler(xhtml),
+                    metadata);
+            xhtml.characters("\n");
+        } catch (TikaException e) {
+            // Could not parse the entry, just skip the content
+        }
+
+        xhtml.endElement("div");
+    }
+
+}

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java?rev=692207&r1=692181&r2=692207&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java 
(original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java 
Thu Sep  4 12:34:08 2008
@@ -14,30 +14,24 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.zip;
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
 /**
  * Zip File Parser.
  */
-public class ZipParser extends AbstractParser {
-
-    private Parser parser;
+public class ZipParser extends PackageParser {
 
     /**
      * Parses the given stream as a Zip file.
@@ -57,7 +51,9 @@
         try {
             ZipEntry entry = zip.getNextEntry();
             while (entry != null) {
-                parseEntry(xhtml, entry, zip);
+                Metadata entrydata = new Metadata();
+                entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
+                parseEntry(zip, xhtml, entrydata);
                 entry = zip.getNextEntry();
             }
         } finally {
@@ -67,48 +63,4 @@
         xhtml.endDocument();
     }
 
-    /**
-     * Parses the given Zip entry using the underlying parser instance.
-     * It is not an error if the entry can not be parsed, in that case
-     * just the entry name is emitted.
-     *
-     * @param xhtml XHTML event handler
-     * @param entry zip entry
-     * @param stream zip stream
-     * @throws IOException if an IO error occurs
-     * @throws SAXException if a SAX error occurs
-     */
-    private void parseEntry(
-            XHTMLContentHandler xhtml, ZipEntry entry, InputStream stream)
-            throws IOException, SAXException {
-        xhtml.startElement("div", "class", "file");
-        xhtml.element("h1", entry.getName());
-        xhtml.characters("\n");
-
-        try {
-            Metadata metadata = new Metadata();
-            metadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-            getParser().parse(
-                    new CloseShieldInputStream(stream),
-                    new BodyContentHandler(xhtml),
-                    metadata);
-        } catch (TikaException e) {
-            // Could not parse the entry, just skip the content
-        }
-
-        xhtml.characters("\n");
-        xhtml.endElement("div");
-    }
-
-    public Parser getParser() {
-        if (parser == null)
-        {
-            return new AutoDetectParser();
-        }
-        return parser;
-    }
-
-    public void setParser(Parser parser) {
-        this.parser = parser;
-    }
 }

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692207&r1=692206&r2=692207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep  4 12:34:08 
2008
@@ -105,7 +105,7 @@
                 <mime>image/x-xcf</mime>
         </parser>
 
-        <parser name="parse-zip" class="org.apache.tika.parser.zip.ZipParser">
+        <parser name="parse-zip" class="org.apache.tika.parser.pkg.ZipParser">
                 <mime>application/zip</mime>
         </parser>
 


Reply via email to