Author: jukka
Date: Thu Sep  4 10:03:51 2008
New Revision: 692165

URL: http://svn.apache.org/viewvc?rev=692165&view=rev
Log:
TIKA-149: Parser for zip files

Further refinement: add javadocs, set CONTENT_TYPE metadata, don't close the 
document stream, etc.

Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=692165&r1=692164&r2=692165&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Thu Sep  4 10:03:51 2008
@@ -60,7 +60,9 @@
 
 25. TIKA-146 - Upgrade to POI 3.1 (Jukka Zitting)
 
-26. TIKA-99 - Support external parser programs (Jukka Zitting)
+26. TIKA-99  - Support external parser programs (Jukka Zitting)
+
+27. TIKA-149 - Parser for Zip files (Dave Meikle & Jukka Zitting)
 
 Release 0.1-incubating - 12/27/2007
 

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java?rev=692165&r1=692164&r2=692165&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java Thu Sep  4 10:03:51 2008
@@ -39,33 +39,60 @@
 
     private Parser parser;
 
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
+    /**
+     * Parses the given stream as a Zip file.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, TikaException, SAXException {
+        metadata.set(Metadata.CONTENT_TYPE, "application/zip");
+
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
-        ZipInputStream zis = new ZipInputStream(stream);
-        ZipEntry ze;
-        while ((ze = zis.getNextEntry()) != null) {
-            parseEntry(xhtml, ze, zis);
-            zis.closeEntry();
+        // At the end we want to close the Zip stream to release any associated
+        // resources, but the underlying document stream should not be closed
+        ZipInputStream zip =
+            new ZipInputStream(new CloseShieldInputStream(stream));
+        try {
+            ZipEntry entry = zip.getNextEntry();
+            while (entry != null) {
+                parseEntry(xhtml, entry, zip);
+                entry = zip.getNextEntry();
+            }
+        } finally {
+            zip.close();
         }
-        zis.close();
 
         xhtml.endDocument();
     }
 
+    /**
+     * Parses the given Zip entry using the underlying parser instance.
+     * It is not an error if the entry can not be parsed, in that case
+     * just the entry name is emitted.
+     *
+     * @param xhtml XHTML event handler
+     * @param entry zip entry
+     * @param stream zip stream
+     * @throws IOException if an IO error occurs
+     * @throws SAXException if a SAX error occurs
+     */
     private void parseEntry(
             XHTMLContentHandler xhtml, ZipEntry entry, InputStream stream)
-            throws IOException, TikaException, SAXException {
+            throws IOException, SAXException {
         xhtml.startElement("div", "class", "file");
         xhtml.element("h1", entry.getName());
 
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-        ContentHandler content = new BodyContentHandler();
-        getParser().parse(new CloseShieldInputStream(stream), content, metadata);
-        xhtml.element("content", content.toString());
+        try {
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
+            ContentHandler content = new BodyContentHandler();
+            getParser().parse(new CloseShieldInputStream(stream), content, metadata);
+            xhtml.element("p", content.toString());
+        } catch (TikaException e) {
+            // Could not parse the entry, just skip the content
+        }
 
         xhtml.endElement("div");
     }


Reply via email to