Author: jukka
Date: Mon Jun 15 23:12:26 2009
New Revision: 785022

URL: http://svn.apache.org/viewvc?rev=785022&view=rev
Log:
TIKA-204: Use commons-compress for parsing packages

Consolidate package parsing code to PackageParser based on the shared 
ArchiveInputStream functionality from Commons Compress.

Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
 import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.commons.compress.archivers.ar.ArArchiveEntry;
 import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -39,28 +37,15 @@
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, TikaException, SAXException {
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
         // At the end we want to close the ar stream to release any associated
         // resources, but the underlying document stream should not be closed
         ArArchiveInputStream ar =
             new ArArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            ArArchiveEntry entry = ar.getNextArEntry();
-            while (entry != null) {
-                if (!entry.isDirectory()) {
-                    Metadata entrydata = new Metadata();
-                    entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-                    parseEntry(ar, xhtml, entrydata);
-                }
-                entry = ar.getNextArEntry();
-            }
+            parseArchive(ar, handler, metadata);
         } finally {
             ar.close();
         }
-
-        xhtml.endDocument();
     }
 
 }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java Mon Jun 15 23:12:26 2009
@@ -23,17 +23,20 @@
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * Gzip parser.
+ * Bzip2 parser.
  */
-public class Bzip2Parser extends PackageParser {
+public class Bzip2Parser extends DelegatingParser {
 
     /**
-     * Parses the given stream as a gzip file.
+     * Parses the given stream as a bzip2 file.
      */
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
@@ -45,7 +48,7 @@
 
         // At the end we want to close the bzip2 stream to release any associated
         // resources, but the underlying document stream should not be closed
-        InputStream gzip =
+        InputStream bzip2 =
             new BZip2CompressorInputStream(new CloseShieldInputStream(stream));
         try {
             Metadata entrydata = new Metadata();
@@ -62,9 +65,14 @@
                 }
                 entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
             }
-            parseEntry(gzip, xhtml, entrydata);
+            // Use the delegate parser to parse the compressed document
+            super.parse(
+                    new CloseShieldInputStream(bzip2),
+                    new EmbeddedContentHandler(
+                            new BodyContentHandler(xhtml)),
+                    entrydata);
         } finally {
-            gzip.close();
+            bzip2.close();
         }
 
         xhtml.endDocument();

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
 import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.commons.compress.archivers.cpio.CpioArchiveEntry;
 import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -39,28 +37,15 @@
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, TikaException, SAXException {
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
         // At the end we want to close the cpio stream to release any associated
         // resources, but the underlying document stream should not be closed
         CpioArchiveInputStream cpio =
             new CpioArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            CpioArchiveEntry entry = cpio.getNextCPIOEntry();
-            while (entry != null) {
-                if (!entry.isDirectory()) {
-                    Metadata entrydata = new Metadata();
-                    entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-                    parseEntry(cpio, xhtml, entrydata);
-                }
-                entry = cpio.getNextCPIOEntry();
-            }
+            parseArchive(cpio, handler, metadata);
         } finally {
             cpio.close();
         }
-
-        xhtml.endDocument();
     }
 
 }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java Mon Jun 15 23:12:26 2009
@@ -24,6 +24,9 @@
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -31,7 +34,7 @@
 /**
  * Gzip parser.
  */
-public class GzipParser extends PackageParser {
+public class GzipParser extends DelegatingParser {
 
     /**
      * Parses the given stream as a gzip file.
@@ -51,12 +54,17 @@
         try {
             Metadata entrydata = new Metadata();
             String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-            if (name != null) {
+            if (name != null && name.length() > 0) {
                 entrydata.set(
                         Metadata.RESOURCE_NAME_KEY,
                         GzipUtils.getUncompressedFilename(name));
             }
-            parseEntry(gzip, xhtml, entrydata);
+            // Use the delegate parser to parse the compressed document
+            super.parse(
+                    new CloseShieldInputStream(gzip),
+                    new EmbeddedContentHandler(
+                            new BodyContentHandler(xhtml)),
+                    entrydata);
         } finally {
             gzip.close();
         }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Mon Jun 15 23:12:26 2009
@@ -19,6 +19,8 @@
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -26,6 +28,7 @@
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
@@ -41,36 +44,49 @@
 public abstract class PackageParser extends DelegatingParser {
 
     /**
-     * Parses the given entry entry using the delegate parser instance.
+     * Parses the given stream as a package of multiple underlying files.
+     * The package entries are parsed using the delegate parser instance.
      * It is not an error if the entry can not be parsed, in that case
      * just the entry name (if given) is emitted.
      *
-     * @param stream entry stream
-     * @param xhtml XHTML event handler
-     * @param metadata entry metadata
+     * @param stream package stream
+     * @param handler content handler
+     * @param metadata package metadata
      * @throws IOException if an IO error occurs
      * @throws SAXException if a SAX error occurs
      */
-    protected void parseEntry(
-            InputStream stream, XHTMLContentHandler xhtml, Metadata metadata)
+    protected void parseArchive(
+            ArchiveInputStream archive, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException {
-        xhtml.startElement("div", "class", "package-entry");
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
 
-        String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
-        if (name != null) {
-            xhtml.element("h1", name);
+        ArchiveEntry entry = archive.getNextEntry();
+        while (entry != null) {
+            if (!entry.isDirectory()) {
+                xhtml.startElement("div", "class", "package-entry");
+                Metadata entrydata = new Metadata();
+                String name = entry.getName();
+                if (name != null && name.length() > 0) {
+                    entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+                    xhtml.element("h1", name);
+                }
+                try {
+                    // Use the delegate parser to parse this entry
+                    super.parse(
+                            new CloseShieldInputStream(archive),
+                            new EmbeddedContentHandler(
+                                    new BodyContentHandler(xhtml)),
+                            entrydata);
+                } catch (TikaException e) {
+                    // Could not parse the entry, just skip the content
+                }
+                xhtml.endElement("div");
+            }
+            entry = archive.getNextEntry();
         }
 
-        try {
-            super.parse(
-                    new CloseShieldInputStream(stream),
-                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
-                    metadata);
-        } catch (TikaException e) {
-            // Could not parse the entry, just skip the content
-        }
-
-        xhtml.endElement("div");
+        xhtml.endDocument();
     }
 
 }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
 import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -41,28 +39,15 @@
             throws IOException, TikaException, SAXException {
         metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
 
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
         // At the end we want to close the tar stream to release any associated
         // resources, but the underlying document stream should not be closed
         TarArchiveInputStream tar =
             new TarArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            TarArchiveEntry entry = tar.getNextTarEntry();
-            while (entry != null) {
-                if (!entry.isDirectory()) {
-                    Metadata entrydata = new Metadata();
-                    entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-                    parseEntry(tar, xhtml, entrydata);
-                }
-                entry = tar.getNextTarEntry();
-            }
+            parseArchive(tar, handler, metadata);
         } finally {
             tar.close();
         }
-
-        xhtml.endDocument();
     }
 
 }

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
 import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.commons.compress.archivers.ArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -41,26 +39,15 @@
             throws IOException, TikaException, SAXException {
         metadata.set(Metadata.CONTENT_TYPE, "application/zip");
 
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
         // At the end we want to close the Zip stream to release any associated
         // resources, but the underlying document stream should not be closed
         ZipArchiveInputStream zip =
             new ZipArchiveInputStream(new CloseShieldInputStream(stream));
         try {
-            ArchiveEntry entry = zip.getNextEntry();
-            while (entry != null) {
-                Metadata entrydata = new Metadata();
-                entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
-                parseEntry(zip, xhtml, entrydata);
-                entry = zip.getNextEntry();
-            }
+            parseArchive(zip, handler, metadata);
         } finally {
             zip.close();
         }
-
-        xhtml.endDocument();
     }
 
 }


Reply via email to