Author: jukka
Date: Mon Jun 15 23:12:26 2009
New Revision: 785022
URL: http://svn.apache.org/viewvc?rev=785022&view=rev
Log:
TIKA-204: Use commons-compress for parsing packages
Consolidate package parsing code to PackageParser based on the shared
ArchiveInputStream functionality from Commons Compress.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
import java.io.IOException;
import java.io.InputStream;
-import org.apache.commons.compress.archivers.ar.ArArchiveEntry;
import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -39,28 +37,15 @@
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, TikaException, SAXException {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
// At the end we want to close the ar stream to release any associated
// resources, but the underlying document stream should not be closed
ArArchiveInputStream ar =
new ArArchiveInputStream(new CloseShieldInputStream(stream));
try {
- ArArchiveEntry entry = ar.getNextArEntry();
- while (entry != null) {
- if (!entry.isDirectory()) {
- Metadata entrydata = new Metadata();
- entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
- parseEntry(ar, xhtml, entrydata);
- }
- entry = ar.getNextArEntry();
- }
+ parseArchive(ar, handler, metadata);
} finally {
ar.close();
}
-
- xhtml.endDocument();
}
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java Mon Jun 15 23:12:26 2009
@@ -23,17 +23,20 @@
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * Gzip parser.
+ * Bzip2 parser.
*/
-public class Bzip2Parser extends PackageParser {
+public class Bzip2Parser extends DelegatingParser {
/**
- * Parses the given stream as a gzip file.
+ * Parses the given stream as a bzip2 file.
*/
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
@@ -45,7 +48,7 @@
// At the end we want to close the bzip2 stream to release any associated
// resources, but the underlying document stream should not be closed
- InputStream gzip =
+ InputStream bzip2 =
new BZip2CompressorInputStream(new CloseShieldInputStream(stream));
try {
Metadata entrydata = new Metadata();
@@ -62,9 +65,14 @@
}
entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
}
- parseEntry(gzip, xhtml, entrydata);
+ // Use the delegate parser to parse the compressed document
+ super.parse(
+ new CloseShieldInputStream(bzip2),
+ new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ entrydata);
} finally {
- gzip.close();
+ bzip2.close();
}
xhtml.endDocument();
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
import java.io.IOException;
import java.io.InputStream;
-import org.apache.commons.compress.archivers.cpio.CpioArchiveEntry;
import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -39,28 +37,15 @@
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, TikaException, SAXException {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
// At the end we want to close the cpio stream to release any associated
// resources, but the underlying document stream should not be closed
CpioArchiveInputStream cpio =
new CpioArchiveInputStream(new CloseShieldInputStream(stream));
try {
- CpioArchiveEntry entry = cpio.getNextCPIOEntry();
- while (entry != null) {
- if (!entry.isDirectory()) {
- Metadata entrydata = new Metadata();
- entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
- parseEntry(cpio, xhtml, entrydata);
- }
- entry = cpio.getNextCPIOEntry();
- }
+ parseArchive(cpio, handler, metadata);
} finally {
cpio.close();
}
-
- xhtml.endDocument();
}
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java Mon Jun 15 23:12:26 2009
@@ -24,6 +24,9 @@
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -31,7 +34,7 @@
/**
* Gzip parser.
*/
-public class GzipParser extends PackageParser {
+public class GzipParser extends DelegatingParser {
/**
* Parses the given stream as a gzip file.
@@ -51,12 +54,17 @@
try {
Metadata entrydata = new Metadata();
String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name != null) {
+ if (name != null && name.length() > 0) {
entrydata.set(
Metadata.RESOURCE_NAME_KEY,
GzipUtils.getUncompressedFilename(name));
}
- parseEntry(gzip, xhtml, entrydata);
+ // Use the delegate parser to parse the compressed document
+ super.parse(
+ new CloseShieldInputStream(gzip),
+ new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ entrydata);
} finally {
gzip.close();
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java Mon Jun 15 23:12:26 2009
@@ -19,6 +19,8 @@
import java.io.IOException;
import java.io.InputStream;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -26,6 +28,7 @@
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
@@ -41,36 +44,49 @@
public abstract class PackageParser extends DelegatingParser {
/**
- * Parses the given entry entry using the delegate parser instance.
+ * Parses the given stream as a package of multiple underlying files.
+ * The package entries are parsed using the delegate parser instance.
* It is not an error if the entry can not be parsed, in that case
* just the entry name (if given) is emitted.
*
- * @param stream entry stream
- * @param xhtml XHTML event handler
- * @param metadata entry metadata
+ * @param stream package stream
+ * @param handler content handler
+ * @param metadata package metadata
* @throws IOException if an IO error occurs
* @throws SAXException if a SAX error occurs
*/
- protected void parseEntry(
- InputStream stream, XHTMLContentHandler xhtml, Metadata metadata)
+ protected void parseArchive(
+ ArchiveInputStream archive, ContentHandler handler, Metadata metadata)
throws IOException, SAXException {
- xhtml.startElement("div", "class", "package-entry");
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (name != null) {
- xhtml.element("h1", name);
+ ArchiveEntry entry = archive.getNextEntry();
+ while (entry != null) {
+ if (!entry.isDirectory()) {
+ xhtml.startElement("div", "class", "package-entry");
+ Metadata entrydata = new Metadata();
+ String name = entry.getName();
+ if (name != null && name.length() > 0) {
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+ xhtml.element("h1", name);
+ }
+ try {
+ // Use the delegate parser to parse this entry
+ super.parse(
+ new CloseShieldInputStream(archive),
+ new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ entrydata);
+ } catch (TikaException e) {
+ // Could not parse the entry, just skip the content
+ }
+ xhtml.endElement("div");
+ }
+ entry = archive.getNextEntry();
}
- try {
- super.parse(
- new CloseShieldInputStream(stream),
- new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
- metadata);
- } catch (TikaException e) {
- // Could not parse the entry, just skip the content
- }
-
- xhtml.endElement("div");
+ xhtml.endDocument();
}
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
import java.io.IOException;
import java.io.InputStream;
-import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -41,28 +39,15 @@
throws IOException, TikaException, SAXException {
metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
// At the end we want to close the tar stream to release any associated
// resources, but the underlying document stream should not be closed
TarArchiveInputStream tar =
new TarArchiveInputStream(new CloseShieldInputStream(stream));
try {
- TarArchiveEntry entry = tar.getNextTarEntry();
- while (entry != null) {
- if (!entry.isDirectory()) {
- Metadata entrydata = new Metadata();
- entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
- parseEntry(tar, xhtml, entrydata);
- }
- entry = tar.getNextTarEntry();
- }
+ parseArchive(tar, handler, metadata);
} finally {
tar.close();
}
-
- xhtml.endDocument();
}
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java?rev=785022&r1=785021&r2=785022&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java Mon Jun 15 23:12:26 2009
@@ -19,12 +19,10 @@
import java.io.IOException;
import java.io.InputStream;
-import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -41,26 +39,15 @@
throws IOException, TikaException, SAXException {
metadata.set(Metadata.CONTENT_TYPE, "application/zip");
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
// At the end we want to close the Zip stream to release any associated
// resources, but the underlying document stream should not be closed
ZipArchiveInputStream zip =
new ZipArchiveInputStream(new CloseShieldInputStream(stream));
try {
- ArchiveEntry entry = zip.getNextEntry();
- while (entry != null) {
- Metadata entrydata = new Metadata();
- entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
- parseEntry(zip, xhtml, entrydata);
- entry = zip.getNextEntry();
- }
+ parseArchive(zip, handler, metadata);
} finally {
zip.close();
}
-
- xhtml.endDocument();
}
}