Author: jukka
Date: Thu Feb 18 15:37:08 2010
New Revision: 911448
URL: http://svn.apache.org/viewvc?rev=911448&view=rev
Log:
TIKA-378: TikaConfig should notify users if it cannot initialize some parser
Make the package parser loadable even when the commons-collections library is
not present. Consolidate handling of all package and compression formats into a
single class.
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
(with props)
Removed:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ArParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/Bzip2Parser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CpioParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/GzipParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TarParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java?rev=911448&view=auto
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
(added)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
Thu Feb 18 15:37:08 2010
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
+import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Extractor for package and compression formats. The format of a stream is
+ * guessed from its first byte, after which the stream is unpacked and its
+ * content is passed to the delegate parser found in the parse context (or
+ * to {@link EmptyParser} when the context has none). Archive entries are
+ * written to the XHTML event stream as &lt;div class="package-entry"&gt;
+ * elements that contain the (optional) entry name as a &lt;h1&gt; element
+ * and the full structured body content of the parsed entry.
+ */
+class PackageExtractor {
+
+    /** Handler that receives the XHTML events of the extracted content. */
+    private final ContentHandler handler;
+
+    /** Metadata of the package document itself. */
+    private final Metadata metadata;
+
+    /** Parse context, passed on unchanged to the delegate parser. */
+    private final ParseContext context;
+
+    /** Delegate parser used for archive entries and compressed content. */
+    private final Parser parser;
+
+    /**
+     * Creates an extractor that writes to the given handler. The delegate
+     * parser is looked up from the parse context; {@link EmptyParser} is
+     * used when the context does not contain a parser.
+     *
+     * @param handler content handler that receives the XHTML output
+     * @param metadata metadata of the package document
+     * @param context parse context
+     */
+    public PackageExtractor(
+            ContentHandler handler, Metadata metadata, ParseContext context) {
+        this.handler = handler;
+        this.metadata = metadata;
+        this.context = context;
+        this.parser = context.get(Parser.class, EmptyParser.INSTANCE);
+    }
+
+    /**
+     * Guesses the packaging or compression format of the given stream from
+     * its first byte, records the matching content type in the package
+     * metadata and extracts the content as XHTML events. A stream whose
+     * first byte matches none of the known formats is assumed to be tar.
+     *
+     * @param stream package or compressed stream; the stream itself is
+     *               shielded from being closed by this method
+     * @throws IOException if an IO error occurs
+     * @throws SAXException if a SAX error occurs
+     * @throws TikaException if compressed content can not be parsed
+     */
+    public void parse(InputStream stream)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // At the end we want to close the package/compression stream to
+        // release any associated resources, but the underlying document
+        // stream should not be closed
+        stream = new CloseShieldInputStream(stream);
+
+        // Capture the first byte to determine the packaging/compression format
+        if (!stream.markSupported()) {
+            stream = new BufferedInputStream(stream);
+        }
+        stream.mark(1);
+        int b = stream.read();
+        stream.reset();
+
+        if (b == 'B') { // BZh...
+            metadata.set(Metadata.CONTENT_TYPE, "application/x-bzip");
+            parseBZip2(stream, xhtml);
+        } else if (b == 0x1f) { // \037\213...
+            metadata.set(Metadata.CONTENT_TYPE, "application/x-gzip");
+            parseGZIP(stream, xhtml);
+        } else if (b == 'P') { // PK\003\004...
+            metadata.set(Metadata.CONTENT_TYPE, "application/zip");
+            parse(new ZipArchiveInputStream(stream), xhtml);
+        } else if (b == '0' || b == 0x71 || b == 0xc7) { // looks like cpio
+            metadata.set(Metadata.CONTENT_TYPE, "application/x-cpio");
+            parse(new CpioArchiveInputStream(stream), xhtml);
+        } else if (b == '!') { // !<arch>\n
+            // The ar magic is the string "!<arch>\n". In file(1) magic
+            // notation it is written "=!<arch>", where the leading '=' is
+            // a comparison operator and not part of the data, so the first
+            // byte of an ar archive is '!' rather than '='
+            metadata.set(Metadata.CONTENT_TYPE, "application/x-archive");
+            parse(new ArArchiveInputStream(stream), xhtml);
+        } else { // assume tar
+            metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
+            parse(new TarArchiveInputStream(stream), xhtml);
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Decompresses the given gzip stream and parses the uncompressed
+     * content with the delegate parser. If the package metadata has a
+     * non-empty resource name, the matching uncompressed file name is
+     * passed on as the resource name of the content.
+     *
+     * @param stream gzip compressed stream
+     * @param xhtml handler that receives the body of the parsed content
+     * @throws IOException if an IO error occurs
+     * @throws SAXException if a SAX error occurs
+     * @throws TikaException if the uncompressed content can not be parsed
+     */
+    private void parseGZIP(InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        InputStream gzip = new GZIPInputStream(stream);
+        try {
+            Metadata entrydata = new Metadata();
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null && name.length() > 0) {
+                entrydata.set(
+                        Metadata.RESOURCE_NAME_KEY,
+                        GzipUtils.getUncompressedFilename(name));
+            }
+            // Use the delegate parser to parse the compressed document
+            parser.parse(
+                    new CloseShieldInputStream(gzip),
+                    new EmbeddedContentHandler(
+                            new BodyContentHandler(xhtml)),
+                    entrydata, context);
+        } finally {
+            gzip.close();
+        }
+    }
+
+    /**
+     * Decompresses the given bzip2 stream and parses the uncompressed
+     * content with the delegate parser. If the package metadata has a
+     * resource name, it is passed on as the resource name of the content
+     * with a trailing .tbz or .tbz2 replaced by .tar and a trailing .bz
+     * or .bz2 removed.
+     *
+     * @param stream bzip2 compressed stream
+     * @param xhtml handler that receives the body of the parsed content
+     * @throws IOException if an IO error occurs
+     * @throws SAXException if a SAX error occurs
+     * @throws TikaException if the uncompressed content can not be parsed
+     */
+    private void parseBZip2(InputStream stream, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        InputStream bzip2 = new BZip2CompressorInputStream(stream);
+        try {
+            Metadata entrydata = new Metadata();
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (name != null) {
+                if (name.endsWith(".tbz")) {
+                    name = name.substring(0, name.length() - 4) + ".tar";
+                } else if (name.endsWith(".tbz2")) {
+                    name = name.substring(0, name.length() - 5) + ".tar";
+                } else if (name.endsWith(".bz")) {
+                    name = name.substring(0, name.length() - 3);
+                } else if (name.endsWith(".bz2")) {
+                    name = name.substring(0, name.length() - 4);
+                }
+                entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+            }
+            // Use the delegate parser to parse the compressed document
+            parser.parse(
+                    new CloseShieldInputStream(bzip2),
+                    new EmbeddedContentHandler(
+                            new BodyContentHandler(xhtml)),
+                    entrydata, context);
+        } finally {
+            bzip2.close();
+        }
+    }
+
+    /**
+     * Parses the given stream as a package of multiple underlying files.
+     * The package entries are parsed using the delegate parser instance.
+     * It is not an error if the entry can not be parsed, in that case
+     * just the entry name (if given) is emitted. Directory entries are
+     * skipped. The archive stream is closed when this method returns,
+     * whether normally or with an exception.
+     *
+     * @param archive package stream
+     * @param xhtml handler that receives the package entries
+     * @throws IOException if an IO error occurs
+     * @throws SAXException if a SAX error occurs
+     */
+    public void parse(ArchiveInputStream archive, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        try {
+            ArchiveEntry entry = archive.getNextEntry();
+            while (entry != null) {
+                if (!entry.isDirectory()) {
+                    xhtml.startElement("div", "class", "package-entry");
+                    Metadata entrydata = new Metadata();
+                    String name = entry.getName();
+                    if (name != null && name.length() > 0) {
+                        entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
+                        xhtml.element("h1", name);
+                    }
+                    try {
+                        // Use the delegate parser to parse this entry. The
+                        // close shield keeps the delegate from closing the
+                        // archive stream that the next entries are read from
+                        parser.parse(
+                                new CloseShieldInputStream(archive),
+                                new EmbeddedContentHandler(
+                                        new BodyContentHandler(xhtml)),
+                                entrydata, context);
+                    } catch (TikaException e) {
+                        // Could not parse the entry, just skip the content
+                    }
+                    xhtml.endElement("div");
+                }
+                entry = archive.getNextEntry();
+            }
+        } finally {
+            archive.close();
+        }
+    }
+
+}
Propchange:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageExtractor.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=911448&r1=911447&r2=911448&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
Thu Feb 18 15:37:08 2010
@@ -17,76 +17,57 @@
package org.apache.tika.parser.pkg;
import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
-import org.apache.commons.compress.archivers.ArchiveEntry;
-import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.DelegatingParser;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * Abstract base class for parsers that deal with package formats.
- * Subclasses can call the
- * {@link #parseArchive(ArchiveInputStream, ContentHandler, Metadata, ParseContext)}
- * method to parse the package stream. Package entries will be written
- * to the XHTML event stream as &lt;div class="package-entry"&gt; elements
- * that contain the (optional) entry name as a &lt;h1&gt; element and the full
- * structured body content of the parsed entry.
+ * Parser for various packaging and compression formats. Package entries will
+ * be written to the XHTML event stream as &lt;div class="package-entry"&gt;
+ * elements that contain the (optional) entry name as a &lt;h1&gt; element
+ * and the full structured body content of the parsed entry.
*/
-public abstract class PackageParser extends DelegatingParser {
+public class PackageParser implements Parser {
- /**
- * Parses the given stream as a package of multiple underlying files.
- * The package entries are parsed using the delegate parser instance.
- * It is not an error if the entry can not be parsed, in that case
- * just the entry name (if given) is emitted.
- *
- * @param stream package stream
- * @param handler content handler
- * @param metadata package metadata
- * @throws IOException if an IO error occurs
- * @throws SAXException if a SAX error occurs
- */
- protected void parseArchive(
- ArchiveInputStream archive, ContentHandler handler,
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+ MediaType.application("x-archive"),
+ MediaType.application("x-bzip"),
+ MediaType.application("x-bzip2"),
+ MediaType.application("x-cpio"),
+ MediaType.application("x-gtar"),
+ MediaType.application("x-gzip"),
+ MediaType.application("x-tar"),
+ MediaType.application("zip"))));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
- throws IOException, SAXException {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- ArchiveEntry entry = archive.getNextEntry();
- while (entry != null) {
- if (!entry.isDirectory()) {
- xhtml.startElement("div", "class", "package-entry");
- Metadata entrydata = new Metadata();
- String name = entry.getName();
- if (name != null && name.length() > 0) {
- entrydata.set(Metadata.RESOURCE_NAME_KEY, name);
- xhtml.element("h1", name);
- }
- try {
- // Use the delegate parser to parse this entry
- super.parse(
- new CloseShieldInputStream(archive),
- new EmbeddedContentHandler(
- new BodyContentHandler(xhtml)),
- entrydata, context);
- } catch (TikaException e) {
- // Could not parse the entry, just skip the content
- }
- xhtml.endElement("div");
- }
- entry = archive.getNextEntry();
- }
+ throws IOException, SAXException, TikaException {
+ new PackageExtractor(handler, metadata, context).parse(stream);
+ }
- xhtml.endDocument();
+ /**
+ * @deprecated This method will be removed in Apache Tika 1.0.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ parse(stream, handler, metadata, new ParseContext());
}
}
Modified:
lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=911448&r1=911447&r2=911448&view=diff
==============================================================================
---
lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
(original)
+++
lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Thu Feb 18 15:37:08 2010
@@ -26,10 +26,7 @@
org.apache.tika.parser.mp3.Mp3Parser
org.apache.tika.parser.odf.OpenDocumentParser
org.apache.tika.parser.pdf.PDFParser
-org.apache.tika.parser.pkg.Bzip2Parser
-org.apache.tika.parser.pkg.GzipParser
-org.apache.tika.parser.pkg.TarParser
-org.apache.tika.parser.pkg.ZipParser
+org.apache.tika.parser.pkg.PackageParser
org.apache.tika.parser.rtf.RTFParser
org.apache.tika.parser.txt.TXTParser
org.apache.tika.parser.video.FLVParser