Author: jukka
Date: Thu Sep 4 12:34:08 2008
New Revision: 692207
URL: http://svn.apache.org/viewvc?rev=692207&view=rev
Log:
TIKA-149: Parser for zip files
Moved the ZipParser class to o.a.t.parser.pkg and extracted an abstract
PackageParser base class to better support other package formats.
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ (props
changed)
- copied from r692181,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
incubator/tika/trunk/src/main/resources/tika-config.xml
Propchange: incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/
------------------------------------------------------------------------------
svn:mergeinfo =
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java?rev=692207&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
Thu Sep 4 12:34:08 2008
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Abstract base class for parsers that deal with package formats.
+ * Subclasses can call the
+ * {@link #parseEntry(InputStream, XHTMLContentHandler, Metadata)}
+ * method to parse the given package entry using the configured
+ * entry parser. The entries will be written to the XHTML event stream
+ * as &lt;div class="package-entry"&gt; elements that contain the
+ * (optional) entry name as a &lt;h1&gt; element and the full
+ * structured body content of the parsed entry.
+ */
+public abstract class PackageParser extends AbstractParser {
+
+ /**
+ * The parser instance used to parse package entries.
+ */
+ private Parser parser;
+
+ /**
+ * Returns the parser instance used to parse package entries.
+ *
+ * @return entry parser
+ */
+ public Parser getParser() {
+ Parser parser = this.parser;
+ if (parser == null) {
+ parser = new AutoDetectParser();
+ }
+ return parser;
+ }
+
+ /**
+ * Sets the parser instance used to parse package entries.
+ *
+ * @param parser entry parser
+ */
+ public void setParser(Parser parser) {
+ this.parser = parser;
+ }
+
+ /**
+ * Parses the given entry using the underlying parser instance.
+ * It is not an error if the entry can not be parsed, in that case
+ * just the entry name (if given) is emitted.
+ *
+ * @param stream entry stream
+ * @param xhtml XHTML event handler
+ * @param metadata entry metadata
+ * @throws IOException if an IO error occurs
+ * @throws SAXException if a SAX error occurs
+ */
+ protected void parseEntry(
+ InputStream stream, XHTMLContentHandler xhtml, Metadata metadata)
+ throws IOException, SAXException {
+ xhtml.startElement("div", "class", "package-entry");
+
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name != null) {
+ xhtml.element("h1", name);
+ xhtml.characters("\n");
+ }
+
+ try {
+ getParser().parse(
+ new CloseShieldInputStream(stream),
+ new BodyContentHandler(xhtml),
+ metadata);
+ xhtml.characters("\n");
+ } catch (TikaException e) {
+ // Could not parse the entry, just skip the content
+ }
+
+ xhtml.endElement("div");
+ }
+
+}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java?rev=692207&r1=692181&r2=692207&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/ZipParser.java
Thu Sep 4 12:34:08 2008
@@ -14,30 +14,24 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.zip;
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
-
/**
* Zip File Parser.
*/
-public class ZipParser extends AbstractParser {
-
- private Parser parser;
+public class ZipParser extends PackageParser {
/**
* Parses the given stream as a Zip file.
@@ -57,7 +51,9 @@
try {
ZipEntry entry = zip.getNextEntry();
while (entry != null) {
- parseEntry(xhtml, entry, zip);
+ Metadata entrydata = new Metadata();
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
+ parseEntry(zip, xhtml, entrydata);
entry = zip.getNextEntry();
}
} finally {
@@ -67,48 +63,4 @@
xhtml.endDocument();
}
- /**
- * Parses the given Zip entry using the underlying parser instance.
- * It is not an error if the entry can not be parsed, in that case
- * just the entry name is emitted.
- *
- * @param xhtml XHTML event handler
- * @param entry zip entry
- * @param stream zip stream
- * @throws IOException if an IO error occurs
- * @throws SAXException if a SAX error occurs
- */
- private void parseEntry(
- XHTMLContentHandler xhtml, ZipEntry entry, InputStream stream)
- throws IOException, SAXException {
- xhtml.startElement("div", "class", "file");
- xhtml.element("h1", entry.getName());
- xhtml.characters("\n");
-
- try {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
- getParser().parse(
- new CloseShieldInputStream(stream),
- new BodyContentHandler(xhtml),
- metadata);
- } catch (TikaException e) {
- // Could not parse the entry, just skip the content
- }
-
- xhtml.characters("\n");
- xhtml.endElement("div");
- }
-
- public Parser getParser() {
- if (parser == null)
- {
- return new AutoDetectParser();
- }
- return parser;
- }
-
- public void setParser(Parser parser) {
- this.parser = parser;
- }
}
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692207&r1=692206&r2=692207&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep 4 12:34:08
2008
@@ -105,7 +105,7 @@
<mime>image/x-xcf</mime>
</parser>
- <parser name="parse-zip" class="org.apache.tika.parser.zip.ZipParser">
+ <parser name="parse-zip" class="org.apache.tika.parser.pkg.ZipParser">
<mime>application/zip</mime>
</parser>