Author: jukka
Date: Thu Sep 4 09:40:24 2008
New Revision: 692148
URL: http://svn.apache.org/viewvc?rev=692148&view=rev
Log:
TIKA-149: Parser for zip files
Applied a patch by Dave Meikle (added the Apache header to ZipParser.java)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
(with props)
Modified:
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java?rev=692148&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
Thu Sep 4 09:40:24 2008
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.zip;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+/**
+ * Parser for zip archives.
+ * <p>
+ * Every entry of the archive is handed to a delegate parser and the text
+ * it extracts is emitted inside a <code>div class="file"</code> element,
+ * preceded by an <code>h1</code> heading holding the entry name.
+ */
+public class ZipParser extends AbstractParser {
+
+    /** Delegate used for the archive entries; <code>null</code> until set. */
+    private Parser parser;
+
+    /**
+     * Parses the given zip stream and writes one section per entry to the
+     * given handler.
+     *
+     * @param stream   zip archive to read; it is consumed but left open
+     * @param handler  receiver of the generated XHTML SAX events
+     * @param metadata metadata passed on to the XHTML content handler
+     * @throws IOException   if the archive or one of its entries cannot be read
+     * @throws TikaException if the delegate parser fails on an entry
+     * @throws SAXException  if a SAX event cannot be processed
+     */
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, TikaException, SAXException {
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // ZipInputStream.close() also closes the stream it wraps, so the
+        // caller's stream is shielded: closing the zip stream then only
+        // releases the zip stream's own resources.
+        ZipInputStream zis =
+            new ZipInputStream(new CloseShieldInputStream(stream));
+        try {
+            ZipEntry ze;
+            while ((ze = zis.getNextEntry()) != null) {
+                xhtml.startElement("div", "class", "file");
+                xhtml.element("h1", ze.getName());
+
+                // The delegate must not be able to close the zip stream,
+                // otherwise the remaining entries could not be read.
+                ContentHandler content = new BodyContentHandler();
+                getParser().parse(
+                        new CloseShieldInputStream(zis), content, new Metadata());
+
+                xhtml.element("content", content.toString());
+                xhtml.endElement("div");
+
+                zis.closeEntry();
+            }
+        } finally {
+            // Also runs when an entry fails to parse.
+            zis.close();
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * Returns the parser used for the archive entries.
+     *
+     * @return the parser given to {@link #setParser(Parser)}, or a newly
+     *         created {@link AutoDetectParser} if none has been set
+     */
+    public Parser getParser() {
+        if (parser == null) {
+            return new AutoDetectParser();
+        }
+        return parser;
+    }
+
+    /**
+     * Sets the parser used for the archive entries.
+     *
+     * @param parser delegate parser; <code>null</code> restores the default
+     */
+    public void setParser(Parser parser) {
+        this.parser = parser;
+    }
+}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/zip/ZipParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692148&r1=692147&r2=692148&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep 4 09:40:24
2008
@@ -57,7 +57,7 @@
<mime>text/plain</mime>
</parser>
- <parser name="parse-openoffice"
class="org.apache.tika.parser.opendocument.OpenOfficeParser">
+ <parser name="parse-openoffice"
class="org.apache.tika.parser.opendocument.OpenOfficeParser">
<mime>application/vnd.sun.xml.writer</mime>
<mime>application/vnd.oasis.opendocument.text</mime>
<mime>application/vnd.oasis.opendocument.graphics</mime>
@@ -105,6 +105,10 @@
<mime>image/x-xcf</mime>
</parser>
+ <parser name="parse-zip" class="org.apache.tika.parser.zip.ZipParser">
+ <mime>application/zip</mime>
+ </parser>
+
</parsers>
</properties>
\ No newline at end of file
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=692148&r1=692147&r2=692148&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Thu Sep
4 09:40:24 2008
@@ -44,15 +44,15 @@
* FIXME the old mechanism does not work anymore when running the tests
* with Maven - need a resource-based one, but this means more changes
* to classes which rely on filenames.
- *
+ *
* String sep = File.separator; StringTokenizer st = new
* StringTokenizer(System.getProperty( "java.class.path"),
* File.pathSeparator);
- *
+ *
* classDir = new File(st.nextToken());
- *
+ *
* config = classDir.getParent() + sep + "config" + sep + "config.xml";
- *
+ *
* String log4j = classDir.getParent() + sep + "Config" + sep +
"log4j" +
* sep + "log4j.properties";
*/
@@ -171,6 +171,16 @@
assertNotNull(parser);
}
+    public void testZipFileExtraction() throws Exception {
+        // Content read without a MIME type must equal content read as application/zip.
+        File zip = getTestFile("test-documents.zip");
+        String withoutType = ParseUtils.getStringContent(zip, tc);
+        String withType = ParseUtils.getStringContent(zip, tc, "application/zip");
+        assertEquals(withoutType, withType);
+
+        assertNotNull(tc.getParser("application/zip"));
+    }
+
public void testZipExtraction() throws Exception {
File zip = getTestFile("test-documents.zip");
List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);