Author: jukka
Date: Thu Sep 4 13:33:07 2008
New Revision: 692227
URL: http://svn.apache.org/viewvc?rev=692227&view=rev
Log:
TIKA-150: Parser for tar files
Added a tar parser implementation based on tar parsing code from Apache Ant. I
preferred to copy the classes over to Tika instead of adding a dependency on Ant.
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/
- copied from r692208, ant/core/trunk/src/main/org/apache/tools/tar/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ (props
changed)
- copied from r692181,
incubator/tika/trunk/src/test/java/org/apache/tika/parser/zip/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar
(with props)
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarOutputStream.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/zip/
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=692227&r1=692226&r2=692227&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Thu Sep 4 13:33:07 2008
@@ -64,6 +64,8 @@
27. TIKA-149 - Parser for Zip files (Dave Meikle & Jukka Zitting)
+28. TIKA-150 - Parser for tar files (Jukka Zitting)
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java?rev=692227&view=auto
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java
(added)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/TarParser.java
Thu Sep 4 13:33:07 2008
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.pkg.tar.TarEntry;
+import org.apache.tika.parser.pkg.tar.TarInputStream;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Tar parser. Reports the content type "application/x-tar" and hands every non-directory archive entry to parseEntry (not defined in this class; presumably inherited from PackageParser).
+ */
+public class TarParser extends PackageParser {
+
+ /**
+ * Parses the given stream as a tar file, emitting XHTML events to the handler; the given stream itself is left open for the caller to close.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, TikaException, SAXException {
+ metadata.set(Metadata.CONTENT_TYPE, "application/x-tar");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ // At the end we want to close the tar stream to release any associated
+ // resources, but the underlying document stream should not be closed (hence the CloseShieldInputStream wrapper)
+ TarInputStream tar =
+ new TarInputStream(new CloseShieldInputStream(stream));
+ try {
+ TarEntry entry = tar.getNextEntry();
+ while (entry != null) {
+ if (!entry.isDirectory()) {
+ Metadata entrydata = new Metadata();
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, entry.getName());
+ parseEntry(tar, xhtml, entrydata);
+ }
+ entry = tar.getNextEntry();
+ }
+ } finally {
+ tar.close();
+ }
+
+ xhtml.endDocument();
+ }
+
+}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarBuffer.java
Thu Sep 4 13:33:07 2008
@@ -19,9 +19,11 @@
/*
* This package is based on the work done by Timothy Gerard Endres
* ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
import java.io.InputStream;
import java.io.OutputStream;
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarConstants.java
Thu Sep 4 13:33:07 2008
@@ -19,9 +19,11 @@
/*
* This package is based on the work done by Timothy Gerard Endres
* ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
/**
* This interface contains all the definitions used in the package.
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarEntry.java
Thu Sep 4 13:33:07 2008
@@ -19,9 +19,11 @@
/*
* This package is based on the work done by Timothy Gerard Endres
* ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
import java.io.File;
import java.util.Date;
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarInputStream.java
Thu Sep 4 13:33:07 2008
@@ -19,9 +19,11 @@
/*
* This package is based on the work done by Timothy Gerard Endres
* ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
import java.io.FilterInputStream;
import java.io.IOException;
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java?rev=692227&r1=692208&r2=692227&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/pkg/tar/TarUtils.java
Thu Sep 4 13:33:07 2008
@@ -19,9 +19,11 @@
/*
* This package is based on the work done by Timothy Gerard Endres
* ([EMAIL PROTECTED]) to whom the Ant project is very grateful for his great code.
+ *
+ * This package has since been copied from Apache Ant to Apache Tika.
*/
-package org.apache.tools.tar;
+package org.apache.tika.parser.pkg.tar;
/**
* This class provides static utility methods to work with byte streams.
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=692227&r1=692226&r2=692227&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Thu Sep 4 13:33:07 2008
@@ -395,6 +395,16 @@
<glob pattern="*.zip" />
</mime-type>
+ <mime-type type="application/x-tar">
+ <magic priority="40">
+ <!-- POSIX tar archive -->
+ <match value="ustar\0" type="string" offset="257" />
+ <!-- GNU tar archive -->
+ <match value="ustar \0" type="string" offset="257" />
+ </magic>
+ <glob pattern="*.tar" />
+ </mime-type>
+
<mime-type type="application/msword">
<glob pattern="*.doc" />
<alias type="application/vnd.ms-word" />
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=692227&r1=692226&r2=692227&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Sep 4 13:33:07 2008
@@ -109,6 +109,10 @@
<mime>application/zip</mime>
</parser>
+ <parser name="parse-tar" class="org.apache.tika.parser.pkg.TarParser">
+ <mime>application/x-tar</mime>
+ </parser>
+
</parsers>
</properties>
\ No newline at end of file
Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/
------------------------------------------------------------------------------
svn:mergeinfo =
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java?rev=692227&view=auto
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
(added)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/TarParserTest.java
Thu Sep 4 13:33:07 2008
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing tar files: runs AutoDetectParser over test-documents.tar and checks the reported content type plus each entry's name and a sample of its text.
+ */
+public class TarParserTest extends TestCase {
+
+ public void testTarParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = TarParserTest.class.getResourceAsStream(
+ "/test-documents/test-documents.tar");
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("application/x-tar", metadata.get(Metadata.CONTENT_TYPE));
+ String content = handler.toString();
+ System.out.println(content);
+ assertTrue(content.contains("test-documents/testEXCEL.xls"));
+ assertTrue(content.contains("Sample Excel Worksheet"));
+ assertTrue(content.contains("test-documents/testHTML.html"));
+ assertTrue(content.contains("Test Indexation Html"));
+ assertTrue(content.contains("test-documents/testOpenOffice2.odt"));
+ assertTrue(content.contains("This is a sample Open Office document"));
+ assertTrue(content.contains("test-documents/testPDF.pdf"));
+ assertTrue(content.contains("Apache Tika"));
+ assertTrue(content.contains("test-documents/testPPT.ppt"));
+ assertTrue(content.contains("Sample Powerpoint Slide"));
+ assertTrue(content.contains("test-documents/testRTF.rtf"));
+ assertTrue(content.contains("indexation Word"));
+ assertTrue(content.contains("test-documents/testTXT.txt"));
+ assertTrue(content.contains("Test d'indexation de Txt"));
+ assertTrue(content.contains("test-documents/testWORD.doc"));
+ assertTrue(content.contains("This is a sample Microsoft Word Document"));
+ assertTrue(content.contains("test-documents/testXML.xml"));
+ assertTrue(content.contains("Rida Benjelloun"));
+ }
+
+}
Modified:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java?rev=692227&r1=692181&r2=692227&view=diff
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
(original)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
Thu Sep 4 13:33:07 2008
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.zip;
+package org.apache.tika.parser.pkg;
import java.io.InputStream;
Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar?rev=692227&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
incubator/tika/trunk/src/test/resources/test-documents/test-documents.tar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream