Author: nick
Date: Tue Jun 24 16:05:38 2014
New Revision: 1605124
URL: http://svn.apache.org/r1605124
Log:
TIKA-1353 If a File is available, parse ODF documents with it, so that the
metadata can always be processed first
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1605124&r1=1605123&r2=1605124&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
Tue Jun 24 16:05:38 2014
@@ -20,15 +20,16 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
+import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
-//import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-//import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
@@ -84,6 +85,8 @@ public class OpenDocumentParser extends
MediaType.application("x-vnd.oasis.opendocument.image-template"),
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
+ private static final String META_NAME = "meta.xml";
+
private Parser meta = new OpenDocumentMetaParser();
private Parser content = new OpenDocumentContentParser();
@@ -113,11 +116,10 @@ public class OpenDocumentParser extends
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- // TODO: reuse the already opened ZIPFile, if
- // present
-
- /*
- ZipFile zipFile;
+ // Open the Zip stream
+ // Use a File if we can, and an already open zip is even better
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
if (stream instanceof TikaInputStream) {
TikaInputStream tis = (TikaInputStream) stream;
Object container = ((TikaInputStream) stream).getOpenContainer();
@@ -126,48 +128,40 @@ public class OpenDocumentParser extends
} else if (tis.hasFile()) {
zipFile = new ZipFile(tis.getFile());
}
+ } else {
+ zipStream = new ZipInputStream(stream);
}
- */
-
- // TODO: if incoming IS is a TIS with a file
- // associated, we should open ZipFile so we can
- // visit metadata, mimetype first; today we lose
- // all the metadata if meta.xml is hit after
- // content.xml in the stream. Then we can still
- // read-once for the content.xml.
+ // Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler,
metadata);
// As we don't know which of the metadata or the content
// we'll hit first, catch the endDocument call initially
EndDocumentShieldingContentHandler handler =
new EndDocumentShieldingContentHandler(xhtml);
-
- // Process the file in turn
- ZipInputStream zip = new ZipInputStream(stream);
- ZipEntry entry = zip.getNextEntry();
- while (entry != null) {
- if (entry.getName().equals("mimetype")) {
- String type = IOUtils.toString(zip, "UTF-8");
- metadata.set(Metadata.CONTENT_TYPE, type);
- } else if (entry.getName().equals("meta.xml")) {
- meta.parse(zip, new DefaultHandler(), metadata, context);
- } else if (entry.getName().endsWith("content.xml")) {
- if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip,
handler, metadata, context);
- } else {
- // Foreign content parser was set:
- content.parse(zip, handler, metadata, context);
- }
- } else if (entry.getName().endsWith("styles.xml")) {
- if (content instanceof OpenDocumentContentParser) {
- ((OpenDocumentContentParser) content).parseInternal(zip,
handler, metadata, context);
- } else {
- // Foreign content parser was set:
- content.parse(zip, handler, metadata, context);
+
+ // If we can, process the metadata first, then the
+ // rest of the file afterwards
+ // Only possible to guarantee that when opened from a file not a stream
+ ZipEntry entry = null;
+ if (zipFile != null) {
+ entry = zipFile.getEntry(META_NAME);
+ handleZipEntry(entry, zipFile.getInputStream(entry), metadata,
context, handler);
+
+ Enumeration<? extends ZipEntry> entries = zipFile.entries();
+ while (entries.hasMoreElements()) {
+ entry = entries.nextElement();
+ if (! META_NAME.equals(entry.getName())) {
+ handleZipEntry(entry, zipFile.getInputStream(entry),
metadata, context, handler);
}
}
- entry = zip.getNextEntry();
+ zipFile.close();
+ } else {
+ do {
+ entry = zipStream.getNextEntry();
+ handleZipEntry(entry, zipStream, metadata, context, handler);
+ } while (entry != null);
+ zipStream.close();
}
// Only now call the end document
@@ -175,5 +169,31 @@ public class OpenDocumentParser extends
handler.reallyEndDocument();
}
}
-
+
+ private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata
metadata,
+ ParseContext context, EndDocumentShieldingContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ if (entry == null) return;
+
+ if (entry.getName().equals("mimetype")) {
+ String type = IOUtils.toString(zip, "UTF-8");
+ metadata.set(Metadata.CONTENT_TYPE, type);
+ } else if (entry.getName().equals(META_NAME)) {
+ meta.parse(zip, new DefaultHandler(), metadata, context);
+ } else if (entry.getName().endsWith("content.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip,
handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ } else if (entry.getName().endsWith("styles.xml")) {
+ if (content instanceof OpenDocumentContentParser) {
+ ((OpenDocumentContentParser) content).parseInternal(zip,
handler, metadata, context);
+ } else {
+ // Foreign content parser was set:
+ content.parse(zip, handler, metadata, context);
+ }
+ }
+ }
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1605124&r1=1605123&r2=1605124&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
Tue Jun 24 16:05:38 2014
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTru
import java.io.InputStream;
import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -326,4 +327,28 @@ public class ODFParserTest extends TikaT
input.close();
}
}
+
+ @Test
+ public void testFromFile() throws Exception {
+ TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
+ "/test-documents/testODFwithOOo3.odt"));
+ assertEquals(true, tis.hasFile());
+
+ OpenDocumentParser parser = new OpenDocumentParser();
+
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(tis, handler, metadata, new ParseContext());
+
+ assertEquals(
+ "application/vnd.oasis.opendocument.text",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+ assertTrue(content.contains("Tika is part of the Lucene project."));
+ } finally {
+ tis.close();
+ }
+ }
}