Author: nick
Date: Tue Jun 24 16:05:38 2014
New Revision: 1605124

URL: http://svn.apache.org/r1605124
Log:
TIKA-1353 If a File is available, parse ODF documents with it, so that the 
metadata can always be processed first

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java?rev=1605124&r1=1605123&r2=1605124&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java Tue Jun 24 16:05:38 2014
@@ -20,15 +20,16 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
 import java.util.zip.ZipInputStream;
 
-//import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-//import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
@@ -84,6 +85,8 @@ public class OpenDocumentParser extends 
                 
MediaType.application("x-vnd.oasis.opendocument.image-template"),
                 
MediaType.application("x-vnd.oasis.opendocument.formula-template"))));
 
+    private static final String META_NAME = "meta.xml";
+    
     private Parser meta = new OpenDocumentMetaParser();
 
     private Parser content = new OpenDocumentContentParser();
@@ -113,11 +116,10 @@ public class OpenDocumentParser extends 
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
 
-        // TODO: reuse the already opened ZIPFile, if
-        // present
-
-        /*
-        ZipFile zipFile;
+        // Open the Zip stream
+        // Use a File if we can, and an already open zip is even better
+        ZipFile zipFile = null;
+        ZipInputStream zipStream = null;
         if (stream instanceof TikaInputStream) {
             TikaInputStream tis = (TikaInputStream) stream;
             Object container = ((TikaInputStream) stream).getOpenContainer();
@@ -126,48 +128,40 @@ public class OpenDocumentParser extends 
             } else if (tis.hasFile()) {
                 zipFile = new ZipFile(tis.getFile());                
             }
+        } else {
+            zipStream = new ZipInputStream(stream);
         }
-        */
-
-        // TODO: if incoming IS is a TIS with a file
-        // associated, we should open ZipFile so we can
-        // visit metadata, mimetype first; today we lose
-        // all the metadata if meta.xml is hit after
-        // content.xml in the stream.  Then we can still
-        // read-once for the content.xml.
 
+        // Prepare to handle the content
         XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, 
metadata);
 
         // As we don't know which of the metadata or the content
         //  we'll hit first, catch the endDocument call initially
         EndDocumentShieldingContentHandler handler = 
           new EndDocumentShieldingContentHandler(xhtml);
-
-        // Process the file in turn
-        ZipInputStream zip = new ZipInputStream(stream);
-        ZipEntry entry = zip.getNextEntry();
-        while (entry != null) {
-            if (entry.getName().equals("mimetype")) {
-                String type = IOUtils.toString(zip, "UTF-8");
-                metadata.set(Metadata.CONTENT_TYPE, type);
-            } else if (entry.getName().equals("meta.xml")) {
-                meta.parse(zip, new DefaultHandler(), metadata, context);
-            } else if (entry.getName().endsWith("content.xml")) {
-                if (content instanceof OpenDocumentContentParser) {
-                    ((OpenDocumentContentParser) content).parseInternal(zip, 
handler, metadata, context);
-                } else {
-                    // Foreign content parser was set:
-                    content.parse(zip, handler, metadata, context);
-                }
-            } else if (entry.getName().endsWith("styles.xml")) {
-                if (content instanceof OpenDocumentContentParser) {
-                    ((OpenDocumentContentParser) content).parseInternal(zip, 
handler, metadata, context);
-                } else {
-                    // Foreign content parser was set:
-                    content.parse(zip, handler, metadata, context);
+        
+        // If we can, process the metadata first, then the
+        //  rest of the file afterwards
+        // Only possible to guarantee that when opened from a file not a stream
+        ZipEntry entry = null;
+        if (zipFile != null) {
+            entry = zipFile.getEntry(META_NAME);
+            handleZipEntry(entry, zipFile.getInputStream(entry), metadata, 
context, handler);
+
+            Enumeration<? extends ZipEntry> entries = zipFile.entries();
+            while (entries.hasMoreElements()) {
+                entry = entries.nextElement();
+                if (! META_NAME.equals(entry.getName())) {
+                    handleZipEntry(entry, zipFile.getInputStream(entry), 
metadata, context, handler);
                 }
             }
-            entry = zip.getNextEntry();
+            zipFile.close();
+        } else {
+            do {
+                entry = zipStream.getNextEntry();
+                handleZipEntry(entry, zipStream, metadata, context, handler);
+            } while (entry != null);
+            zipStream.close();
         }
         
         // Only now call the end document
@@ -175,5 +169,31 @@ public class OpenDocumentParser extends 
            handler.reallyEndDocument();
         }
     }
-
+    
+    private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata 
metadata, 
+            ParseContext context, EndDocumentShieldingContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        if (entry == null) return;
+        
+        if (entry.getName().equals("mimetype")) {
+            String type = IOUtils.toString(zip, "UTF-8");
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        } else if (entry.getName().equals(META_NAME)) {
+            meta.parse(zip, new DefaultHandler(), metadata, context);
+        } else if (entry.getName().endsWith("content.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, 
handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        } else if (entry.getName().endsWith("styles.xml")) {
+            if (content instanceof OpenDocumentContentParser) {
+                ((OpenDocumentContentParser) content).parseInternal(zip, 
handler, metadata, context);
+            } else {
+                // Foreign content parser was set:
+                content.parse(zip, handler, metadata, context);
+            }
+        }
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java?rev=1605124&r1=1605123&r2=1605124&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java Tue Jun 24 16:05:38 2014
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTru
 import java.io.InputStream;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -326,4 +327,28 @@ public class ODFParserTest extends TikaT
             input.close();
         }
     }  
+    
+    @Test
+    public void testFromFile() throws Exception {
+       TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
+               "/test-documents/testODFwithOOo3.odt"));
+       assertEquals(true, tis.hasFile());
+
+       OpenDocumentParser parser = new OpenDocumentParser();
+
+       try {
+           Metadata metadata = new Metadata();
+           ContentHandler handler = new BodyContentHandler();
+           parser.parse(tis, handler, metadata, new ParseContext());
+
+           assertEquals(
+                   "application/vnd.oasis.opendocument.text",
+                   metadata.get(Metadata.CONTENT_TYPE));
+
+           String content = handler.toString();
+           assertTrue(content.contains("Tika is part of the Lucene project."));
+       } finally {
+           tis.close();
+       }
+    }
 }


Reply via email to