Author: nick
Date: Thu Jun 27 12:40:59 2013
New Revision: 1497332

URL: http://svn.apache.org/r1497332
Log:
Patch from Daniel Bonniot from TIKA-1109 - Fetch OOXML metadata earlier, to 
tidy code and make it available if required during parsing

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1497332&r1=1497331&r2=1497332&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Thu Jun 27 12:40:59 2013
@@ -40,7 +40,6 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.pkg.ZipContainerDetector;
-import org.apache.tika.sax.EndDocumentShieldingContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -105,17 +104,12 @@ public class OOXMLExtractorFactory {
                 extractor = new POIXMLTextExtractorDecorator(context, poiExtractor);
             }
             
-            // We need to get the content first, but not end 
-            //  the document just yet
-            EndDocumentShieldingContentHandler handler = 
-               new EndDocumentShieldingContentHandler(baseHandler);
-            extractor.getXHTML(handler, metadata, context);
-
-            // Now we can get the metadata
+            // Get the bulk of the metadata first, so that it's accessible during
+            //  parsing if desired by the client (see TIKA-1109)
             extractor.getMetadataExtractor().extract(metadata);
             
-            // Then finish up
-            handler.reallyEndDocument();
+            // Extract the text, along with any in-document metadata
+            extractor.getXHTML(baseHandler, metadata, context);
         } catch (IllegalArgumentException e) {
             if (e.getMessage().startsWith("No supported documents found")) {
                 throw new TikaException(

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1497332&r1=1497331&r2=1497332&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Thu Jun 27 12:40:59 2013
@@ -64,8 +64,8 @@ public class XSSFExcelExtractorDecorator
     private final XSSFEventBasedExcelExtractor extractor;
     private final DataFormatter formatter;
     private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
-    private final List<Boolean> sheetProtected = new ArrayList<Boolean>();
-    
+    private Metadata metadata;
+
     public XSSFExcelExtractorDecorator(
             ParseContext context, XSSFEventBasedExcelExtractor extractor, Locale locale) {
         super(context, extractor);
@@ -81,6 +81,17 @@ public class XSSFExcelExtractorDecorator
         }
     }
 
+    @Override
+    public void getXHTML(
+            ContentHandler handler, Metadata metadata, ParseContext context)
+            throws SAXException, XmlException, IOException, TikaException {
+
+       this.metadata = metadata;
+       metadata.set(TikaMetadataKeys.PROTECTED, "false");
+
+       super.getXHTML(handler, metadata, context);
+    }
+
     /**
      * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
      */
@@ -164,7 +175,9 @@ public class XSSFExcelExtractorDecorator
          sheetParser.parse(sheetSource);
          sheetInputStream.close();
          
-         sheetProtected.add(handler.hasProtection);
+         if (handler.hasProtection) {
+            metadata.set(TikaMetadataKeys.PROTECTED, "true");
+        }
       } catch(ParserConfigurationException e) {
          throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage());
       }
@@ -344,21 +357,4 @@ public class XSSFExcelExtractorDecorator
 
        return parts;
     }
-
-    @Override
-    public MetadataExtractor getMetadataExtractor() {
-        return new MetadataExtractor(extractor) {
-            @Override
-            public void extract(Metadata metadata) throws TikaException {
-                super.extract(metadata);
-
-                metadata.set(TikaMetadataKeys.PROTECTED, "false");
-                for(boolean prot : sheetProtected) {
-                   if(prot) {
-                      metadata.set(TikaMetadataKeys.PROTECTED, "true");
-                   }
-                }
-            }
-        };
-    }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1497332&r1=1497331&r2=1497332&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Jun 27 12:40:59 2013
@@ -227,6 +227,62 @@ public class OOXMLParserTest extends Tik
     }
     
     /**
+     * Test that the metadata is already extracted when the body is processed.
+     * See TIKA-1109
+     */
+    public void testPowerPointMetadataEarly() throws Exception {
+       String[] extensions = new String[] {
+             "pptx", "pptm", "ppsm", "ppsx", "potm"
+             //"thmx", // TIKA-418: Will be supported in POI 3.7 beta 2 
+             //"xps" // TIKA-418: Not yet supported by POI
+       };
+
+       final String[] mimeTypes = new String[] {
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                "application/vnd.ms-powerpoint.presentation.macroenabled.12",
+                "application/vnd.ms-powerpoint.slideshow.macroenabled.12",
+                "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+                "application/vnd.ms-powerpoint.template.macroenabled.12"
+        };
+
+        for (int i=0; i<extensions.length; i++) {
+            String extension = extensions[i];
+            final String filename = "testPPT." + extension;
+
+            Parser parser = new AutoDetectParser();
+            final Metadata metadata = new Metadata();
+            // TODO: should auto-detect without the resource name
+            metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+
+           // Allow the value to be accessed from the inner class
+           final int currentI = i;
+            ContentHandler handler = new BodyContentHandler()
+               {
+                   public void startDocument ()
+                   {
+                       assertEquals(
+                                    "Mime-type checking for " + filename,
+                                    mimeTypes[currentI],
+                                    metadata.get(Metadata.CONTENT_TYPE));
+                       assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
+                       assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
+                       assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
+
+                   }
+
+               };
+            ParseContext context = new ParseContext();
+    
+            InputStream input = getTestDocument(filename);
+            try {
+                parser.parse(input, handler, metadata, context);
+            } finally {
+                input.close();
+            }
+        }
+    }
+    
+    /**
      * For the PowerPoint formats we don't currently support, ensure that
      *  we don't break either
      */


Reply via email to