Author: nick
Date: Fri Jan 13 15:01:54 2012
New Revision: 1231117

URL: http://svn.apache.org/viewvc?rev=1231117&view=rev
Log:
TIKA-840 Update the OOXML parsers, so that rather than hard coding the content 
type, the file specific one is fetched and set

Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 Fri Jan 13 15:01:54 2012
@@ -64,11 +64,8 @@ public abstract class AbstractOOXMLExtra
 
     private final EmbeddedDocumentExtractor embeddedExtractor;
 
-    private final String type;
-
-    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor 
extractor, String type) {
+    public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor 
extractor) {
         this.extractor = extractor;
-        this.type = type;
 
         EmbeddedDocumentExtractor ex = 
context.get(EmbeddedDocumentExtractor.class);
 
@@ -91,7 +88,7 @@ public abstract class AbstractOOXMLExtra
      * @see 
org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
      */
     public MetadataExtractor getMetadataExtractor() {
-        return new MetadataExtractor(extractor, type);
+        return new MetadataExtractor(extractor);
     }
 
     /**

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
 Fri Jan 13 15:01:54 2012
@@ -44,16 +44,11 @@ public class MetadataExtractor {
 
     private final POIXMLTextExtractor extractor;
 
-    private final String type;
-
-    public MetadataExtractor(POIXMLTextExtractor extractor, String type) {
+    public MetadataExtractor(POIXMLTextExtractor extractor) {
         this.extractor = extractor;
-        this.type = type;
     }
 
     public void extract(Metadata metadata) throws TikaException {
-        addProperty(metadata, Metadata.CONTENT_TYPE, type);
-        
         if (extractor.getDocument() != null ||
               (extractor instanceof XSSFEventBasedExcelExtractor && 
                extractor.getPackage() != null)) {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 Fri Jan 13 15:01:54 2012
@@ -35,7 +35,10 @@ import org.apache.tika.exception.TikaExc
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pkg.ZipContainerDetector;
 import org.apache.tika.sax.EndDocumentShieldingContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.xml.sax.ContentHandler;
@@ -56,21 +59,31 @@ public class OOXMLExtractorFactory {
         
         try {
             OOXMLExtractor extractor;
+            OPCPackage pkg;
 
-            POIXMLTextExtractor poiExtractor;
+            // Open the OPCPackage for the file
             TikaInputStream tis = TikaInputStream.cast(stream);
             if (tis != null && tis.getOpenContainer() instanceof OPCPackage) {
-                poiExtractor = ExtractorFactory.createExtractor(
-                        (OPCPackage) tis.getOpenContainer());
+                pkg = (OPCPackage) tis.getOpenContainer();
             } else if (tis != null && tis.hasFile()) {
-                poiExtractor = (POIXMLTextExtractor)
-                        ExtractorFactory.createExtractor(tis.getFile());
+                pkg = OPCPackage.open( tis.getFile().getPath() );
             } else {
                 InputStream shield = new CloseShieldInputStream(stream);
-                poiExtractor = (POIXMLTextExtractor)
-                        ExtractorFactory.createExtractor(shield);
+                pkg = OPCPackage.open(shield); 
             }
+            
+            // Get the type, and ensure it's one we handle
+            MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
+            if (type != null && 
OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
+               // Not a supported type, delegate to Empty Parser 
+               EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, 
context);
+               return;
+            }
+            metadata.set(Metadata.CONTENT_TYPE, type.toString());
 
+            // Have the appropriate OOXML text extractor picked
+            POIXMLTextExtractor poiExtractor = 
ExtractorFactory.createExtractor(pkg);
+            
             POIXMLDocument document = poiExtractor.getDocument();
             if (poiExtractor instanceof XSSFEventBasedExcelExtractor) {
                extractor = new XSSFExcelExtractorDecorator(

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
 Fri Jan 13 15:01:54 2012
@@ -27,7 +27,6 @@ import org.apache.tika.exception.TikaExc
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -40,7 +39,7 @@ public class OOXMLParser extends Abstrac
     /** Serial version UID */
     private static final long serialVersionUID = 6535995710857776481L;
    
-    private static final Set<MediaType> SUPPORTED_TYPES =
+    protected static final Set<MediaType> SUPPORTED_TYPES =
         Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                 MediaType.application("x-tika-ooxml"),
                 
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"),
@@ -65,7 +64,7 @@ public class OOXMLParser extends Abstrac
      * This list is used to decline certain formats that are not yet supported
      *  by Tika and/or POI.
      */
-    private static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = 
+    protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = 
        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                 
MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"),
                 MediaType.application("vnd.ms-xpsdocument")
@@ -79,14 +78,6 @@ public class OOXMLParser extends Abstrac
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        // Is this an OOXML derived type that we can't help with?
-        String type = metadata.get(Metadata.CONTENT_TYPE);
-        if (type != null && 
UNSUPPORTED_OOXML_TYPES.contains(MediaType.parse(type))) {
-           // Not a supported type, delegate to Empty Parser 
-           EmptyParser.INSTANCE.parse(stream, handler, metadata, context);
-           return;
-        }
-
         // Have the OOXML file processed
         OOXMLExtractorFactory.parse(stream, handler, metadata, context);
     }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
 Fri Jan 13 15:01:54 2012
@@ -28,7 +28,7 @@ import org.xml.sax.SAXException;
 public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
 
     public POIXMLTextExtractorDecorator(ParseContext context, 
POIXMLTextExtractor extractor) {
-        super(context, extractor, null);
+        super(context, extractor);
     }
 
     @Override

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
 Fri Jan 13 15:01:54 2012
@@ -45,12 +45,8 @@ import org.openxmlformats.schemas.presen
 import org.xml.sax.SAXException;
 
 public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
-    // TODO Have this detected rather than hard coded
-    //private static final String TYPE = 
"application/vnd.openxmlformats-officedocument.presentationml.presentation";
-    private static final String TYPE = null;
-
     public XSLFPowerPointExtractorDecorator(ParseContext context, 
XSLFPowerPointExtractor extractor) {
-        super(context, extractor, TYPE);
+        super(context, extractor);
     }
 
     /**

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 Fri Jan 13 15:01:54 2012
@@ -66,12 +66,9 @@ public class XSSFExcelExtractorDecorator
     private final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
     private final List<Boolean> sheetProtected = new ArrayList<Boolean>();
     
-    // TODO Have this detected rather than hard coded
-    private static final String TYPE = 
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
-
     public XSSFExcelExtractorDecorator(
             ParseContext context, XSSFEventBasedExcelExtractor extractor, 
Locale locale) {
-        super(context, extractor, TYPE);
+        super(context, extractor);
 
         this.extractor = extractor;
         extractor.setFormulasNotResults(false);
@@ -350,7 +347,7 @@ public class XSSFExcelExtractorDecorator
 
     @Override
     public MetadataExtractor getMetadataExtractor() {
-        return new MetadataExtractor(extractor, TYPE) {
+        return new MetadataExtractor(extractor) {
             @Override
             public void extract(Metadata metadata) throws TikaException {
                 super.extract(metadata);

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
 Fri Jan 13 15:01:54 2012
@@ -24,7 +24,22 @@ import org.apache.poi.openxml4j.opc.Pack
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
-import org.apache.poi.xwpf.usermodel.*;
+import org.apache.poi.xwpf.usermodel.BodyType;
+import org.apache.poi.xwpf.usermodel.IBody;
+import org.apache.poi.xwpf.usermodel.IBodyElement;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
+import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.poi.xwpf.usermodel.XWPFPicture;
+import org.apache.poi.xwpf.usermodel.XWPFPictureData;
+import org.apache.poi.xwpf.usermodel.XWPFRun;
+import org.apache.poi.xwpf.usermodel.XWPFStyle;
+import org.apache.poi.xwpf.usermodel.XWPFStyles;
+import org.apache.poi.xwpf.usermodel.XWPFTable;
+import org.apache.poi.xwpf.usermodel.XWPFTableCell;
+import org.apache.poi.xwpf.usermodel.XWPFTableRow;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.WordExtractor;
 import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
@@ -40,8 +55,7 @@ public class XWPFWordExtractorDecorator 
     private XWPFStyles styles;
 
     public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor 
extractor) {
-        // TODO Have the type detected rather than hard coded
-        super(context, extractor, 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+        super(context, extractor);
         
         document = (XWPFDocument) extractor.getDocument();
         styles = document.getStyles();

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1231117&r1=1231116&r2=1231117&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Fri Jan 13 15:01:54 2012
@@ -749,10 +749,9 @@ public class OOXMLParserTest extends Tik
           input.close();
        }
 
-       // When detection / type is fixed, re-enable this
-//       assertEquals(
-//             
"application/vnd.openxmlformats-officedocument.presentationml.presentation", 
-//             metadata.get(Metadata.CONTENT_TYPE));
+       assertEquals(
+             
"application/vnd.openxmlformats-officedocument.presentationml.presentation", 
+             metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("JOUVIN ETIENNE",       metadata.get(Metadata.AUTHOR));
        assertEquals("EJ04325S",             
metadata.get(Metadata.LAST_AUTHOR));
        assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.DATE));


Reply via email to