Author: nick Date: Fri Jan 13 15:01:54 2012 New Revision: 1231117 URL: http://svn.apache.org/viewvc?rev=1231117&view=rev Log: TIKA-840 Update the OOXML parsers, so that rather than hard coding the content type, the file specific one is fetched and set
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Fri Jan 13 15:01:54 2012 @@ -64,11 +64,8 @@ public abstract class AbstractOOXMLExtra private final EmbeddedDocumentExtractor embeddedExtractor; - private final String type; - - public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor, String type) { + public AbstractOOXMLExtractor(ParseContext context, POIXMLTextExtractor extractor) { this.extractor = extractor; - this.type = type; EmbeddedDocumentExtractor 
ex = context.get(EmbeddedDocumentExtractor.class); @@ -91,7 +88,7 @@ public abstract class AbstractOOXMLExtra * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor() */ public MetadataExtractor getMetadataExtractor() { - return new MetadataExtractor(extractor, type); + return new MetadataExtractor(extractor); } /** Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Fri Jan 13 15:01:54 2012 @@ -44,16 +44,11 @@ public class MetadataExtractor { private final POIXMLTextExtractor extractor; - private final String type; - - public MetadataExtractor(POIXMLTextExtractor extractor, String type) { + public MetadataExtractor(POIXMLTextExtractor extractor) { this.extractor = extractor; - this.type = type; } public void extract(Metadata metadata) throws TikaException { - addProperty(metadata, Metadata.CONTENT_TYPE, type); - if (extractor.getDocument() != null || (extractor instanceof XSSFEventBasedExcelExtractor && extractor.getPackage() != null)) { Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java 
(original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java Fri Jan 13 15:01:54 2012 @@ -35,7 +35,10 @@ import org.apache.tika.exception.TikaExc import org.apache.tika.io.CloseShieldInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.pkg.ZipContainerDetector; import org.apache.tika.sax.EndDocumentShieldingContentHandler; import org.apache.xmlbeans.XmlException; import org.xml.sax.ContentHandler; @@ -56,21 +59,31 @@ public class OOXMLExtractorFactory { try { OOXMLExtractor extractor; + OPCPackage pkg; - POIXMLTextExtractor poiExtractor; + // Open the OPCPackage for the file TikaInputStream tis = TikaInputStream.cast(stream); if (tis != null && tis.getOpenContainer() instanceof OPCPackage) { - poiExtractor = ExtractorFactory.createExtractor( - (OPCPackage) tis.getOpenContainer()); + pkg = (OPCPackage) tis.getOpenContainer(); } else if (tis != null && tis.hasFile()) { - poiExtractor = (POIXMLTextExtractor) - ExtractorFactory.createExtractor(tis.getFile()); + pkg = OPCPackage.open( tis.getFile().getPath() ); } else { InputStream shield = new CloseShieldInputStream(stream); - poiExtractor = (POIXMLTextExtractor) - ExtractorFactory.createExtractor(shield); + pkg = OPCPackage.open(shield); } + + // Get the type, and ensure it's one we handle + MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg); + if (type != null && OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { + // Not a supported type, delegate to Empty Parser + EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); + return; + } + metadata.set(Metadata.CONTENT_TYPE, type.toString()); + // Have the appropriate OOXML text extractor picked + POIXMLTextExtractor poiExtractor = ExtractorFactory.createExtractor(pkg); + POIXMLDocument 
document = poiExtractor.getDocument(); if (poiExtractor instanceof XSSFEventBasedExcelExtractor) { extractor = new XSSFExcelExtractorDecorator( Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java Fri Jan 13 15:01:54 2012 @@ -27,7 +27,6 @@ import org.apache.tika.exception.TikaExc import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -40,7 +39,7 @@ public class OOXMLParser extends Abstrac /** Serial version UID */ private static final long serialVersionUID = 6535995710857776481L; - private static final Set<MediaType> SUPPORTED_TYPES = + protected static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( MediaType.application("x-tika-ooxml"), MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"), @@ -65,7 +64,7 @@ public class OOXMLParser extends Abstrac * This list is used to decline certain formats that are not yet supported * by Tika and/or POI. 
*/ - private static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = + protected static final Set<MediaType> UNSUPPORTED_OOXML_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"), MediaType.application("vnd.ms-xpsdocument") @@ -79,14 +78,6 @@ public class OOXMLParser extends Abstrac InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - // Is this an OOXML derived type that we can't help with? - String type = metadata.get(Metadata.CONTENT_TYPE); - if (type != null && UNSUPPORTED_OOXML_TYPES.contains(MediaType.parse(type))) { - // Not a supported type, delegate to Empty Parser - EmptyParser.INSTANCE.parse(stream, handler, metadata, context); - return; - } - // Have the OOXML file processed OOXMLExtractorFactory.parse(stream, handler, metadata, context); } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java Fri Jan 13 15:01:54 2012 @@ -28,7 +28,7 @@ import org.xml.sax.SAXException; public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor { public POIXMLTextExtractorDecorator(ParseContext context, POIXMLTextExtractor extractor) { - super(context, extractor, null); + super(context, extractor); } @Override Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java Fri Jan 13 15:01:54 2012 @@ -45,12 +45,8 @@ import org.openxmlformats.schemas.presen import org.xml.sax.SAXException; public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { - // TODO Have this detected rather than hard coded - //private static final String TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; - private static final String TYPE = null; - public XSLFPowerPointExtractorDecorator(ParseContext context, XSLFPowerPointExtractor extractor) { - super(context, extractor, TYPE); + super(context, extractor); } /** Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java Fri Jan 13 15:01:54 2012 @@ -66,12 +66,9 @@ public class XSSFExcelExtractorDecorator private final List<PackagePart> sheetParts = new 
ArrayList<PackagePart>(); private final List<Boolean> sheetProtected = new ArrayList<Boolean>(); - // TODO Have this detected rather than hard coded - private static final String TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; - public XSSFExcelExtractorDecorator( ParseContext context, XSSFEventBasedExcelExtractor extractor, Locale locale) { - super(context, extractor, TYPE); + super(context, extractor); this.extractor = extractor; extractor.setFormulasNotResults(false); @@ -350,7 +347,7 @@ public class XSSFExcelExtractorDecorator @Override public MetadataExtractor getMetadataExtractor() { - return new MetadataExtractor(extractor, TYPE) { + return new MetadataExtractor(extractor) { @Override public void extract(Metadata metadata) throws TikaException { super.extract(metadata); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri Jan 13 15:01:54 2012 @@ -24,7 +24,22 @@ import org.apache.poi.openxml4j.opc.Pack import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.poi.xwpf.model.XWPFCommentsDecorator; import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy; -import org.apache.poi.xwpf.usermodel.*; +import org.apache.poi.xwpf.usermodel.BodyType; +import org.apache.poi.xwpf.usermodel.IBody; +import org.apache.poi.xwpf.usermodel.IBodyElement; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter; +import 
org.apache.poi.xwpf.usermodel.XWPFHyperlink; +import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.apache.poi.xwpf.usermodel.XWPFPicture; +import org.apache.poi.xwpf.usermodel.XWPFPictureData; +import org.apache.poi.xwpf.usermodel.XWPFRun; +import org.apache.poi.xwpf.usermodel.XWPFStyle; +import org.apache.poi.xwpf.usermodel.XWPFStyles; +import org.apache.poi.xwpf.usermodel.XWPFTable; +import org.apache.poi.xwpf.usermodel.XWPFTableCell; +import org.apache.poi.xwpf.usermodel.XWPFTableRow; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.WordExtractor; import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle; @@ -40,8 +55,7 @@ public class XWPFWordExtractorDecorator private XWPFStyles styles; public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) { - // TODO Have the type detected rather than hard coded - super(context, extractor, "application/vnd.openxmlformats-officedocument.wordprocessingml.document"); + super(context, extractor); document = (XWPFDocument) extractor.getDocument(); styles = document.getStyles(); Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1231117&r1=1231116&r2=1231117&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Jan 13 15:01:54 2012 @@ -749,10 +749,9 @@ public class OOXMLParserTest extends Tik input.close(); } - // When detection / type is fixed, re-enable this -// assertEquals( -// 
"application/vnd.openxmlformats-officedocument.presentationml.presentation", -// metadata.get(Metadata.CONTENT_TYPE)); + assertEquals( + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + metadata.get(Metadata.CONTENT_TYPE)); assertEquals("JOUVIN ETIENNE", metadata.get(Metadata.AUTHOR)); assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR)); assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.DATE));