Author: jukka
Date: Fri Feb 13 23:46:55 2009
New Revision: 744290
URL: http://svn.apache.org/viewvc?rev=744290&view=rev
Log:
TIKA-152: Support for Office XML files
Patch by Guillermo Arribas.
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ooxml/
lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
lucene/tika/trunk/src/test/resources/test-documents/testEXCEL-formats.xlsx
(with props)
lucene/tika/trunk/src/test/resources/test-documents/testEXCEL.xlsx (with
props)
lucene/tika/trunk/src/test/resources/test-documents/testPPT.pptx (with
props)
lucene/tika/trunk/src/test/resources/test-documents/testWORD.docx (with
props)
Modified:
lucene/tika/trunk/CHANGES.txt
lucene/tika/trunk/pom.xml
lucene/tika/trunk/src/main/java/org/apache/tika/metadata/MSOffice.java
lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
lucene/tika/trunk/src/main/resources/tika-config.xml
Modified: lucene/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=744290&r1=744289&r2=744290&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Fri Feb 13 23:46:55 2009
@@ -6,6 +6,9 @@
The most notable changes in Tika 0.3 over the previous release are:
+ * Tika now supports the Office Open XML format used by
+ Microsoft Office 2007. (TIKA-152)
+
* Automatic detection of document types in Tika has been improved.
For example Tika can now detect plain text just by looking at the first
few bytes of the document. (TIKA-154)
Modified: lucene/tika/trunk/pom.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/pom.xml?rev=744290&r1=744289&r2=744290&view=diff
==============================================================================
--- lucene/tika/trunk/pom.xml (original)
+++ lucene/tika/trunk/pom.xml Fri Feb 13 23:46:55 2009
@@ -211,6 +211,11 @@
<version>3.5-beta4</version>
</dependency>
<dependency>
+ <groupId>org.apache.poi</groupId>
+ <artifactId>poi-ooxml</artifactId>
+ <version>3.5-beta4</version>
+ </dependency>
+ <dependency>
<groupId>net.sourceforge.nekohtml</groupId>
<artifactId>nekohtml</artifactId>
<version>1.9.9</version>
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/metadata/MSOffice.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/metadata/MSOffice.java?rev=744290&r1=744289&r2=744290&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/metadata/MSOffice.java
(original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/metadata/MSOffice.java Fri
Feb 13 23:46:55 2009
@@ -46,5 +46,29 @@
public static final String TEMPLATE = "Template";
public static final String AUTHOR = "Author";
+
+ public static final String TOTAL_TIME = "Total-Time";
+
+ public static final String SLIDE_COUNT = "Slide-Count";
+
+ public static final String PRESENTATION_FORMAT = "Presentation-Format";
+
+ public static final String PARAGRAPH_COUNT = "Paragraph-Count";
+
+ public static final String NOTES = "Notes";
+
+ public static final String MANAGER = "Manager";
+
+ public static final String LINE_COUNT = "Line-Count";
+
+ public static final String CHARACTER_COUNT_WITH_SPACES =
"Character-Count-With-Spaces";
+
+ public static final String APPLICATION_VERSION = "Application-Version";
+
+ public static final String VERSION = "version";
+
+ public static final String CONTENT_STATUS = "Content-Status";
+
+ public static final String CATEGORY = "category";
}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Common superclass of the Tika OOXML extractors.
+ *
+ * A Tika extractor wraps a POI text extractor and reports the parsed
+ * document content as a sequence of XHTML SAX events. Concrete subclasses
+ * provide the document body by implementing
+ * {@link #buildXHTML(XHTMLContentHandler)}, which fills in the
+ * {@link XHTMLContentHandler} passed to it.
+ */
+public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
+
+    /** The decorated POI extractor; subclasses reach the document through it. */
+    protected POIXMLTextExtractor extractor;
+
+    public AbstractOOXMLExtractor(POIXMLTextExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    /**
+     * Returns the document held by the decorated POI extractor.
+     *
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getDocument()
+     */
+    public POIXMLDocument getDocument() {
+        final POIXMLDocument document = extractor.getDocument();
+        return document;
+    }
+
+    /**
+     * Creates a new {@link MetadataExtractor} over the decorated POI
+     * extractor on every call.
+     *
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getMetadataExtractor()
+     */
+    public MetadataExtractor getMetadataExtractor() {
+        final MetadataExtractor metadataExtractor =
+                new MetadataExtractor(extractor);
+        return metadataExtractor;
+    }
+
+    /**
+     * Wraps the given handler in an {@link XHTMLContentHandler}, starts the
+     * document, lets the subclass emit the body through
+     * {@link #buildXHTML(XHTMLContentHandler)}, ends the document and
+     * returns the wrapping handler.
+     *
+     * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
+     *      org.apache.tika.metadata.Metadata)
+     */
+    public XHTMLContentHandler getXHTML(ContentHandler handler,
+            Metadata metadata) throws SAXException, XmlException, IOException {
+        final XHTMLContentHandler document =
+                new XHTMLContentHandler(handler, metadata);
+
+        document.startDocument();
+        buildXHTML(document);
+        document.endDocument();
+
+        return document;
+    }
+
+    /**
+     * Emits the document body into the {@link XHTMLContentHandler} received
+     * as parameter. Called between the start and the end of the document.
+     */
+    protected abstract void buildXHTML(XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException;
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.POIXMLProperties.CoreProperties;
+import org.apache.poi.POIXMLProperties.ExtendedProperties;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.internal.PackagePropertiesPart;
+import org.openxml4j.util.Nullable;
+import
org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+
+/**
+ * OOXML metadata extractor.
+ *
+ * Copies the core and extended document properties exposed by a POI
+ * extractor into a Tika {@link Metadata} object. Needed because POI does
+ * not currently support metadata extraction for OOXML.
+ *
+ * @see OOXMLExtractor#getMetadataExtractor()
+ */
+public class MetadataExtractor {
+
+    /** Message shared by every failure wrapped in {@link #extract(Metadata)}. */
+    private static final String EXTRACTION_ERROR =
+            "Error extracting OOXML metadata";
+
+    private POIXMLTextExtractor extractor;
+
+    public MetadataExtractor(POIXMLTextExtractor extractor) {
+        this.extractor = extractor;
+    }
+
+    /**
+     * Copies first the core and then the extended properties into the given
+     * metadata object.
+     *
+     * @param metadata receives the extracted values
+     * @throws TikaException if reading either property set fails; the
+     *         original exception is kept as the cause
+     */
+    public void extract(Metadata metadata) throws TikaException {
+        try {
+            copyCoreProperties(extractor.getCoreProperties(), metadata);
+            copyExtendedProperties(extractor.getExtendedProperties(), metadata);
+        } catch (IOException cause) {
+            throw new TikaException(EXTRACTION_ERROR, cause);
+        } catch (OpenXML4JException cause) {
+            throw new TikaException(EXTRACTION_ERROR, cause);
+        } catch (XmlException cause) {
+            throw new TikaException(EXTRACTION_ERROR, cause);
+        }
+    }
+
+    /**
+     * Copies the core (package) properties. The creator property is stored
+     * under both the CREATOR and the AUTHOR key.
+     */
+    private void copyCoreProperties(CoreProperties properties,
+            Metadata metadata) {
+        PackagePropertiesPart core = properties.getUnderlyingProperties();
+
+        addProperty(metadata, Metadata.CATEGORY, core.getCategoryProperty());
+        addProperty(metadata, Metadata.CONTENT_STATUS,
+                core.getContentStatusProperty());
+        addProperty(metadata, Metadata.CONTENT_TYPE, core.getContentType());
+        addProperty(metadata, Metadata.DATE, core.getCreatedPropertyString());
+        addProperty(metadata, Metadata.CREATOR, core.getCreatorProperty());
+        addProperty(metadata, Metadata.AUTHOR, core.getCreatorProperty());
+        addProperty(metadata, Metadata.DESCRIPTION,
+                core.getDescriptionProperty());
+        addProperty(metadata, Metadata.IDENTIFIER,
+                core.getIdentifierProperty());
+        addProperty(metadata, Metadata.KEYWORDS, core.getKeywordsProperty());
+        addProperty(metadata, Metadata.LANGUAGE, core.getLanguageProperty());
+        addProperty(metadata, Metadata.LAST_AUTHOR,
+                core.getLastModifiedByProperty());
+        addProperty(metadata, Metadata.LAST_PRINTED,
+                core.getLastPrintedPropertyString());
+        addProperty(metadata, Metadata.LAST_MODIFIED,
+                core.getModifiedPropertyString());
+        addProperty(metadata, Metadata.REVISION_NUMBER,
+                core.getRevisionProperty());
+        addProperty(metadata, Metadata.SUBJECT, core.getSubjectProperty());
+        addProperty(metadata, Metadata.TITLE, core.getTitleProperty());
+        addProperty(metadata, Metadata.VERSION, core.getVersionProperty());
+    }
+
+    /**
+     * Copies the extended (application) properties. The company property is
+     * stored under the PUBLISHER key.
+     */
+    private void copyExtendedProperties(ExtendedProperties properties,
+            Metadata metadata) {
+        CTProperties extended = properties.getUnderlyingProperties();
+
+        addProperty(metadata, Metadata.APPLICATION_NAME,
+                extended.getApplication());
+        addProperty(metadata, Metadata.APPLICATION_VERSION,
+                extended.getAppVersion());
+        addProperty(metadata, Metadata.CHARACTER_COUNT,
+                extended.getCharacters());
+        addProperty(metadata, Metadata.CHARACTER_COUNT_WITH_SPACES,
+                extended.getCharactersWithSpaces());
+        addProperty(metadata, Metadata.PUBLISHER, extended.getCompany());
+        addProperty(metadata, Metadata.LINE_COUNT, extended.getLines());
+        addProperty(metadata, Metadata.MANAGER, extended.getManager());
+        addProperty(metadata, Metadata.NOTES, extended.getNotes());
+        addProperty(metadata, Metadata.PAGE_COUNT, extended.getPages());
+        addProperty(metadata, Metadata.PARAGRAPH_COUNT,
+                extended.getParagraphs());
+        addProperty(metadata, Metadata.PRESENTATION_FORMAT,
+                extended.getPresentationFormat());
+        addProperty(metadata, Metadata.SLIDE_COUNT, extended.getSlides());
+        addProperty(metadata, Metadata.TEMPLATE, extended.getTemplate());
+        addProperty(metadata, Metadata.TOTAL_TIME, extended.getTotalTime());
+        addProperty(metadata, Metadata.WORD_COUNT, extended.getWords());
+    }
+
+    /** Stores the string form of the wrapped value, unless it is null. */
+    private void addProperty(Metadata metadata, String name,
+            Nullable<?> value) {
+        Object wrapped = value.getValue();
+        if (wrapped == null) {
+            return;
+        }
+        addProperty(metadata, name, wrapped.toString());
+    }
+
+    /** Stores the value, unless it is null. */
+    private void addProperty(Metadata metadata, String name, String value) {
+        if (value == null) {
+            return;
+        }
+        metadata.set(name, value);
+    }
+
+    /** Stores the value as a decimal string; zero and negatives are skipped. */
+    private void addProperty(Metadata metadata, String name, long value) {
+        if (value <= 0) {
+            return;
+        }
+        metadata.set(name, Long.toString(value));
+    }
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractor.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Interface implemented by all Tika OOXML extractors.
+ *
+ * An implementation gives access to the opened document, to a
+ * {@link MetadataExtractor} for it, and to its content as XHTML SAX events.
+ *
+ * @see POIXMLTextExtractor
+ */
+public interface OOXMLExtractor {
+
+ /**
+ * Returns the opened document.
+ *
+ * @see POIXMLTextExtractor#getDocument()
+ */
+ POIXMLDocument getDocument();
+
+ /**
+ * Returns the Tika metadata extractor for the document. It exists because
+ * {@link POIXMLTextExtractor#getMetadataTextExtractor()} is not yet
+ * supported for OOXML by POI.
+ */
+ MetadataExtractor getMetadataExtractor();
+
+ /**
+ * Returns to clients a {@link XHTMLContentHandler} object representing the
+ * parsed content of a document as XHTML SAX events.
+ *
+ * @param handler the SAX handler that receives the events
+ * @param metadata the document metadata handed to the XHTML handler
+ */
+ XHTMLContentHandler getXHTML(ContentHandler handler, Metadata metadata)
+ throws SAXException, XmlException, IOException;
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+
+/**
+ * Picks the {@link OOXMLExtractor} that matches the supplied document and
+ * returns it.
+ */
+public class OOXMLExtractorFactory {
+
+    /**
+     * Chooses a decorator from the type of the document held by the POI
+     * extractor: slide shows, workbooks and word documents each get their
+     * own decorator, and any other document gets the generic
+     * {@link POIXMLTextExtractorDecorator}.
+     *
+     * @param extractor the POI extractor to decorate
+     * @return the Tika extractor wrapping the POI extractor
+     */
+    public static OOXMLExtractor createExtractor(
+            POIXMLTextExtractor extractor) {
+        final POIXMLDocument opened = extractor.getDocument();
+
+        if (opened instanceof XSLFSlideShow) {
+            XSLFPowerPointExtractor powerPoint =
+                    (XSLFPowerPointExtractor) extractor;
+            return new XSLFPowerPointExtractorDecorator(powerPoint);
+        }
+        if (opened instanceof XSSFWorkbook) {
+            XSSFExcelExtractor excel = (XSSFExcelExtractor) extractor;
+            return new XSSFExcelExtractorDecorator(excel);
+        }
+        if (opened instanceof XWPFDocument) {
+            XWPFWordExtractor word = (XWPFWordExtractor) extractor;
+            return new XWPFWordExtractorDecorator(word);
+        }
+
+        // No structured decorator for this document type
+        return new POIXMLTextExtractorDecorator(extractor);
+    }
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParser.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.extractor.ExtractorFactory;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.InvalidFormatException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Office Open XML (OOXML) parser.
+ *
+ * Creates a POI extractor for the stream, wraps it in the matching
+ * {@link OOXMLExtractor}, and reports the document metadata and its content
+ * as XHTML SAX events.
+ */
+public class OOXMLParser implements Parser {
+
+    /**
+     * Parses an OOXML document.
+     *
+     * @param stream the document to parse
+     * @param handler receives the XHTML SAX events
+     * @param metadata receives the document properties
+     * @throws TikaException if no OOXML extractor can be created for the
+     *         stream or the document cannot be processed
+     * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+     *      org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata)
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+
+        try {
+            // Held as Object so that a result of another extractor type is
+            // reported as a TikaException rather than a ClassCastException.
+            Object poiExtractor = ExtractorFactory.createExtractor(stream);
+            if (!(poiExtractor instanceof POIXMLTextExtractor)) {
+                String type = (poiExtractor == null)
+                        ? "null" : poiExtractor.getClass().getName();
+                throw new TikaException(
+                        "Error creating OOXML extractor: unsupported extractor type "
+                        + type);
+            }
+
+            OOXMLExtractor extractor = OOXMLExtractorFactory
+                    .createExtractor((POIXMLTextExtractor) poiExtractor);
+
+            // Fill in the metadata first: getXHTML hands this metadata
+            // object to the XHTML handler and starts and ends the document,
+            // so values added afterwards would arrive too late for it.
+            extractor.getMetadataExtractor().extract(metadata);
+            extractor.getXHTML(handler, metadata);
+
+        } catch (InvalidFormatException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (OpenXML4JException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        } catch (XmlException e) {
+            throw new TikaException("Error creating OOXML extractor", e);
+        }
+    }
+
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/POIXMLTextExtractorDecorator.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Generic decorator that reports the whole text of the document as one
+ * unstructured paragraph.
+ */
+public class POIXMLTextExtractorDecorator extends AbstractOOXMLExtractor {
+
+    public POIXMLTextExtractorDecorator(POIXMLTextExtractor extractor) {
+        super(extractor);
+    }
+
+    /**
+     * Emits the complete text returned by the POI extractor inside a single
+     * "p" element; no document structure is kept.
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException {
+        final String wholeText = extractor.getText();
+        xhtml.element("p", wholeText);
+    }
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFPowerPointExtractorDecorator.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+
+import org.apache.poi.xslf.XSLFSlideShow;
+import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
+import org.apache.poi.xslf.usermodel.XMLSlideShow;
+import org.apache.poi.xslf.usermodel.XSLFSlide;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTComment;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTCommentList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.xml.sax.SAXException;
+
+/**
+ * Decorator that reports a PowerPoint slide show as one "div" per slide.
+ */
+public class XSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor {
+
+    public XSLFPowerPointExtractorDecorator(
+            XSLFPowerPointExtractor extractor) {
+        super(extractor);
+    }
+
+    /**
+     * Emits, for every slide, a "div" holding the text of the slide shapes,
+     * then the slide comments (if any), then the text of the notes shapes
+     * (if any).
+     *
+     * @see org.apache.poi.xslf.extractor.XSLFPowerPointExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        XSLFSlideShow rawShow = (XSLFSlideShow) extractor.getDocument();
+        XMLSlideShow show = new XMLSlideShow(rawShow);
+
+        XSLFSlide[] allSlides = show.getSlides();
+        for (int index = 0; index < allSlides.length; index++) {
+            XSLFSlide current = allSlides[index];
+            CTSlide slideXml = current._getCTSlide();
+            CTSlideIdListEntry id = current._getCTSlideId();
+
+            // Notes and comments are looked up before the div is opened
+            CTNotesSlide slideNotes = show._getXSLFSlideShow().getNotes(id);
+            CTCommentList slideComments = show._getXSLFSlideShow()
+                    .getSlideComments(id);
+
+            xhtml.startElement("div");
+
+            CTGroupShape slideShapes = slideXml.getCSld().getSpTree();
+            extractShapeContent(slideShapes, xhtml);
+
+            if (slideComments != null) {
+                CTComment[] entries = slideComments.getCmArray();
+                for (CTComment entry : entries) {
+                    xhtml.element("p", entry.getText());
+                }
+            }
+
+            if (slideNotes != null) {
+                CTGroupShape noteShapes = slideNotes.getCSld().getSpTree();
+                extractShapeContent(noteShapes, xhtml);
+            }
+
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Emits one "p" element per text run found in the shapes of the group;
+     * shapes without a text body are skipped.
+     */
+    private void extractShapeContent(CTGroupShape gs,
+            XHTMLContentHandler xhtml) throws SAXException {
+        for (CTShape candidate : gs.getSpArray()) {
+            CTTextBody body = candidate.getTxBody();
+            if (body == null) {
+                continue;
+            }
+            for (CTTextParagraph paragraph : body.getPArray()) {
+                for (CTRegularTextRun run : paragraph.getRArray()) {
+                    xhtml.element("p", run.getT());
+                }
+            }
+        }
+    }
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.Comment;
+import org.apache.poi.ss.usermodel.HeaderFooter;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
+import org.apache.poi.xssf.usermodel.XSSFCell;
+import org.apache.poi.xssf.usermodel.XSSFSheet;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.xml.sax.SAXException;
+
+/**
+ * Decorator that reports an Excel workbook as one "div" per sheet, with the
+ * rows and cells of the sheet in a table.
+ */
+public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
+
+    public XSSFExcelExtractorDecorator(XSSFExcelExtractor extractor) {
+        super(extractor);
+    }
+
+    /**
+     * Emits every sheet of the workbook in sheet order.
+     *
+     * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        XSSFWorkbook workbook = (XSSFWorkbook) extractor.getDocument();
+
+        int sheetIndex = 0;
+        while (sheetIndex < workbook.getNumberOfSheets()) {
+            writeSheet(workbook, sheetIndex, xhtml);
+            sheetIndex++;
+        }
+    }
+
+    /**
+     * Emits one sheet as a "div": the sheet name as "h1", the headers, a
+     * table with one "tr" per row and one "td" per cell, and the footers.
+     */
+    private void writeSheet(XSSFWorkbook workbook, int sheetIndex,
+            XHTMLContentHandler xhtml) throws SAXException, XmlException,
+            IOException {
+        xhtml.startElement("div");
+        XSSFSheet sheet = (XSSFSheet) workbook.getSheetAt(sheetIndex);
+        xhtml.element("h1", workbook.getSheetName(sheetIndex));
+
+        // Header(s) come before the table
+        extractHeaderFooter(sheet.getFirstHeader(), xhtml);
+        extractHeaderFooter(sheet.getOddHeader(), xhtml);
+        extractHeaderFooter(sheet.getEvenHeader(), xhtml);
+
+        xhtml.startElement("table");
+        xhtml.startElement("tbody");
+
+        for (Object rawRow : sheet) {
+            xhtml.startElement("tr");
+            Row currentRow = (Row) rawRow;
+            Iterator<Cell> cells = currentRow.cellIterator();
+            while (cells.hasNext()) {
+                xhtml.startElement("td");
+                writeCell(cells.next(), xhtml);
+                xhtml.endElement("td");
+            }
+            xhtml.endElement("tr");
+        }
+
+        xhtml.endElement("tbody");
+        xhtml.endElement("table");
+
+        // Footer(s) come after the table
+        extractHeaderFooter(sheet.getFirstFooter(), xhtml);
+        extractHeaderFooter(sheet.getOddFooter(), xhtml);
+        extractHeaderFooter(sheet.getEvenFooter(), xhtml);
+
+        xhtml.endElement("div");
+    }
+
+    /**
+     * Emits the text of one cell followed by the text of its comment, if it
+     * has one. Formula and string cells contribute their rich string value;
+     * every other cell contributes its raw value when that is not null.
+     */
+    private void writeCell(Cell cell, XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        boolean textual = cell.getCellType() == Cell.CELL_TYPE_FORMULA
+                || cell.getCellType() == Cell.CELL_TYPE_STRING;
+
+        if (textual) {
+            xhtml.characters(cell.getRichStringCellValue().getString());
+        } else {
+            String raw = ((XSSFCell) cell).getRawValue();
+            if (raw != null) {
+                xhtml.characters(raw);
+            }
+        }
+
+        // The comment goes into the same cell as the content
+        Comment note = cell.getCellComment();
+        if (note != null) {
+            xhtml.characters(note.getString().getString());
+        }
+    }
+
+    /** Emits the header or footer text as a "p" element unless it is empty. */
+    private void extractHeaderFooter(HeaderFooter hf,
+            XHTMLContentHandler xhtml) throws SAXException {
+        String text = ExcelExtractor._extractHeaderFooter(hf);
+        if (text.length() == 0) {
+            return;
+        }
+        xhtml.element("p", text);
+    }
+}
Added:
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
(added)
+++
lucene/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
+import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
+import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
+import org.apache.poi.xwpf.model.XWPFHyperlinkDecorator;
+import org.apache.poi.xwpf.model.XWPFParagraphDecorator;
+import org.apache.poi.xwpf.usermodel.XWPFDocument;
+import org.apache.poi.xwpf.usermodel.XWPFParagraph;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xmlbeans.XmlException;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTRow;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTc;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+import org.xml.sax.SAXException;
+
+/**
+ * Produces XHTML for a Word (.docx) document wrapped by an
+ * {@link XWPFWordExtractor}. Output order is: headers, body paragraphs,
+ * document tables, footers.
+ */
+public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
+
+    public XWPFWordExtractorDecorator(XWPFWordExtractor extractor) {
+        super(extractor);
+    }
+
+    /**
+     * Writes headers, paragraphs, tables and footers of the document to
+     * the given handler. Headers and footers are skipped when the document
+     * has no header/footer policy.
+     *
+     * @param xhtml handler that receives the generated XHTML events
+     * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
+     */
+    @Override
+    protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
+            XmlException, IOException {
+        XWPFDocument document = (XWPFDocument) extractor.getDocument();
+        // May be null (POI's own XWPFWordExtractor guards against this too)
+        XWPFHeaderFooterPolicy hfPolicy = document.getHeaderFooterPolicy();
+
+        // headers
+        if (hfPolicy != null) {
+            extractHeaders(hfPolicy, xhtml);
+        }
+
+        // first all paragraphs, including comment and hyperlink text
+        Iterator<XWPFParagraph> i = document.getParagraphsIterator();
+        while (i.hasNext()) {
+            XWPFParagraphDecorator decorator = new XWPFCommentsDecorator(
+                    new XWPFHyperlinkDecorator(i.next(), null, true));
+            xhtml.element("p", decorator.getText());
+        }
+
+        // then all document tables
+        extractTableContent(document.getDocument().getBody().getTblArray(),
+                xhtml);
+
+        // footers
+        if (hfPolicy != null) {
+            extractFooters(hfPolicy, xhtml);
+        }
+    }
+
+    /**
+     * Writes the first-page, even-page and default headers (those that
+     * exist) as one paragraph each.
+     */
+    private void extractHeaders(
+            XWPFHeaderFooterPolicy hfPolicy, XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        if (hfPolicy.getFirstPageHeader() != null) {
+            xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
+        }
+        if (hfPolicy.getEvenPageHeader() != null) {
+            xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
+        }
+        if (hfPolicy.getDefaultHeader() != null) {
+            xhtml.element("p", hfPolicy.getDefaultHeader().getText());
+        }
+    }
+
+    /**
+     * Writes the first-page, even-page and default footers (those that
+     * exist) as one paragraph each.
+     */
+    private void extractFooters(
+            XWPFHeaderFooterPolicy hfPolicy, XHTMLContentHandler xhtml)
+            throws SAXException, XmlException, IOException {
+        if (hfPolicy.getFirstPageFooter() != null) {
+            xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
+        }
+        if (hfPolicy.getEvenPageFooter() != null) {
+            xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
+        }
+        if (hfPolicy.getDefaultFooter() != null) {
+            xhtml.element("p", hfPolicy.getDefaultFooter().getText());
+        }
+    }
+
+    /**
+     * Low level structured parsing of document tables. Each table becomes
+     * a table/tbody/tr/td structure; every paragraph of a cell is written
+     * as its own "p" element so that the text of consecutive paragraphs
+     * does not run together.
+     *
+     * @param tables tables taken from the document body
+     * @param xhtml handler that receives the generated XHTML events
+     */
+    private void extractTableContent(CTTbl[] tables, XHTMLContentHandler xhtml)
+            throws SAXException {
+        for (CTTbl table : tables) {
+            xhtml.startElement("table");
+            xhtml.startElement("tbody");
+            for (CTRow row : table.getTrArray()) {
+                xhtml.startElement("tr");
+                for (CTTc tc : row.getTcArray()) {
+                    xhtml.startElement("td");
+                    for (CTP ctp : tc.getPArray()) {
+                        xhtml.startElement("p");
+                        // Runs of one paragraph are written contiguously
+                        for (CTR ctr : ctp.getRArray()) {
+                            for (CTText textContent : ctr.getTArray()) {
+                                xhtml.characters(textContent.getStringValue());
+                            }
+                        }
+                        xhtml.endElement("p");
+                    }
+                    xhtml.endElement("td");
+                }
+                xhtml.endElement("tr");
+            }
+            xhtml.endElement("tbody");
+            xhtml.endElement("table");
+        }
+    }
+}
Modified: lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=744290&r1=744289&r2=744290&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ lucene/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Fri Feb 13
23:46:55 2009
@@ -202,6 +202,13 @@
<glob pattern="*.msg" />
</mime-type>
+ <mime-type type="application/vnd.openxmlformats-package.core-properties+xml">
+ <sub-class-of type="application/zip"/>
+ <glob pattern="*.docx" />
+ <glob pattern="*.pptx" />
+ <glob pattern="*.xlsx" />
+ </mime-type>
+
<!-- =====================================================================
-->
<!-- Open Document Format for Office Applications (OpenDocument) v1.0
-->
<!-- http://www.oasis-open.org/specs/index.php#opendocumentv1.0
-->
Modified: lucene/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/resources/tika-config.xml?rev=744290&r1=744289&r2=744290&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/resources/tika-config.xml (original)
+++ lucene/tika/trunk/src/main/resources/tika-config.xml Fri Feb 13 23:46:55
2009
@@ -36,6 +36,13 @@
<mime>application/vnd.visio</mime>
<mime>application/vnd.ms-outlook</mime>
</parser>
+
+ <parser name="parse-ooxml" class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
+ <mime>application/vnd.openxmlformats-package.core-properties+xml</mime>
+ <mime>application/vnd.openxmlformats-officedocument.spreadsheetml.sheet</mime>
+ <mime>application/vnd.openxmlformats-officedocument.presentationml.presentation</mime>
+ <mime>application/vnd.openxmlformats-officedocument.wordprocessingml.document</mime>
+ </parser>
<parser name="parse-html"
class="org.apache.tika.parser.html.HtmlParser">
<mime>text/html</mime>
Added:
lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=744290&view=auto
==============================================================================
---
lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
(added)
+++
lucene/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Fri Feb 13 23:46:55 2009
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.opendocument.OpenOfficeParserTest;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.parser.AutoDetectParser;
+
+/**
+ * Parses the sample Excel, PowerPoint and Word OOXML test documents with
+ * {@link AutoDetectParser} and checks the detected content type, the
+ * title and author metadata, and selected body text.
+ */
+public class OOXMLParserTest extends TestCase {
+
+    /** Content type currently reported for all OOXML test documents. */
+    private static final String OOXML_TYPE =
+            "application/vnd.openxmlformats-package.core-properties+xml";
+
+    /**
+     * Parses a document from /test-documents/ and returns its body text.
+     * Fails the test with a clear message when the resource is missing.
+     *
+     * @param name file name of the test document
+     * @param metadata metadata object to be filled in by the parser
+     * @return text collected by a {@link BodyContentHandler}
+     */
+    private String parseToText(String name, Metadata metadata)
+            throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/" + name);
+        assertNotNull("Test document not found: " + name, input);
+        try {
+            Parser parser = new AutoDetectParser();
+            // TODO: should auto-detect without the resource name
+            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+            ContentHandler handler = new BodyContentHandler();
+            parser.parse(input, handler, metadata);
+            return handler.toString();
+        } finally {
+            input.close();
+        }
+    }
+
+    public void testExcel() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseToText("testEXCEL.xlsx", metadata);
+
+        assertEquals(OOXML_TYPE, metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(content.contains("Sample Excel Worksheet"));
+        assertTrue(content.contains("Numbers and their Squares"));
+        // Numeric cells must not be rendered with a trailing ".0"
+        assertTrue(content.contains("9"));
+        assertFalse(content.contains("9.0"));
+        assertTrue(content.contains("196"));
+        assertFalse(content.contains("196.0"));
+    }
+
+    public void testPowerPoint() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseToText("testPPT.pptx", metadata);
+
+        assertEquals(OOXML_TYPE, metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(content.contains("Sample Powerpoint Slide"));
+        assertTrue(content.contains("Powerpoint X for Mac"));
+    }
+
+    public void testWord() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = parseToText("testWORD.docx", metadata);
+
+        assertEquals(OOXML_TYPE, metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+        assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+        assertTrue(content.contains("Sample Word Document"));
+    }
+
+}
\ No newline at end of file
Added:
lucene/tika/trunk/src/test/resources/test-documents/testEXCEL-formats.xlsx
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/resources/test-documents/testEXCEL-formats.xlsx?rev=744290&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
lucene/tika/trunk/src/test/resources/test-documents/testEXCEL-formats.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/tika/trunk/src/test/resources/test-documents/testEXCEL.xlsx
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/resources/test-documents/testEXCEL.xlsx?rev=744290&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/src/test/resources/test-documents/testEXCEL.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/tika/trunk/src/test/resources/test-documents/testPPT.pptx
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/resources/test-documents/testPPT.pptx?rev=744290&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/src/test/resources/test-documents/testPPT.pptx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/tika/trunk/src/test/resources/test-documents/testWORD.docx
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/src/test/resources/test-documents/testWORD.docx?rev=744290&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/src/test/resources/test-documents/testWORD.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream