Author: jukka
Date: Sun Oct 14 08:41:54 2007
New Revision: 584558
URL: http://svn.apache.org/viewvc?rev=584558&view=rev
Log:
TIKA-63 - Avoid multiple passes over the input stream in Microsoft parsers
- Use POIFSFileSystem as the source of both metadata and text content
- Added separate test case classes for the Microsoft parsers
- Got rid of some extra listeners and exceptions
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
- copied, changed from r584532,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
(with props)
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
(with props)
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
(with props)
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/FastSavedException.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PasswordProtectedException.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PropertiesReaderListener.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Oct 14 08:41:54 2007
@@ -101,3 +101,6 @@
45. TIKA-58 - Replace jtidy html parser with nekohtml based parser (siren)
46. TIKA-60 - Rename Microsoft parser classes (jukka)
+
+47. TIKA-63 - Avoid multiple passes over the input stream in Microsoft parsers
+ (jukka)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java
Sun Oct 14 08:41:54 2007
@@ -16,21 +16,26 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.InputStream;
+import java.io.IOException;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Excel parser
*/
public class ExcelParser extends OfficeParser {
- protected String extractText(InputStream input) throws Exception {
+ protected String getContentType() {
+ return "application/vnd.ms-excel";
+ }
+
+ protected String extractText(POIFSFileSystem filesystem) throws IOException{
StringBuilder builder = new StringBuilder();
- extractText(new HSSFWorkbook(input), builder);
+ extractText(new HSSFWorkbook(filesystem), builder);
return builder.toString();
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Sun Oct 14 08:41:54 2007
@@ -20,13 +20,16 @@
import java.io.IOException;
import java.io.InputStream;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.HPSFException;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.SummaryInformation;
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.XHTMLContentHandler;
-import org.apache.tika.utils.RereadableInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -35,49 +38,143 @@
*/
public abstract class OfficeParser implements Parser {
- private final int MEMORY_THRESHOLD = 1024 * 1024;
-
/**
* Extracts properties and text from an MS Document input stream
*/
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
- RereadableInputStream ris =
- new RereadableInputStream(stream, MEMORY_THRESHOLD, true, false);
- try {
- // First, extract properties
- POIFSReader reader = new POIFSReader();
- reader.registerListener(
- new PropertiesReaderListener(metadata),
- SummaryInformation.DEFAULT_STREAM_NAME);
+ POIFSFileSystem filesystem = new POIFSFileSystem(stream);
- if (stream.available() > 0) {
- reader.read(ris);
- }
- while (ris.read() != -1) {
- }
- ris.rewind();
- // Extract document full text
- XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
- xhtml.element("p", extractText(ris));
- xhtml.endDocument();
- } catch (IOException e) {
- throw e;
- } catch (TikaException e) {
- throw e;
- } catch (Exception e) {
- throw new TikaException("Parse error", e);
- } finally {
- ris.close();
- }
+ metadata.set(Metadata.CONTENT_TYPE, getContentType());
+ getMetadata(
+ filesystem, SummaryInformation.DEFAULT_STREAM_NAME, metadata);
+ getMetadata(
+ filesystem, DocumentSummaryInformation.DEFAULT_STREAM_NAME,
+ metadata);
+
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.element("p", extractText(filesystem));
+ xhtml.endDocument();
}
/**
+ * The content type of the document being parsed.
+ *
+ * @return MIME content type
+ */
+ protected abstract String getContentType();
+
+ /**
* Extracts the text content from a Microsoft document input stream.
*/
- protected abstract String extractText(InputStream input) throws Exception;
+ protected abstract String extractText(POIFSFileSystem filesystem)
+ throws IOException, TikaException;
+
+ private void getMetadata(
+ POIFSFileSystem filesystem, String name, Metadata metadata) {
+ try {
+ InputStream stream = filesystem.createDocumentInputStream(name);
+ try {
+ getMetadata(stream, metadata);
+ } finally {
+ stream.close();
+ }
+ } catch (Exception e) {
+ // summary information not available, ignore
+ }
+ }
+
+ private void getMetadata(InputStream stream, Metadata metadata)
+ throws HPSFException, IOException {
+ PropertySet set = PropertySetFactory.create(stream);
+ if (set instanceof SummaryInformation) {
+ getMetadata((SummaryInformation) set, metadata);
+ } else if (set instanceof DocumentSummaryInformation) {
+ getMetadata((DocumentSummaryInformation) set, metadata);
+ }
+ }
+
+ private void getMetadata(
+ SummaryInformation information, Metadata metadata) {
+ if (information.getTitle() != null) {
+ metadata.set(Metadata.TITLE, information.getTitle());
+ }
+ if (information.getAuthor() != null) {
+ metadata.set(Metadata.AUTHOR, information.getAuthor());
+ }
+ if (information.getKeywords() != null) {
+ metadata.set(Metadata.KEYWORDS, information.getKeywords());
+ }
+ if (information.getSubject() != null) {
+ metadata.set(Metadata.SUBJECT, information.getSubject());
+ }
+ if (information.getLastAuthor() != null) {
+ metadata.set(Metadata.LAST_AUTHOR, information.getLastAuthor());
+ }
+ if (information.getComments() != null) {
+ metadata.set(Metadata.COMMENTS, information.getComments());
+ }
+ if (information.getTemplate() != null) {
+ metadata.set(Metadata.TEMPLATE, information.getTemplate());
+ }
+ if (information.getApplicationName() != null) {
+ metadata.set(
+ Metadata.APPLICATION_NAME,
+ information.getApplicationName());
+ }
+ if (information.getRevNumber() != null) {
+ metadata.set(Metadata.REVISION_NUMBER, information.getRevNumber());
+ }
+ if (information.getCreateDateTime() != null) {
+ metadata.set(
+ "creationdate",
+ information.getCreateDateTime().toString());
+ }
+ if (information.getCharCount() > 0) {
+ metadata.set(
+ Metadata.CHARACTER_COUNT,
+ Integer.toString(information.getCharCount()));
+ }
+ if (information.getEditTime() > 0) {
+ metadata.set("edittime", Long.toString(information.getEditTime()));
+ }
+ if (information.getLastSaveDateTime() != null) {
+ metadata.set(
+ Metadata.LAST_SAVED,
+ information.getLastSaveDateTime().toString());
+ }
+ if (information.getPageCount() > 0) {
+ metadata.set(
+ Metadata.PAGE_COUNT,
+ Integer.toString(information.getPageCount()));
+ }
+ if (information.getSecurity() > 0) {
+ metadata.set(
+ "security", Integer.toString(information.getSecurity()));
+ }
+ if (information.getWordCount() > 0) {
+ metadata.set(
+ Metadata.WORD_COUNT,
+ Integer.toString(information.getWordCount()));
+ }
+ if (information.getLastPrinted() != null) {
+ metadata.set(
+ Metadata.LAST_PRINTED,
+ information.getLastPrinted().toString());
+ }
+ }
+
+ private void getMetadata(
+ DocumentSummaryInformation information, Metadata metadata) {
+ if (information.getCompany() != null) {
+ metadata.set("company", information.getCompany());
+ }
+ if (information.getManager() != null) {
+ metadata.set("manager", information.getManager());
+ }
+ }
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PPTConstants.java
Sun Oct 14 08:41:54 2007
@@ -47,11 +47,6 @@
/** ATOM ID of drawing group area */
public static final long PPT_ATOM_DRAWINGGROUP = 61448L;
- /** Name for PowerPoint Documents within the file */
- public static final String POWERPOINT_DOCUMENT = "PowerPoint Document";
-
-
-
/**
* Protected constructor to prevent instantiation.
*/
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
(from r584532,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java&r1=584532&r2=584558&rev=584558&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ContentReaderListener.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java
Sun Oct 14 08:41:54 2007
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.InputStream;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
@@ -23,17 +24,10 @@
import org.apache.log4j.Logger;
import org.apache.poi.hdf.extractor.Utils;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
-/**
- * Listener to read the content of PowerPoint file and transfers it to the
- * passed <code>StringBuilder</code>.
- */
-class ContentReaderListener implements POIFSReaderListener {
+class PowerPointExtractor {
static Logger LOG = Logger.getRootLogger();
@@ -43,31 +37,15 @@
/**
* Constructs Listener to get content of PowerPoint file.
*/
- public ContentReaderListener(StringBuilder builder) {
+ public PowerPointExtractor(StringBuilder builder) {
this.builder = builder;
}
/**
* Reads the internal PowerPoint document stream.
- *
- * @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
*/
- public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
-
- if (event == null
- || event.getName() == null
- || !event.getName()
- .startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
-
- LOG
- .warn("Stream not processed. It is not a PowerPoint document: : "
- + event.getName());
-
- return;
- }
-
+ public void extract(InputStream dis) {
try {
- final DocumentInputStream dis = event.getStream();
final byte pptdata[] = new byte[dis.available()];
dis.read(pptdata, 0, dis.available());
int offset = 0;
@@ -457,4 +435,5 @@
return slides;
}
+
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java
Sun Oct 14 08:41:54 2007
@@ -16,22 +16,35 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.IOException;
import java.io.InputStream;
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
* Power point parser
*/
public class PowerPointParser extends OfficeParser {
- protected String extractText(InputStream input) throws Exception {
+ /**
+ * Name of a PowerPoint document within a POIFS file system
+ */
+ private static final String POWERPOINT = "PowerPoint Document";
+
+ protected String getContentType() {
+ return "application/vnd.ms-powerpoint";
+ }
+
+ protected String extractText(POIFSFileSystem filesystem) throws IOException {
StringBuilder builder = new StringBuilder();
- POIFSReader reader = new POIFSReader();
- reader.registerListener(
- new ContentReaderListener(builder),
- PPTConstants.POWERPOINT_DOCUMENT);
- reader.read(input);
+
+ InputStream stream = filesystem.createDocumentInputStream(POWERPOINT);
+ try {
+ new PowerPointExtractor(builder).extract(stream);
+ } finally {
+ stream.close();
+ }
+
return builder.toString();
}
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java
Sun Oct 14 08:41:54 2007
@@ -20,6 +20,7 @@
import org.apache.poi.util.LittleEndian;
import org.apache.poi.hwpf.model.*;
+import java.io.IOException;
import java.util.*;
/**
@@ -44,8 +45,7 @@
* @return The text from the document
* @throws Exception If there are any unexpected exceptions.
*/
- public String extractText(byte[] mainStream) throws Exception
- {
+ public String extractText(byte[] mainStream) throws IOException {
int fcMin = LittleEndian.getInt(mainStream, 0x18);
int fcMax = LittleEndian.getInt(mainStream, 0x1C);
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=584558&r1=584557&r2=584558&view=diff
==============================================================================
---
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
(original)
+++
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java
Sun Oct 14 08:41:54 2007
@@ -16,7 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.InputStream;
+import java.io.IOException;
import java.util.Iterator;
import java.util.List;
@@ -31,20 +31,24 @@
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;
+import org.apache.tika.exception.TikaException;
/**
* Word parser
*/
public class WordParser extends OfficeParser {
+ protected String getContentType() {
+ return "application/msword";
+ }
+
/**
* Gets the text from a Word document.
*
* @param in The InputStream representing the Word file.
*/
- public String extractText(InputStream in) throws Exception {
- POIFSFileSystem fsys = new POIFSFileSystem(in);
-
+ public String extractText(POIFSFileSystem fsys)
+ throws IOException, TikaException {
// load our POIFS document streams.
DocumentEntry headerProps =
(DocumentEntry) fsys.getRoot().getEntry("WordDocument");
@@ -56,12 +60,10 @@
int info = LittleEndian.getShort(header, 0xa);
if ((info & 0x4) != 0) {
- throw new FastSavedException(
- "Fast-saved files are unsupported at this time");
+ throw new TikaException("Fast-saved files are unsupported");
}
if ((info & 0x100) != 0) {
- throw new PasswordProtectedException(
- "This document is password protected");
+ throw new TikaException("This document is password protected");
}
// determine the version of Word this document came from.
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=584558&view=auto
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
(added)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Sun Oct 14 08:41:54 2007
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class ExcelParserTest extends TestCase {
+
+ public void testExcelParser() throws Exception {
+ InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL.xls");
+ try {
+ Metadata metadata = new Metadata();
+ StringWriter writer = new StringWriter();
+ ContentHandler handler = new WriteOutContentHandler(writer);
+ new ExcelParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ String content = writer.toString();
+ assertTrue(content.contains("Sample Excel Worksheet"));
+ assertTrue(content.contains("Numbers and their Squares"));
+ assertTrue(content.contains("9.0"));
+ assertTrue(content.contains("196.0"));
+ } finally {
+ input.close();
+ }
+ }
+
+}
Propchange:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java?rev=584558&view=auto
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
(added)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
Sun Oct 14 08:41:54 2007
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class PowerPointParserTest extends TestCase {
+
+ public void testPowerPointParser() throws Exception {
+ InputStream input = PowerPointParserTest.class.getResourceAsStream(
+ "/test-documents/testPPT.ppt");
+ try {
+ Metadata metadata = new Metadata();
+ StringWriter writer = new StringWriter();
+ ContentHandler handler = new WriteOutContentHandler(writer);
+ new PowerPointParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/vnd.ms-powerpoint",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ String content = writer.toString();
+ assertTrue(content.contains("Sample Powerpoint Slide"));
+ assertTrue(content.contains("Powerpoint X for Mac"));
+ } finally {
+ input.close();
+ }
+ }
+
+}
Propchange:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=584558&view=auto
==============================================================================
---
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
(added)
+++
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Sun Oct 14 08:41:54 2007
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+
+import junit.framework.TestCase;
+
+public class WordParserTest extends TestCase {
+
+ public void testWordParser() throws Exception {
+ InputStream input = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD.doc");
+ try {
+ Metadata metadata = new Metadata();
+ StringWriter writer = new StringWriter();
+ ContentHandler handler = new WriteOutContentHandler(writer);
+ new WordParser().parse(input, handler, metadata);
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ String content = writer.toString();
+ assertTrue(content.contains("Sample Word Document"));
+ } finally {
+ input.close();
+ }
+ }
+
+}
Propchange:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native