Author: dmeikle
Date: Sun Mar 28 22:46:52 2010
New Revision: 928505
URL: http://svn.apache.org/viewvc?rev=928505&view=rev
Log:
TIKA-395: Update to allow OutlookParser to support new format Outlook messages.
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
(with props)
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=928505&r1=928504&r2=928505&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sun Mar 28 22:46:52 2010
@@ -33,13 +33,14 @@ import org.xml.sax.SAXException;
*/
class OutlookExtractor {
- private static final Chunks CHUNKS = Chunks.getInstance(false);
+ private final Chunks chunks;
private final POIFSChunkParser parser;
public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
try {
this.parser = new POIFSChunkParser(filesystem);
+ this.chunks = parser.identifyChunks();
} catch (IOException e) {
throw new TikaException("Failed to parse Outlook chunks", e);
}
@@ -47,23 +48,23 @@ class OutlookExtractor {
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
throws TikaException, SAXException {
- String subject = getChunk(CHUNKS.subjectChunk);
- String from = getChunk(CHUNKS.displayFromChunk);
+ String subject = getChunk(chunks.subjectChunk);
+ String from = getChunk(chunks.displayFromChunk);
metadata.set(Metadata.AUTHOR, from);
metadata.set(Metadata.TITLE, subject);
- metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
+ metadata.set(Metadata.SUBJECT, getChunk(chunks.conversationTopic));
xhtml.element("h1", subject);
xhtml.startElement("dl");
header(xhtml, "From", from);
- header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
- header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
- header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
+ header(xhtml, "To", getChunk(chunks.displayToChunk));
+ header(xhtml, "Cc", getChunk(chunks.displayCCChunk));
+ header(xhtml, "Bcc", getChunk(chunks.displayBCCChunk));
xhtml.endElement("dl");
- xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
+ xhtml.element("p", getChunk(chunks.textBodyChunk));
}
private void header(XHTMLContentHandler xhtml, String key, String value)
Modified:
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=928505&r1=928504&r2=928505&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sun Mar 28 22:46:52 2010
@@ -93,4 +93,35 @@ public class OutlookParserTest extends T
assertFalse(matcher.find());
}
+ /**
+ * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+ */
+ public void testOutlookNew() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook2003.msg");
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(
+ "Welcome to Microsoft Office Outlook 2003",
+ metadata.get(Metadata.TITLE));
+
+ String content = handler.toString();
+ assertTrue(content.contains("Outlook 2003"));
+ assertTrue(content.contains("Streamlined Mail Experience"));
+ assertTrue(content.contains("Navigation Pane"));
+ }
+
}
Added:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg?rev=928505&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream