Author: dmeikle
Date: Sun Mar 28 22:46:52 2010
New Revision: 928505

URL: http://svn.apache.org/viewvc?rev=928505&view=rev
Log:
TIKA-395: Update to allow OutlookParser to support new format Outlook messages.

Added:
    
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
   (with props)
Modified:
    
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Modified: 
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=928505&r1=928504&r2=928505&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sun Mar 28 22:46:52 2010
@@ -33,13 +33,14 @@ import org.xml.sax.SAXException;
  */
 class OutlookExtractor {
 
-    private static final Chunks CHUNKS = Chunks.getInstance(false);
+    private final Chunks chunks;
 
     private final POIFSChunkParser parser;
 
     public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
         try {
             this.parser = new POIFSChunkParser(filesystem);
+            this.chunks = parser.identifyChunks();
         } catch (IOException e) {
             throw new TikaException("Failed to parse Outlook chunks", e);
         }
@@ -47,23 +48,23 @@ class OutlookExtractor {
 
     public void parse(XHTMLContentHandler xhtml, Metadata metadata)
             throws TikaException, SAXException {
-        String subject = getChunk(CHUNKS.subjectChunk);
-        String from = getChunk(CHUNKS.displayFromChunk);
+        String subject = getChunk(chunks.subjectChunk);
+        String from = getChunk(chunks.displayFromChunk);
 
         metadata.set(Metadata.AUTHOR, from);
         metadata.set(Metadata.TITLE, subject);
-        metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
+        metadata.set(Metadata.SUBJECT, getChunk(chunks.conversationTopic));
 
         xhtml.element("h1", subject);
 
         xhtml.startElement("dl");
         header(xhtml, "From", from);
-        header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
-        header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
-        header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
+        header(xhtml, "To", getChunk(chunks.displayToChunk));
+        header(xhtml, "Cc", getChunk(chunks.displayCCChunk));
+        header(xhtml, "Bcc", getChunk(chunks.displayBCCChunk));
         xhtml.endElement("dl");
 
-        xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
+        xhtml.element("p", getChunk(chunks.textBodyChunk));
     }
 
     private void header(XHTMLContentHandler xhtml, String key, String value)

Modified: 
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=928505&r1=928504&r2=928505&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sun Mar 28 22:46:52 2010
@@ -93,4 +93,35 @@ public class OutlookParserTest extends T
         assertFalse(matcher.find());
     }
 
+    /**
+     * Test case for TIKA-395, to ensure parser works for new Outlook formats. 
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+     */
+    public void testOutlookNew() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook2003.msg");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Welcome to Microsoft Office Outlook 2003",
+                metadata.get(Metadata.TITLE));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Outlook 2003"));
+        assertTrue(content.contains("Streamlined Mail Experience"));
+        assertTrue(content.contains("Navigation Pane"));
+    }
+
 }

Added: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg?rev=928505&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/test-outlook2003.msg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to