Author: jukka
Date: Sun Sep 14 07:24:46 2008
New Revision: 695223

URL: http://svn.apache.org/viewvc?rev=695223&view=rev
Log:
TIKA-54: Outlook msg parser

Integrated Outlook parsing with OfficeParser. This way magic autodetection
also works correctly for Outlook files.

Added a better test case.

Added:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
   (contents, props changed)
      - copied, changed from r692882, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
    
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Removed:
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    incubator/tika/trunk/src/main/resources/tika-config.xml

Modified: incubator/tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=695223&r1=695222&r2=695223&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Sep 14 07:24:46 2008
@@ -78,6 +78,9 @@
 33. TIKA-120 - Add support for retrieving ID3 tags from MP3 files
                (Dave Meikle & Jukka Zitting)
 
+34. TIKA-54  - Outlook msg parser
+               (Rida Benjelloun, Dave Meikle & Jukka Zitting)
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=695223&r1=695222&r2=695223&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Sun Sep 14 07:24:46 2008
@@ -75,6 +75,8 @@
                 setType(metadata, "application/vnd.ms-excel");
             } else if ("VisioDocument".equals(name)) {
                 setType(metadata, "application/vnd.visio");
+            } else if (name.startsWith("__substg1.0_")) {
+                setType(metadata, "application/vnd.ms-outlook");
             }
         }
     }
@@ -119,6 +121,9 @@
                 for (String text : extractor.getAllText()) {
                     xhtml.element("p", text);
                 }
+            } else if (name.startsWith("__substg1.0_")) {
+                setType(metadata, "application/vnd.ms-outlook");
+                new OutlookExtractor(filesystem).parse(xhtml, metadata);
             }
         }
 

Copied: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 (from r692882, 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java)
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java&r1=692882&r2=695223&rev=695223&view=diff
==============================================================================
--- 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
 (original)
+++ 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
 Sun Sep 14 07:24:46 2008
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -16,40 +16,82 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import org.apache.tika.metadata.Metadata;
+import java.io.IOException;
+
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.hsmf.parsers.POIFSChunkParser;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.poi.hsmf.MAPIMessage;
-import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import java.io.InputStream;
-import java.io.IOException;
-
 /**
  * Outlook Message Parser.
  */
-public class OutlookMessageParser extends AbstractParser {
-    public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
-            throws IOException, TikaException, SAXException {
+class OutlookExtractor {
+
+    private static final Chunks CHUNKS = Chunks.getInstance();
+
+    private final POIFSChunkParser parser;
+
+    public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
         try {
-            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-            xhtml.startDocument();
+            this.parser = new POIFSChunkParser(filesystem);
+        } catch (IOException e) {
+            throw new TikaException("Failed to parse Outlook chunks", e);
+        }
+    }
+
+    public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+            throws TikaException, SAXException {
+        String subject = getChunk(CHUNKS.subjectChunk);
+        String from = getChunk(CHUNKS.displayFromChunk);
 
-            MAPIMessage msg = new MAPIMessage(stream);
-            metadata.add("from", msg.getDisplayFrom());
-            metadata.add("to", msg.getDisplayTo());
-            metadata.add(Metadata.SUBJECT, msg.getSubject());
-            metadata.add("messageClass", msg.getMessageClass());
-            metadata.add("conversationTopic", msg.getConversationTopic());
+        metadata.set(Metadata.AUTHOR, from);
+        metadata.set(Metadata.TITLE, subject);
+        metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
 
-            xhtml.element("p", msg.getTextBody());
-            xhtml.endDocument();
+        xhtml.element("h1", subject);
+        xhtml.characters("\n");
+
+        xhtml.startElement("dl");
+        header(xhtml, "From", from);
+        header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
+        header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
+        header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
+        xhtml.endElement("dl");
+        xhtml.characters("\n");
+
+        xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
+    }
+
+    private void header(XHTMLContentHandler xhtml, String key, String value)
+            throws SAXException {
+        if (value.length() > 0) {
+            xhtml.element("dt", key);
+            xhtml.characters("\t");
+            xhtml.element("dd", value);
+            xhtml.characters("\n");
         }
-        catch (ChunkNotFoundException ex) {
-            throw new TikaException("Error parsing message.");
+    }
+
+    /**
+     * Returns the content of the identified string chunk in the
+     * current document. Returns the empty string if the identified
+     * chunk does not exist in the current document.
+     *
+     * @param chunk string chunk identifier
+     * @return content of the identified chunk, or the empty string
+     */
+    private String getChunk(StringChunk chunk) {
+        try {
+            return parser.getDocumentNode(chunk).toString();
+        } catch (ChunkNotFoundException e) {
+            return "";
         }
     }
+
 }

Propchange: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
------------------------------------------------------------------------------
    svn:mergeinfo = 

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=695223&r1=695222&r2=695223&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Sep 14 07:24:46 2008
@@ -34,9 +34,6 @@
                 <mime>application/vnd.ms-excel</mime>
                 <mime>application/vnd.ms-powerpoint</mime>
                 <mime>application/vnd.visio</mime>
-        </parser>
-
-        <parser name="parse-outlook" class="org.apache.tika.parser.microsoft.OutlookMessageParser">
                 <mime>application/vnd.ms-outlook</mime>
         </parser>
 

Added: 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=695223&view=auto
==============================================================================
--- 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 (added)
+++ 
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 Sun Sep 14 07:24:46 2008
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TestCase {
+
+    public void testOutlookParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook.msg");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Microsoft Outlook Express 6",
+                metadata.get(Metadata.TITLE));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Microsoft Outlook Express 6"));
+        assertTrue(content.contains("L'\u00C9quipe Microsoft Outlook Express"));
+        assertTrue(content.contains("Nouvel utilisateur de Outlook Express"));
+        assertTrue(content.contains("Messagerie et groupes de discussion"));
+    }
+
+}


Reply via email to