jukka
Wed, 09 Jul 2008 16:59:37 -0700
Author: jukka Date: Wed Jul 9 16:59:03 2008 New Revision: 675384 URL: http://svn.apache.org/viewvc?rev=675384&view=rev Log: TIKA-54: Outlook msg parser - Patch by Dave Meikle - Test file by Rida Benjelloun Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg (with props) Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml incubator/tika/trunk/src/main/resources/tika-config.xml incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java?rev=675384&view=auto ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java (added) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java Wed Jul 9 16:59:03 2008 @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.poi.hsmf.MAPIMessage; +import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.InputStream; +import java.io.IOException; + +/** + * Outlook Message Parser. + */ +public class OutlookMessageParser extends AbstractParser { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata) + throws IOException, TikaException, SAXException { + try { + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + + MAPIMessage msg = new MAPIMessage(stream); + metadata.add("from", msg.getDisplayFrom()); + metadata.add("to", msg.getDisplayTo()); + metadata.add(Metadata.SUBJECT, msg.getSubject()); + metadata.add("messageClass", msg.getMessageClass()); + metadata.add("conversationTopic", msg.getConversationTopic()); + + xhtml.element("p", msg.getTextBody()); + xhtml.endDocument(); + } + catch (ChunkNotFoundException ex) { + throw new TikaException("Error parsing message."); + } + } +} Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=675384&r1=675383&r2=675384&view=diff ============================================================================== --- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original) +++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Wed Jul 9 16:59:03 2008 @@ -184,6 +184,10 @@ <alias type="application/msexcel" /> </mime-type> + <mime-type type="application/vnd.ms-outlook"> + <glob pattern="*.msg" /> + </mime-type> + <!-- ===================================================================== --> <!-- Open Document Format for Office Applications (OpenDocument) v1.0 --> <!-- http://www.oasis-open.org/specs/index.php#opendocumentv1.0 --> Modified: incubator/tika/trunk/src/main/resources/tika-config.xml URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=675384&r1=675383&r2=675384&view=diff ============================================================================== --- incubator/tika/trunk/src/main/resources/tika-config.xml (original) +++ incubator/tika/trunk/src/main/resources/tika-config.xml Wed Jul 9 16:59:03 2008 @@ -36,6 +36,10 @@ <mime>application/vnd.visio</mime> </parser> + <parser name="parse-outlook" class="org.apache.tika.parser.microsoft.OutlookMessageParser"> + <mime>application/vnd.ms-outlook</mime> + </parser> + <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser"> <mime>text/html</mime> <mime>application/x-asp</mime> Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=675384&r1=675383&r2=675384&view=diff ============================================================================== --- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original) +++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Wed Jul 9 16:59:03 2008 @@ -153,6 +153,14 @@ assertEquals(s1, s2); } + public void testOutlookExtraction() throws Exception { + File file = getTestFile("test-outlook.msg"); + String s1 = ParseUtils.getStringContent(file, tc); + String s2 = ParseUtils.getStringContent(file, tc, + "application/vnd.ms-outlook"); + assertEquals(s1, s2); + } + public void testHTMLExtraction() throws Exception { File file = getTestFile("testHTML.html"); String s1 = ParseUtils.getStringContent(file, tc); Added: incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg?rev=675384&view=auto ============================================================================== Binary file - no diff available. Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream