Author: jukka
Date: Sun Sep 14 07:24:46 2008
New Revision: 695223
URL: http://svn.apache.org/viewvc?rev=695223&view=rev
Log:
TIKA-54: Outlook msg parser
Integrated Outlook parsing with OfficeParser. This way magic autodetection
also works correctly for Outlook files.
Added a better test case.
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
(contents, props changed)
- copied, changed from r692882,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
incubator/tika/trunk/src/main/resources/tika-config.xml
Modified: incubator/tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=695223&r1=695222&r2=695223&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Sep 14 07:24:46 2008
@@ -78,6 +78,9 @@
33. TIKA-120 - Add support for retrieving ID3 tags from MP3 files
(Dave Meikle & Jukka Zitting)
+34. TIKA-54 - Outlook msg parser
+ (Rida Benjelloun, Dave Meikle & Jukka Zitting)
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=695223&r1=695222&r2=695223&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Sep 14 07:24:46 2008
@@ -75,6 +75,8 @@
setType(metadata, "application/vnd.ms-excel");
} else if ("VisioDocument".equals(name)) {
setType(metadata, "application/vnd.visio");
+ } else if (name.startsWith("__substg1.0_")) {
+ setType(metadata, "application/vnd.ms-outlook");
}
}
}
@@ -119,6 +121,9 @@
for (String text : extractor.getAllText()) {
xhtml.element("p", text);
}
+ } else if (name.startsWith("__substg1.0_")) {
+ setType(metadata, "application/vnd.ms-outlook");
+ new OutlookExtractor(filesystem).parse(xhtml, metadata);
}
}
Copied:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
(from r692882,
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java)
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?p2=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java&p1=incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java&r1=692882&r2=695223&rev=695223&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Sun Sep 14 07:24:46 2008
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,40 +16,82 @@
*/
package org.apache.tika.parser.microsoft;
-import org.apache.tika.metadata.Metadata;
+import java.io.IOException;
+
+import org.apache.poi.hsmf.datatypes.Chunks;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.hsmf.parsers.POIFSChunkParser;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.poi.hsmf.MAPIMessage;
-import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import java.io.InputStream;
-import java.io.IOException;
-
/**
* Outlook Message Parser.
*/
-public class OutlookMessageParser extends AbstractParser {
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
- throws IOException, TikaException, SAXException {
+class OutlookExtractor {
+
+ private static final Chunks CHUNKS = Chunks.getInstance();
+
+ private final POIFSChunkParser parser;
+
+ public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
try {
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
+ this.parser = new POIFSChunkParser(filesystem);
+ } catch (IOException e) {
+ throw new TikaException("Failed to parse Outlook chunks", e);
+ }
+ }
+
+ public void parse(XHTMLContentHandler xhtml, Metadata metadata)
+ throws TikaException, SAXException {
+ String subject = getChunk(CHUNKS.subjectChunk);
+ String from = getChunk(CHUNKS.displayFromChunk);
- MAPIMessage msg = new MAPIMessage(stream);
- metadata.add("from", msg.getDisplayFrom());
- metadata.add("to", msg.getDisplayTo());
- metadata.add(Metadata.SUBJECT, msg.getSubject());
- metadata.add("messageClass", msg.getMessageClass());
- metadata.add("conversationTopic", msg.getConversationTopic());
+ metadata.set(Metadata.AUTHOR, from);
+ metadata.set(Metadata.TITLE, subject);
+ metadata.set(Metadata.SUBJECT, getChunk(CHUNKS.conversationTopic));
- xhtml.element("p", msg.getTextBody());
- xhtml.endDocument();
+ xhtml.element("h1", subject);
+ xhtml.characters("\n");
+
+ xhtml.startElement("dl");
+ header(xhtml, "From", from);
+ header(xhtml, "To", getChunk(CHUNKS.displayToChunk));
+ header(xhtml, "Cc", getChunk(CHUNKS.displayCCChunk));
+ header(xhtml, "Bcc", getChunk(CHUNKS.displayBCCChunk));
+ xhtml.endElement("dl");
+ xhtml.characters("\n");
+
+ xhtml.element("p", getChunk(CHUNKS.textBodyChunk));
+ }
+
+ private void header(XHTMLContentHandler xhtml, String key, String value)
+ throws SAXException {
+ if (value.length() > 0) {
+ xhtml.element("dt", key);
+ xhtml.characters("\t");
+ xhtml.element("dd", value);
+ xhtml.characters("\n");
}
- catch (ChunkNotFoundException ex) {
- throw new TikaException("Error parsing message.");
+ }
+
+ /**
+ * Returns the content of the identified string chunk in the
+ * current document. Returns the empty string if the identified
+ * chunk does not exist in the current document.
+ *
+ * @param chunk string chunk identifier
+ * @return content of the identified chunk, or the empty string
+ */
+ private String getChunk(StringChunk chunk) {
+ try {
+ return parser.getDocumentNode(chunk).toString();
+ } catch (ChunkNotFoundException e) {
+ return "";
}
}
+
}
Propchange:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
------------------------------------------------------------------------------
svn:mergeinfo =
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=695223&r1=695222&r2=695223&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Sep 14 07:24:46 2008
@@ -34,9 +34,6 @@
<mime>application/vnd.ms-excel</mime>
<mime>application/vnd.ms-powerpoint</mime>
<mime>application/vnd.visio</mime>
- </parser>
-
- <parser name="parse-outlook" class="org.apache.tika.parser.microsoft.OutlookMessageParser">
<mime>application/vnd.ms-outlook</mime>
</parser>
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=695223&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Sun Sep 14 07:24:46 2008
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TestCase {
+
+ public void testOutlookParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook.msg");
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(
+ "Microsoft Outlook Express 6",
+ metadata.get(Metadata.TITLE));
+ assertEquals(
+ "L'\u00C9quipe Microsoft Outlook Express",
+ metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString();
+ assertTrue(content.contains("Microsoft Outlook Express 6"));
+ assertTrue(content.contains("L'\u00C9quipe Microsoft Outlook Express"));
+ assertTrue(content.contains("Nouvel utilisateur de Outlook Express"));
+ assertTrue(content.contains("Messagerie et groupes de discussion"));
+ }
+
+}