Author: tallison
Date: Mon Jul 20 15:27:32 2015
New Revision: 1691962
URL: http://svn.apache.org/r1691962
Log:
TIKA-1238: Update OutlookExtractor's codepoint detection algorithm
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1691962&r1=1691961&r2=1691962&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Jul 20 15:27:32 2015
@@ -1,5 +1,8 @@
Release 1.10 - Current Development
+ * Added more robust error handling for encoding detection
+ of .MSG files (TIKA-1238).
+
* Fixed bug in Tika's use of the Jackcess parser that
prevented reading of v97 Access files (TIKA-1681).
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1691962&r1=1691961&r2=1691962&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
Mon Jul 20 15:27:32 2015
@@ -18,26 +18,39 @@ package org.apache.tika.parser.microsoft
import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.UnsupportedCharsetException;
import java.text.ParseException;
import java.util.Date;
+import java.util.List;
import java.util.Locale;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
import org.apache.poi.hsmf.MAPIMessage;
import org.apache.poi.hsmf.datatypes.AttachmentChunks;
import org.apache.poi.hsmf.datatypes.ByteChunk;
import org.apache.poi.hsmf.datatypes.Chunk;
+import org.apache.poi.hsmf.datatypes.Chunks;
import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.PropertyValue;
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.util.CodePageUtil;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlEncodingDetector;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.parser.mbox.MboxParser;
import org.apache.tika.parser.rtf.RTFParser;
@@ -52,6 +65,9 @@ import org.xml.sax.SAXException;
* Outlook Message Parser.
*/
public class OutlookExtractor extends AbstractPOIFSExtractor {
+ private static final Metadata EMPTY_METADATA = new Metadata();
+ HtmlEncodingDetector detector = new HtmlEncodingDetector();
+
private final MAPIMessage msg;
public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context)
throws TikaException {
@@ -76,22 +92,7 @@ public class OutlookExtractor extends Ab
// If the message contains strings that aren't stored
// as Unicode, try to sort out an encoding for them
if (msg.has7BitEncodingStrings()) {
- if (msg.getHeaders() != null) {
- // There's normally something in the headers
- msg.guess7BitEncoding();
- } else {
- // Nothing in the header, try encoding detection
- // on the message body
- StringChunk text = msg.getMainChunks().textBodyChunk;
- if (text != null) {
- CharsetDetector detector = new CharsetDetector();
- detector.setText(text.getRawValue());
- CharsetMatch match = detector.detect();
- if (match.getConfidence() > 35) {
- msg.set7BitEncoding(match.getName());
- }
- }
- }
+ guess7BitEncoding(msg);
}
// Start with the metadata
@@ -262,4 +263,123 @@ public class OutlookExtractor extends Ab
xhtml.element("dd", value);
}
}
+
+ /**
+  * Matches a Content-Type header line and captures the value of its
+  * charset parameter (without surrounding quotes) in group 1.
+  * Compiled once because Pattern instances are immutable and reusable.
+  */
+ private static final Pattern CONTENT_TYPE_CHARSET_PATTERN = Pattern.compile(
+         "Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?",
+         Pattern.CASE_INSENSITIVE);
+
+ /**
+  * Tries to identify the correct encoding for 7-bit (non-unicode)
+  * strings in the file.
+  * <p>Many messages store their strings as unicode, which is
+  * nice and easy. Some use one-byte encodings for their
+  * strings, but don't always store the encoding anywhere
+  * helpful in the file.</p>
+  * <p>The candidates are tried in this order, and the first one that
+  * {@link #tryToSet7BitEncoding(MAPIMessage, String)} accepts wins:</p>
+  * <ol>
+  * <li>the MESSAGE_CODEPAGE and INTERNET_CPID codepage properties,</li>
+  * <li>a charset parameter on a Content-Type header,</li>
+  * <li>the charset reported by the HTML encoding detector for the
+  * html body,</li>
+  * <li>the charset detector run over the raw bytes of the text body.</li>
+  * </ol>
+  * <p>If none of them yields a usable charset, the message is left
+  * unchanged.</p>
+  * <p>Bug #49441 has more on why this is needed</p>
+  * <p>This is derived from the equivalent logic in POI (TIKA-1238)
+  * as a temporary workaround to prevent unsupported encoding exceptions</p>
+  *
+  * @param msg the message whose 7-bit string encoding should be set
+  */
+ private void guess7BitEncoding(MAPIMessage msg) {
+     Chunks mainChunks = msg.getMainChunks();
+     //sanity check
+     if (mainChunks == null) {
+         return;
+     }
+
+     Map<MAPIProperty, List<PropertyValue>> props = mainChunks.getProperties();
+     if (props != null) {
+         // First choice is a codepage property
+         for (MAPIProperty prop : new MAPIProperty[]{
+                 MAPIProperty.MESSAGE_CODEPAGE,
+                 MAPIProperty.INTERNET_CPID
+         }) {
+             List<PropertyValue> val = props.get(prop);
+             if (val == null || val.isEmpty()) {
+                 continue;
+             }
+             // Guard the cast: a property stored with an unexpected type
+             // must not abort the parse with a ClassCastException.
+             PropertyValue first = val.get(0);
+             if (!(first instanceof PropertyValue.LongPropertyValue)) {
+                 continue;
+             }
+             int codepage = ((PropertyValue.LongPropertyValue) first).getValue();
+             String encoding = null;
+             try {
+                 encoding = CodePageUtil.codepageToEncoding(codepage, true);
+             } catch (UnsupportedEncodingException e) {
+                 // Unknown codepage: encoding stays null, which
+                 // tryToSet7BitEncoding rejects, so the next candidate is tried.
+             }
+             if (tryToSet7BitEncoding(msg, encoding)) {
+                 return;
+             }
+         }
+     }
+
+     // Second choice is a charset on a content type header
+     try {
+         String[] headers = msg.getHeaders();
+         if (headers != null && headers.length > 0) {
+             for (String header : headers) {
+                 // Compare the header name case-insensitively, in line
+                 // with the CASE_INSENSITIVE pattern applied below.
+                 if (header == null
+                         || !header.regionMatches(true, 0, "Content-Type", 0,
+                                 "Content-Type".length())) {
+                     continue;
+                 }
+                 Matcher m = CONTENT_TYPE_CHARSET_PATTERN.matcher(header);
+                 // find() rather than matches(): parameters that follow the
+                 // charset (e.g. "; format=flowed") must not hide it.
+                 if (m.find()) {
+                     // Found it! Tell all the string chunks.
+                     // Trim so stray whitespace does not make the name illegal.
+                     String charset = m.group(1).trim();
+                     if (tryToSet7BitEncoding(msg, charset)) {
+                         return;
+                     }
+                 }
+             }
+         }
+     } catch (ChunkNotFoundException e) {
+         // No header chunk in this message; fall through to the next source.
+     }
+
+     // Nothing suitable in the headers, try HTML
+     // TODO: do we need to replicate this in Tika? If we wind up
+     // parsing the html version of the email, this is duplicative??
+     // Or do we need to reset the header strings based on the html
+     // meta header if there is no other information?
+     try {
+         String html = msg.getHtmlBody();
+         if (html != null && html.length() > 0) {
+             Charset charset = null;
+             try {
+                 charset = detector.detect(new ByteArrayInputStream(
+                         html.getBytes(IOUtils.UTF_8)), EMPTY_METADATA);
+             } catch (IOException e) {
+                 // Detection failed; charset stays null and is skipped below.
+             }
+             if (charset != null && tryToSet7BitEncoding(msg, charset.name())) {
+                 return;
+             }
+         }
+     } catch (ChunkNotFoundException e) {
+         // No html body chunk in this message; fall through to the last resort.
+     }
+
+     //absolute last resort, try charset detector
+     StringChunk text = mainChunks.textBodyChunk;
+     if (text != null) {
+         // Named so that it does not shadow the HtmlEncodingDetector field.
+         CharsetDetector bodyDetector = new CharsetDetector();
+         bodyDetector.setText(text.getRawValue());
+         CharsetMatch match = bodyDetector.detect();
+         // 35 is the confidence threshold the previous inline detection used.
+         if (match != null && match.getConfidence() > 35) {
+             tryToSet7BitEncoding(msg, match.getName());
+         }
+     }
+ }
+
+ /**
+  * Applies the named charset as the message's 7-bit string encoding,
+  * provided the name is usable.
+  * <p>A <code>null</code> name and the name "utf-8" (compared ignoring
+  * case) are always rejected. Any other name is applied only if
+  * {@link Charset#isSupported(String)} reports it as supported.</p>
+  *
+  * @param msg the message to update
+  * @param charsetName candidate charset name, may be <code>null</code>
+  * @return <code>true</code> if the encoding was set on the message,
+  *         <code>false</code> if the name was rejected, illegal or
+  *         unsupported
+  */
+ private boolean tryToSet7BitEncoding(MAPIMessage msg, String charsetName) {
+     // The constant-first comparison is null-safe, so one guard covers
+     // both the null and the utf-8 rejection.
+     if (charsetName == null || "utf-8".equalsIgnoreCase(charsetName)) {
+         return false;
+     }
+
+     boolean applied = false;
+     try {
+         if (Charset.isSupported(charsetName)) {
+             msg.set7BitEncoding(charsetName);
+             applied = true;
+         }
+     } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
+         // Deliberately ignored: an illegal or unsupported name just
+         // means this candidate is not applied.
+     }
+     return applied;
+ }
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1691962&r1=1691961&r2=1691962&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Mon Jul 20 15:27:32 2015
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.apache.tika.TikaTest.assertContains;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
@@ -30,6 +29,7 @@ import java.io.StringWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
@@ -42,7 +42,7 @@ import org.xml.sax.ContentHandler;
/**
* Test case for parsing Outlook files.
*/
-public class OutlookParserTest {
+public class OutlookParserTest extends TikaTest {
@Test
public void testOutlookParsing() throws Exception {
@@ -181,6 +181,10 @@ public class OutlookParserTest {
// Make sure we don't have nested html docs
assertEquals(2, content.split("<body>").length);
assertEquals(2, content.split("<\\/body>").length);
+
+ // Make sure that the Chinese actually came through
+ assertContains("\u5F35\u6BD3\u502B",
metadata.get(TikaCoreProperties.CREATOR));
+ assertContains("\u9673\u60E0\u73CD", content);
}
@Test