This is an automated email from the ASF dual-hosted git repository.
lfcnassif pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 560e91a TIKA-2456: fix detection of emails inside mbox
new 9e6a91c Merge branch 'master' of https://github.com/apache/tika.git
560e91a is described below
commit 560e91a176ca5ff1adfc3ff1c1f63e32ec4e928a
Author: lfcnassif <[email protected]>
AuthorDate: Thu Aug 31 13:31:01 2017 -0300
TIKA-2456: fix detection of emails inside mbox
---
CHANGES.txt | 2 ++
.../org/apache/tika/parser/mbox/MboxParser.java | 1 +
.../apache/tika/parser/mbox/MboxParserTest.java | 15 +++++++++++++
.../test/resources/test-documents/single_mail.mbox | 25 ++++++++++++++++++++++
4 files changed, 43 insertions(+)
diff --git a/CHANGES.txt b/CHANGES.txt
index 9759a5e..b4e0e35 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.17 - ???
+ * Fix detection of emails extracted from mbox (TIKA-2456)
+
* Add OverrideDetector and allow PSTParser to specify body content type
as text or html -- to avoid incorrect auto-detection of
rfc/mbox, etc. (TIKA-2454)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 11c8d4a..4aa1d67 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -103,6 +103,7 @@ public class MboxParser extends AbstractParser {
Queue<String> multiline = new LinkedList<String>();
mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+ mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, "message/rfc822");
curLine = reader.readLine();
if (curLine == null) {
break;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index 94c4e70..22b1d9a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -152,4 +152,19 @@ public class MboxParserTest {
assertContains("When a Mapper completes", handler.toString());
}
+
+ @Test
+ public void testOverrideDetector() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, new AutoDetectParser());
+
+ try (InputStream stream = getStream("/test-documents/single_mail.mbox")) {
+ mboxParser.parse(stream, handler, metadata, context);
+ }
+
+ Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
+ assertEquals("message/rfc822", firstMail.get(Metadata.CONTENT_TYPE));
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/single_mail.mbox b/tika-parsers/src/test/resources/test-documents/single_mail.mbox
new file mode 100644
index 0000000..753ed95
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/single_mail.mbox
@@ -0,0 +1,25 @@
+From core-user-return-14700-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org Mon Jun 01 04:28:28 2009
+Comments: comments before header to confuse detection
+Content-Type: text/html; charset=utf-8
+Content-Transfer-Encoding: 8bit
+
+<html>
+ <head>
+ <meta http-equiv="content-type" content="text/html; charset=utf-8">
+ </head>
+ <body text="#000000" bgcolor="#FFFFFF">
+ <p>Então talvez resolva fazendo uma imagem da partição lógica após
+ bootar o sistema do HD.</p>
+ <p>Daqui a pouco vai aparecer o equivalente da Micrsoft: <a
+ class="moz-txt-link-freetext"
+href="https://technet.microsoft.com/en-us/library/hh831739%28v=ws.11%29.aspx">https://technet.microsoft.com/en-us/library/hh831739(v=ws.11).aspx</a></p>
+ <p>Junto com o ReFS:
+ <a class="moz-txt-link-freetext"
+href="https://docs.microsoft.com/en-us/windows-server/storage/refs/refs-overview">https://docs.microsoft.com/en-us/windows-server/storage/refs/refs-overview</a></p>
+ <p>abs<br>
+ </p>
+ <pre class="moz-signature" cols="72">Luís Filipe da Cruz Nassif</pre>
+ <pre class="moz-signature" cols="72">--
+Luís Filipe da Cruz Nassif</pre>
+ </body>
+</html>
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].