This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 560e91a  TIKA-2456: fix detection of emails inside mbox
     new 9e6a91c  Merge branch 'master' of https://github.com/apache/tika.git
560e91a is described below

commit 560e91a176ca5ff1adfc3ff1c1f63e32ec4e928a
Author: lfcnassif <[email protected]>
AuthorDate: Thu Aug 31 13:31:01 2017 -0300

    TIKA-2456: fix detection of emails inside mbox
---
 CHANGES.txt                                        |  2 ++
 .../org/apache/tika/parser/mbox/MboxParser.java    |  1 +
 .../apache/tika/parser/mbox/MboxParserTest.java    | 15 +++++++++++++
 .../test/resources/test-documents/single_mail.mbox | 25 ++++++++++++++++++++++
 4 files changed, 43 insertions(+)

diff --git a/CHANGES.txt b/CHANGES.txt
index 9759a5e..b4e0e35 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.17 - ???
 
+  * Fix detection of emails extracted from mbox (TIKA-2456)
+  
   * Add OverrideDetector and allow PSTParser to specify body content type
     as text or html -- to avoid incorrect auto-detection of
     rfc/mbox, etc. (TIKA-2454)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
index 11c8d4a..4aa1d67 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
@@ -103,6 +103,7 @@ public class MboxParser extends AbstractParser {
                     Queue<String> multiline = new LinkedList<String>();
                     mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
                     mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
+                    mailMetadata.set(TikaCoreProperties.CONTENT_TYPE_OVERRIDE, "message/rfc822");
                     curLine = reader.readLine();
                     if (curLine == null) {
                         break;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index 94c4e70..22b1d9a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -152,4 +152,19 @@ public class MboxParserTest {
 
         assertContains("When a Mapper completes", handler.toString());
     }
+    
+    @Test
+    public void testOverrideDetector() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, new AutoDetectParser());
+
+        try (InputStream stream = getStream("/test-documents/single_mail.mbox")) {
+            mboxParser.parse(stream, handler, metadata, context);
+        }
+        
+        Metadata firstMail = mboxParser.getTrackingMetadata().get(0);
+        assertEquals("message/rfc822", firstMail.get(Metadata.CONTENT_TYPE));
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/single_mail.mbox b/tika-parsers/src/test/resources/test-documents/single_mail.mbox
new file mode 100644
index 0000000..753ed95
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/single_mail.mbox
@@ -0,0 +1,25 @@
+From core-user-return-14700-apmail-hadoop-core-user-archive=hadoop.apache....@hadoop.apache.org Mon Jun 01 04:28:28 2009
+Comments: comments before header to confuse detection
+Content-Type: text/html; charset=utf-8
+Content-Transfer-Encoding: 8bit
+
+<html>
+  <head>
+    <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  </head>
+  <body text="#000000" bgcolor="#FFFFFF">
+    <p>Então talvez resolva fazendo uma imagem da partição lógica após
+      bootar o sistema do HD.</p>
+    <p>Daqui a pouco vai aparecer o equivalente da Micrsoft: <a
+        class="moz-txt-link-freetext"
+href="https://technet.microsoft.com/en-us/library/hh831739%28v=ws.11%29.aspx">https://technet.microsoft.com/en-us/library/hh831739(v=ws.11).aspx</a></p>
+    <p>Junto com o ReFS:
+      <a class="moz-txt-link-freetext"
+href="https://docs.microsoft.com/en-us/windows-server/storage/refs/refs-overview">https://docs.microsoft.com/en-us/windows-server/storage/refs/refs-overview</a></p>
+    <p>abs<br>
+    </p>
+    <pre class="moz-signature" cols="72">Luís Filipe da Cruz Nassif</pre>
+    <pre class="moz-signature" cols="72">-- 
+Luís Filipe da Cruz Nassif</pre>
+  </body>
+</html>

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to