tika git commit: TIKA-2122 : add all headers from MSG and RFC822 files, update changes file

2016-10-17 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master 8e819c3ca -> bf08ba94b


TIKA-2122 : add all headers from MSG and RFC822 files, update changes file


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/bf08ba94
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/bf08ba94
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/bf08ba94

Branch: refs/heads/master
Commit: bf08ba94b9c8f33759dc5bf91b02aaa1dde7670e
Parents: 8e819c3
Author: tballison 
Authored: Mon Oct 17 14:29:45 2016 -0400
Committer: tballison 
Committed: Mon Oct 17 14:29:45 2016 -0400

--
 CHANGES.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/bf08ba94/CHANGES.txt
--
diff --git a/CHANGES.txt b/CHANGES.txt
index 42c0a87..b5fd6f7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,8 @@
 Release 1.14 - ???
 
-  * Upgrade metadata extractor to 2.9.1 (TIKA-2113).
+  * Extract all headers from MSG/RFC822 (TIKA-2122).
+
+  * Upgrade metadata-extractor to 2.9.1 (TIKA-2113).
 
   * Extract PDF DocInfo metadata into separate keys to prevent
 overwriting by XMP metadata (TIKA-2057).



tika git commit: TIKA-2122 : add all headers from MSG and RFC822 files

2016-10-17 Thread tallison
Repository: tika
Updated Branches:
  refs/heads/master bfd1d9139 -> 8e819c3ca


TIKA-2122 : add all headers from MSG and RFC822 files


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/8e819c3c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/8e819c3c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/8e819c3c

Branch: refs/heads/master
Commit: 8e819c3caf3ff3b0492f600b4193d1b3ee74f51b
Parents: bfd1d91
Author: tballison 
Authored: Mon Oct 17 14:10:46 2016 -0400
Committer: tballison 
Committed: Mon Oct 17 14:10:46 2016 -0400

--
 .../java/org/apache/tika/metadata/Message.java  |  6 ++
 .../src/test/java/org/apache/tika/TikaTest.java |  8 ++
 .../tika/parser/mail/MailContentHandler.java |  5 ++
 .../tika/parser/microsoft/OutlookExtractor.java | 87 +++-
 .../tika/parser/mail/RFC822ParserTest.java  |  1 +
 .../parser/microsoft/OutlookParserTest.java | 15 
 6 files changed, 121 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-core/src/main/java/org/apache/tika/metadata/Message.java
--
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
index ffb9413..dad3952 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
@@ -16,10 +16,16 @@
  */
 package org.apache.tika.metadata;
 
+import org.apache.tika.Tika;
+
 /**
  * A collection of Message related property names.
  */
 public interface Message {
+String MESSAGE_PREFIX = "Message"+ Metadata.NAMESPACE_PREFIX_DELIMITER;
+
+String MESSAGE_RAW_HEADER_PREFIX = MESSAGE_PREFIX+"Raw-Header"+Metadata.NAMESPACE_PREFIX_DELIMITER;
+
 String MESSAGE_RECIPIENT_ADDRESS = "Message-Recipient-Address";
 
 String MESSAGE_FROM = "Message-From";

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-core/src/test/java/org/apache/tika/TikaTest.java
--
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 690db33..0bc5a83 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -296,4 +296,12 @@ public abstract class TikaTest {
 i++;
 }
 }
+
+public static void debug(Metadata metadata) {
+for (String n : metadata.names()) {
+for (String v : metadata.getValues(n)) {
+System.out.println(n + " : "+v);
+}
+}
+}
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
--
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 6a9bc1b..60170e6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -51,6 +51,7 @@ import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Message;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
@@ -238,6 +239,7 @@ class MailContentHandler implements ContentHandler {
 
 try {
 String fieldname = field.getName();
+
 ParsedField parsedField = LenientFieldParser.getParser().parse(
 field, DecodeMonitor.SILENT);
 if (fieldname.equalsIgnoreCase("From")) {
@@ -276,6 +278,9 @@ class MailContentHandler implements ContentHandler {
 date = tryOtherDateFormats(field.getBody());
 }
 metadata.set(TikaCoreProperties.CREATED, date);
+} else {
+metadata.add(Metadata.MESSAGE_RAW_HEADER_PREFIX+parsedField.getName(),
+field.getBody());
 }
 } catch (RuntimeException me) {
 if (strictParsing) {

http://git-wip-us.apache.org/repos/asf/tika/blob/8e819c3c/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
--
diff --git 
a/tika-parsers/src/main/java/org/apache/t