This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4345
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 375e59580270ebd0c0da7a41e43d7739925fede0
Author: tallison <[email protected]>
AuthorDate: Thu Nov 7 12:03:57 2024 -0500

    TIKA-4345 -- allow configurability for injecting headers into content in msg
---
 .../parser/microsoft/AbstractOfficeParser.java     | 14 +++++
 .../apache/tika/parser/microsoft/OfficeParser.java |  1 -
 .../tika/parser/microsoft/OfficeParserConfig.java  | 17 +++++-
 .../tika/parser/microsoft/OutlookExtractor.java    | 71 ++++++++++++----------
 .../tika/parser/microsoft/rtf/RTFParser.java       | 31 ++++++++--
 .../tika/parser/microsoft/rtf/TextExtractor.java   | 19 +++---
 .../tika/parser/microsoft/OutlookParserTest.java   | 39 ++++++++++--
 7 files changed, 141 insertions(+), 51 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index ec785f5d2..ea5179552 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -180,4 +180,18 @@ public abstract class AbstractOfficeParser implements Parser {
     public boolean isIncludeHeadersAndFooters() {
         return defaultOfficeParserConfig.isIncludeHeadersAndFooters();
     }
+
+    /**
+     * If set to <code>true</code>, this will write the to/from/cc into the body content
+     *
+     * @param val
+     */
+    @Field
+    public void setWriteSelectHeadersInBody(boolean val) {
+        defaultOfficeParserConfig.setWriteSelectHeadersInBody(val);
+    }
+
+    public boolean isWriteSelectHeadersInBody() {
+        return defaultOfficeParserConfig.isWriteSelectHeadersInBody();
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 21a771c86..8fe685686 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -248,7 +248,6 @@ public class OfficeParser extends AbstractOfficeParser {
                 break;
             case OUTLOOK:
                 OutlookExtractor extractor = new OutlookExtractor(root, 
metadata, context);
-
                 extractor.parse(xhtml);
                 break;
             case ENCRYPTED:
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 680b63c9e..af69eefa1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -34,8 +34,9 @@ public class OfficeParserConfig implements Serializable {
 
     private boolean useSAXDocxExtractor = false;
     private boolean useSAXPptxExtractor = false;
-    private boolean extractAllAlternativesFromMSG;
 
+    private boolean extractAllAlternativesFromMSG = false;
+    private boolean writeSelectHeadersInBody = false;
     private String dateOverrideFormat = null;
     private int maxOverride = 0;//ignore
 
@@ -201,6 +202,20 @@ public class OfficeParserConfig implements Serializable {
         this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
     }
 
+    public boolean isWriteSelectHeadersInBody() {
+        return writeSelectHeadersInBody;
+    }
+
+    /**
+     * If set to <code>true</code>, this will add to/from/cc into the
+     * body content.
+     *
+     * @param val
+     */
+    public void setWriteSelectHeadersInBody(boolean val) {
+        this.writeSelectHeadersInBody = val;
+    }
+
     public boolean isIncludeMissingRows() {
         return includeMissingRows;
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 451346745..a73adbaf6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -227,24 +227,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 }
             }
 
-
-            xhtml.element("h1", subject);
-
-            // Output the from and to details in text, as you
-            //  often want them in text form for searching
-            xhtml.startElement("dl");
-            if (from != null) {
-                header(xhtml, "From", from);
-            }
-            header(xhtml, "To", msg.getDisplayTo());
-            header(xhtml, "Cc", msg.getDisplayCC());
-            header(xhtml, "Bcc", msg.getDisplayBCC());
-            try {
-                header(xhtml, "Recipients", msg.getRecipientEmailAddress());
-            } catch (ChunkNotFoundException e) {
-                //swallow
-            }
-            xhtml.endElement("dl");
+            writeSelectHeadersInBody(subject, from, msg, xhtml);
 
             // Get the message body. Preference order is: html, rtf, text
             Chunk htmlChunk = null;
@@ -265,7 +248,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
 
             // Process the attachments
             for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
-                xhtml.startElement("div", "class", "attachment-entry");
 
                 String filename = null;
                 if (attachment.getAttachLongFileName() != null) {
@@ -273,9 +255,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 } else if (attachment.getAttachFileName() != null) {
                     filename = attachment.getAttachFileName().getValue();
                 }
-                if (filename != null && filename.length() > 0) {
-                    xhtml.element("h1", filename);
-                }
 
                 if (attachment.getAttachData() != null) {
                     handleEmbeddedResource(
@@ -286,8 +265,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                     
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), 
filename,
                             xhtml, true);
                 }
-
-                xhtml.endElement("div");
             }
         } catch (ChunkNotFoundException e) {
             throw new TikaException("POI MAPIMessage broken - didn't return 
null on missing chunk",
@@ -302,6 +279,31 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    private void writeSelectHeadersInBody(String subject, String from, 
MAPIMessage msg, XHTMLContentHandler xhtml)
+            throws SAXException, ChunkNotFoundException {
+        if (! officeParserConfig.isWriteSelectHeadersInBody()) {
+            return;
+        }
+        xhtml.element("h1", subject);
+
+        // Output the from and to details in text, as you
+        //  often want them in text form for searching
+        xhtml.startElement("dl");
+        if (from != null) {
+            header(xhtml, "From", from);
+        }
+        header(xhtml, "To", msg.getDisplayTo());
+        header(xhtml, "Cc", msg.getDisplayCC());
+        header(xhtml, "Bcc", msg.getDisplayBCC());
+        try {
+            header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+        } catch (ChunkNotFoundException e) {
+            //swallow
+        }
+        xhtml.endElement("dl");
+
+    }
+
     private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk 
textChunk,
                                   XHTMLContentHandler xhtml)
             throws SAXException, IOException, TikaException {
@@ -310,9 +312,18 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
             extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml);
             return;
         }
-
+        if (officeParserConfig.isWriteSelectHeadersInBody()) {
+            xhtml.startElement("div", "class", "message-body");
+            _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+            xhtml.endElement("div");
+        } else {
+            _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+        }
+    }
+    private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk 
textChunk,
+                                  XHTMLContentHandler xhtml)
+            throws SAXException, IOException, TikaException {
         boolean doneBody = false;
-        xhtml.startElement("div", "class", "message-body");
         if (htmlChunk != null) {
             byte[] data = null;
             if (htmlChunk instanceof ByteChunk) {
@@ -341,21 +352,19 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                 MAPIRtfAttribute rtf =
                         new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED, 
Types.BINARY.getId(),
                                 chunk.getValue());
-                Parser rtfParser = EmbeddedDocumentUtil
+                RTFParser rtfParser = (RTFParser) EmbeddedDocumentUtil
                         .tryToFindExistingLeafParser(RTFParser.class, 
parseContext);
                 if (rtfParser == null) {
                     rtfParser = new RTFParser();
                 }
-                
rtfParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(),
-                        new EmbeddedContentHandler(new 
BodyContentHandler(xhtml)), new Metadata(),
-                        parseContext);
+                
rtfParser.parseInline(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(),
+                        xhtml, new Metadata(), parseContext);
                 doneBody = true;
             }
         }
         if (textChunk != null && (extractAllAlternatives || !doneBody)) {
             xhtml.element("p", ((StringChunk) textChunk).getValue());
         }
-        xhtml.endElement("div");
 
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java
index faa808fa1..8b17575a7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java
@@ -64,19 +64,38 @@ public class RTFParser implements Parser {
                       ParseContext context) throws IOException, SAXException, 
TikaException {
         metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
         TaggedInputStream tagged = new TaggedInputStream(stream);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
         try {
-            XHTMLContentHandler xhtmlHandler = new 
XHTMLContentHandler(handler, metadata);
-            RTFEmbObjHandler embObjHandler =
-                    new RTFEmbObjHandler(xhtmlHandler, metadata, context, 
getMemoryLimitInKb());
-            final TextExtractor ert = new TextExtractor(xhtmlHandler, 
metadata, embObjHandler);
-            ert.setIgnoreListMarkup(ignoreListMarkup);
-            ert.extract(stream);
+            parseInline(stream, xhtml, metadata, context);
         } catch (IOException e) {
             tagged.throwIfCauseOf(e);
             throw new TikaException("Error parsing an RTF document", e);
+        } finally {
+            xhtml.endDocument();
         }
     }
 
+    /**
+     * This bypasses wrapping the handler for inline parsing (in at least the OutlookExtractor).
+     *
+     * @param is
+     * @param handler
+     * @param metadata
+     * @param context
+     * @throws TikaException
+     * @throws IOException
+     * @throws SAXException
+     */
+    public void parseInline(InputStream is, ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws TikaException, IOException, SAXException {
+        RTFEmbObjHandler embObjHandler =
+                new RTFEmbObjHandler(handler, metadata, context, 
getMemoryLimitInKb());
+        final TextExtractor ert = new TextExtractor(handler, metadata, 
embObjHandler);
+        ert.setIgnoreListMarkup(ignoreListMarkup);
+        ert.extract(is);
+    }
+
     public int getMemoryLimitInKb() {
         //there's a race condition here, but it shouldn't matter.
         if (USE_STATIC) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 28ca76299..83abb1ae6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -17,6 +17,8 @@
 
 package org.apache.tika.parser.microsoft.rtf;
 
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
@@ -36,7 +38,9 @@ import java.util.Stack;
 import java.util.TimeZone;
 
 import org.apache.commons.io.IOUtils;
+import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -47,7 +51,6 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.CharsetUtils;
 
 /* Tokenizes and performs a "shallow" parse of the RTF
@@ -256,7 +259,7 @@ final class TextExtractor {
     // close the group, we restore it
     private final LinkedList<GroupState> groupStates = new LinkedList<>();
     private final StringBuilder pendingBuffer = new StringBuilder();
-    private final XHTMLContentHandler out;
+    private final ContentHandler out;
     private final Metadata metadata;
     private final RTFEmbObjHandler embObjHandler;
     // How many next ansi chars we should skip; this
@@ -330,7 +333,7 @@ final class TextExtractor {
     //to defend against DoS with memory consumption
     private int maxStackSize = 1000;
 
-    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+    public TextExtractor(ContentHandler out, Metadata metadata,
                          RTFEmbObjHandler embObjHandler) {
         this.metadata = metadata;
         this.out = out;
@@ -464,7 +467,6 @@ final class TextExtractor {
     }
 
     private void extract(PushbackInputStream in) throws IOException, 
SAXException, TikaException {
-        out.startDocument();
 
         while (true) {
             final int b = in.read();
@@ -503,7 +505,6 @@ final class TextExtractor {
         while (paragraphStack.size() > 0) {
             end(paragraphStack.pop());
         }
-        out.endDocument();
     }
 
     private void parseControlToken(PushbackInputStream in)
@@ -1084,11 +1085,11 @@ final class TextExtractor {
     }
 
     private void end(String tag) throws IOException, SAXException, 
TikaException {
-        out.endElement(tag);
+        out.endElement(XHTML, tag, tag);
     }
 
     private void start(String tag) throws IOException, SAXException, 
TikaException {
-        out.startElement(tag);
+        out.startElement(XHTML, tag, tag, new AttributesImpl());
     }
 
     // Handle non-parameter control word:
@@ -1357,7 +1358,9 @@ final class TextExtractor {
         } else if (equals("fldrslt") && fieldState == 2) {
             assert pendingURL != null;
             lazyStartParagraph();
-            out.startElement("a", "href", pendingURL);
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute(XHTML, "href", "href", "CDATA", pendingURL);
+            out.startElement("", "a", "a", attrs);
             pendingURL = null;
             fieldState = 3;
             groupState.ignore = false;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index ffd4c0e5d..686a6657c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -43,6 +43,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 
 /**
@@ -52,11 +53,19 @@ public class OutlookParserTest extends TikaTest {
 
     @Test
     public void testOutlookParsing() throws Exception {
+
+        //test default behavior
+        List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", 
AUTO_DETECT_PARSER,
+                BasicContentHandlerFactory.HANDLER_TYPE.BODY);
+        assertNotContained("Microsoft Outlook Express 6", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+
+        //test legacy behavior
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/test-outlook.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
         }
         assertEquals("application/vnd.ms-outlook", 
metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("Microsoft Outlook Express 6", 
metadata.get(TikaCoreProperties.TITLE));
@@ -98,7 +107,7 @@ public class OutlookParserTest extends TikaTest {
         Metadata metadata = new Metadata();
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/testMSG.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
         }
 
         assertEquals("application/vnd.ms-outlook", 
metadata.get(Metadata.CONTENT_TYPE));
@@ -176,7 +185,7 @@ public class OutlookParserTest extends TikaTest {
         handler.setResult(new StreamResult(sw));
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
         }
 
         // As the HTML version should have been processed, ensure
@@ -233,6 +242,12 @@ public class OutlookParserTest extends TikaTest {
 
     @Test
     public void testOutlookHTMLfromRTF() throws Exception {
+
+        //test default behavior
+        List<Metadata> metadataList = 
getRecursiveMetadata("test-outlook2003.msg");
+        assertNotContained("<dd>New Outlook User</dd>", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+        //test legacy behavior with the configuration set
         Metadata metadata = new Metadata();
 
         // Check the HTML version
@@ -244,7 +259,7 @@ public class OutlookParserTest extends TikaTest {
         handler.setResult(new StreamResult(sw));
 
         try (InputStream stream = 
getResourceAsStream("/test-documents/test-outlook2003.msg")) {
-            AUTO_DETECT_PARSER.parse(stream, handler, metadata, new 
ParseContext());
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, 
configureInjectHeaders());
         }
 
         // As the HTML version should have been processed, ensure
@@ -267,6 +282,14 @@ public class OutlookParserTest extends TikaTest {
         assertEquals(2, content.split("<\\/body>").length);
     }
 
+    private ParseContext configureInjectHeaders() {
+        ParseContext parseContext = new ParseContext();
+        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+        officeParserConfig.setWriteSelectHeadersInBody(true);
+        parseContext.set(OfficeParserConfig.class, officeParserConfig);
+        return parseContext;
+    }
+
     @Test
     public void testMAPIMessageClasses() throws Exception {
 
@@ -319,4 +342,12 @@ public class OutlookParserTest extends TikaTest {
         }
 
     }
+
+    @Test
+    public void testNewlinesInRTFBody() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", 
AUTO_DETECT_PARSER,
+                BasicContentHandlerFactory.HANDLER_TYPE.BODY);
+        assertContains("annuaires\t \n" + " Synchronisation", 
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
 }

Reply via email to