This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4345 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 264d7812abf8d425fe68ef1dc770693abdaebe8f Author: tallison <[email protected]> AuthorDate: Wed Nov 13 09:55:30 2024 -0500 TIKA-4345 -- stop injecting headers into the body for msg files --- CHANGES.txt | 7 +++++ .../parser/microsoft/AbstractOfficeParser.java | 13 -------- .../tika/parser/microsoft/OfficeParserConfig.java | 15 --------- .../tika/parser/microsoft/OutlookExtractor.java | 36 ++-------------------- .../tika/parser/microsoft/OutlookParserTest.java | 33 ++++++-------------- 5 files changed, 18 insertions(+), 86 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 395f41b07..a5feb090f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,8 +1,15 @@ Release 4.0.0-BETA1 - ??? BREAKING CHANGES + * Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open + a ticket if you need this behavior across email formats. + + Release 3.1.0 - ?? + * Allow users to turn off the injection of some headers into the content stream of MSG + files (TIKA-4345). + * Add a wrapper for Google's magika detector (TIKA-4344). * Add support for MachO via Alexey Pelykh (TIKA-4309). diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java index ea5179552..a44073d4e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java @@ -181,17 +181,4 @@ public abstract class AbstractOfficeParser implements Parser { return defaultOfficeParserConfig.isIncludeHeadersAndFooters(); } - /** - * If set to <code>true</code>, this will write the to/from/cc into the body content - * - * @param val - */ - @Field - public void setWriteSelectHeadersInBody(boolean val) { - defaultOfficeParserConfig.setWriteSelectHeadersInBody(val); - } - - public boolean isWriteSelectHeadersInBody() { - return defaultOfficeParserConfig.isWriteSelectHeadersInBody(); - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java index af69eefa1..8e761efad 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java @@ -36,7 +36,6 @@ public class OfficeParserConfig implements Serializable { private boolean useSAXPptxExtractor = false; private boolean extractAllAlternativesFromMSG = false; - private boolean writeSelectHeadersInBody = false; private String dateOverrideFormat = null; private int maxOverride = 0;//ignore @@ -202,20 +201,6 @@ public class OfficeParserConfig implements Serializable { this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG; } - public boolean isWriteSelectHeadersInBody() { - return writeSelectHeadersInBody; - } - - /** - * If set to <code>true</code>, this will add to/from/cc into the - * body content. - * - * @param val - */ - public void setWriteSelectHeadersInBody(boolean val) { - this.writeSelectHeadersInBody = val; - } - public boolean isIncludeMissingRows() { return includeMissingRows; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index a73adbaf6..5a2dc996e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -227,8 +227,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } - writeSelectHeadersInBody(subject, from, msg, xhtml); - // Get the message body. Preference order is: html, rtf, text Chunk htmlChunk = null; Chunk rtfChunk = null; @@ -279,31 +277,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } } - private void writeSelectHeadersInBody(String subject, String from, MAPIMessage msg, XHTMLContentHandler xhtml) - throws SAXException, ChunkNotFoundException { - if (! officeParserConfig.isWriteSelectHeadersInBody()) { - return; - } - xhtml.element("h1", subject); - - // Output the from and to details in text, as you - // often want them in text form for searching - xhtml.startElement("dl"); - if (from != null) { - header(xhtml, "From", from); - } - header(xhtml, "To", msg.getDisplayTo()); - header(xhtml, "Cc", msg.getDisplayCC()); - header(xhtml, "Bcc", msg.getDisplayBCC()); - try { - header(xhtml, "Recipients", msg.getRecipientEmailAddress()); - } catch (ChunkNotFoundException e) { - //swallow - } - xhtml.endElement("dl"); - - } - private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException { @@ -312,13 +285,8 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml); return; } - if (officeParserConfig.isWriteSelectHeadersInBody()) { - xhtml.startElement("div", "class", "message-body"); - _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); - xhtml.endElement("div"); - } else { - _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); - } + _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); + } private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, XHTMLContentHandler xhtml) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 686a6657c..f10f4aa7c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -54,18 +54,12 @@ public class OutlookParserTest extends TikaTest { @Test public void testOutlookParsing() throws Exception { - //test default behavior - List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg", AUTO_DETECT_PARSER, - BasicContentHandlerFactory.HANDLER_TYPE.BODY); - assertNotContained("Microsoft Outlook Express 6", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); - - //test legacy behavior ContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); try (InputStream stream = getResourceAsStream("/test-documents/test-outlook.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Microsoft Outlook Express 6", metadata.get(TikaCoreProperties.TITLE)); @@ -90,9 +84,9 @@ public class OutlookParserTest extends TikaTest { assertEquals("2007-04-05T16:26:06Z", metadata.get(TikaCoreProperties.CREATED)); String content = handler.toString(); - assertContains("Microsoft Outlook Express 6", content); - assertContains("L'\u00C9quipe Microsoft Outlook Express", content); - assertContains("Nouvel utilisateur de Outlook Express", content); + assertNotContained("Microsoft Outlook Express 6", content); + assertNotContained("L'\u00C9quipe Microsoft Outlook Express", content); + assertNotContained("Nouvel utilisateur de Outlook Express", content); assertContains("Messagerie et groupes de discussion", content); } @@ -107,7 +101,7 @@ public class OutlookParserTest extends TikaTest { Metadata metadata = new Metadata(); try (InputStream stream = getResourceAsStream("/test-documents/testMSG.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } assertEquals("application/vnd.ms-outlook", metadata.get(Metadata.CONTENT_TYPE)); @@ -115,7 +109,6 @@ public class OutlookParserTest extends TikaTest { String content = handler.toString(); Pattern pattern = Pattern.compile("From"); Matcher matcher = pattern.matcher(content); - assertTrue(matcher.find()); assertFalse(matcher.find()); //test that last header is added @@ -185,13 +178,13 @@ public class OutlookParserTest extends TikaTest { handler.setResult(new StreamResult(sw)); try (InputStream stream = getResourceAsStream("/test-documents/testMSG_chinese.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } // As the HTML version should have been processed, ensure // we got some of the links String content = sw.toString(); - assertContains("<dd>[email protected]</dd>", content); + assertNotContained("<dd>[email protected]</dd>", content); assertContains("<p>Alfresco MSG format testing", content); assertContains("<li>1", content); assertContains("<li>2", content); @@ -259,13 +252,13 @@ public class OutlookParserTest extends TikaTest { handler.setResult(new StreamResult(sw)); try (InputStream stream = getResourceAsStream("/test-documents/test-outlook2003.msg")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, configureInjectHeaders()); + AUTO_DETECT_PARSER.parse(stream, handler, metadata, new ParseContext()); } // As the HTML version should have been processed, ensure // we got some of the links String content = sw.toString().replaceAll("[\\r\\n\\t]+", " ").replaceAll(" +", " "); - assertContains("<dd>New Outlook User</dd>", content); + assertNotContained("<dd>New Outlook User</dd>", content); assertContains("designed <i>to help you", content); assertContains( "<p> <a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", @@ -282,14 +275,6 @@ public class OutlookParserTest extends TikaTest { assertEquals(2, content.split("<\\/body>").length); } - private ParseContext configureInjectHeaders() { - ParseContext parseContext = new ParseContext(); - OfficeParserConfig officeParserConfig = new OfficeParserConfig(); - officeParserConfig.setWriteSelectHeadersInBody(true); - parseContext.set(OfficeParserConfig.class, officeParserConfig); - return parseContext; - } - @Test public void testMAPIMessageClasses() throws Exception {
