This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 386a56070 TIKA-4345 (#2037)
386a56070 is described below
commit 386a56070c255a6addf5f1965d18fb65d0a5bff2
Author: Tim Allison <[email protected]>
AuthorDate: Thu Nov 7 12:45:31 2024 -0500
TIKA-4345 (#2037)
* TIKA-4345 -- extract metadata before writing to pstmailitem body so that
more metadata is written to the xhtml
* TIKA-4345 -- allow configurability for injecting headers into content in
msg
---
.../parser/microsoft/AbstractOfficeParser.java | 14 +++++
.../apache/tika/parser/microsoft/OfficeParser.java | 1 -
.../tika/parser/microsoft/OfficeParserConfig.java | 17 +++++-
.../tika/parser/microsoft/OutlookExtractor.java | 71 ++++++++++++----------
.../parser/microsoft/pst/PSTMailItemParser.java | 4 +-
.../tika/parser/microsoft/rtf/RTFParser.java | 31 ++++++++--
.../tika/parser/microsoft/rtf/TextExtractor.java | 19 +++---
.../tika/parser/microsoft/OutlookParserTest.java | 39 ++++++++++--
.../parser/microsoft/pst/OutlookPSTParserTest.java | 7 ++-
9 files changed, 147 insertions(+), 56 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
index ec785f5d2..ea5179552 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractOfficeParser.java
@@ -180,4 +180,18 @@ public abstract class AbstractOfficeParser implements
Parser {
public boolean isIncludeHeadersAndFooters() {
return defaultOfficeParserConfig.isIncludeHeadersAndFooters();
}
+
+ /**
+ * If set to <code>true</code>, this will write the to/from/cc into the body content
+ *
+ * @param val
+ */
+ @Field
+ public void setWriteSelectHeadersInBody(boolean val) {
+ defaultOfficeParserConfig.setWriteSelectHeadersInBody(val);
+ }
+
+ public boolean isWriteSelectHeadersInBody() {
+ return defaultOfficeParserConfig.isWriteSelectHeadersInBody();
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 21a771c86..8fe685686 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -248,7 +248,6 @@ public class OfficeParser extends AbstractOfficeParser {
break;
case OUTLOOK:
OutlookExtractor extractor = new OutlookExtractor(root,
metadata, context);
-
extractor.parse(xhtml);
break;
case ENCRYPTED:
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index 680b63c9e..af69eefa1 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -34,8 +34,9 @@ public class OfficeParserConfig implements Serializable {
private boolean useSAXDocxExtractor = false;
private boolean useSAXPptxExtractor = false;
- private boolean extractAllAlternativesFromMSG;
+ private boolean extractAllAlternativesFromMSG = false;
+ private boolean writeSelectHeadersInBody = false;
private String dateOverrideFormat = null;
private int maxOverride = 0;//ignore
@@ -201,6 +202,20 @@ public class OfficeParserConfig implements Serializable {
this.extractAllAlternativesFromMSG = extractAllAlternativesFromMSG;
}
+ public boolean isWriteSelectHeadersInBody() {
+ return writeSelectHeadersInBody;
+ }
+
+ /**
+ * If set to <code>true</code>, this will add to/from/cc into the
+ * body content.
+ *
+ * @param val
+ */
+ public void setWriteSelectHeadersInBody(boolean val) {
+ this.writeSelectHeadersInBody = val;
+ }
+
public boolean isIncludeMissingRows() {
return includeMissingRows;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 451346745..a73adbaf6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -227,24 +227,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
-
- xhtml.element("h1", subject);
-
- // Output the from and to details in text, as you
- // often want them in text form for searching
- xhtml.startElement("dl");
- if (from != null) {
- header(xhtml, "From", from);
- }
- header(xhtml, "To", msg.getDisplayTo());
- header(xhtml, "Cc", msg.getDisplayCC());
- header(xhtml, "Bcc", msg.getDisplayBCC());
- try {
- header(xhtml, "Recipients", msg.getRecipientEmailAddress());
- } catch (ChunkNotFoundException e) {
- //swallow
- }
- xhtml.endElement("dl");
+ writeSelectHeadersInBody(subject, from, msg, xhtml);
// Get the message body. Preference order is: html, rtf, text
Chunk htmlChunk = null;
@@ -265,7 +248,6 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
// Process the attachments
for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
- xhtml.startElement("div", "class", "attachment-entry");
String filename = null;
if (attachment.getAttachLongFileName() != null) {
@@ -273,9 +255,6 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
} else if (attachment.getAttachFileName() != null) {
filename = attachment.getAttachFileName().getValue();
}
- if (filename != null && filename.length() > 0) {
- xhtml.element("h1", filename);
- }
if (attachment.getAttachData() != null) {
handleEmbeddedResource(
@@ -286,8 +265,6 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
filename,
xhtml, true);
}
-
- xhtml.endElement("div");
}
} catch (ChunkNotFoundException e) {
throw new TikaException("POI MAPIMessage broken - didn't return null on missing chunk",
@@ -302,6 +279,31 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
}
}
+ private void writeSelectHeadersInBody(String subject, String from,
MAPIMessage msg, XHTMLContentHandler xhtml)
+ throws SAXException, ChunkNotFoundException {
+ if (! officeParserConfig.isWriteSelectHeadersInBody()) {
+ return;
+ }
+ xhtml.element("h1", subject);
+
+ // Output the from and to details in text, as you
+ // often want them in text form for searching
+ xhtml.startElement("dl");
+ if (from != null) {
+ header(xhtml, "From", from);
+ }
+ header(xhtml, "To", msg.getDisplayTo());
+ header(xhtml, "Cc", msg.getDisplayCC());
+ header(xhtml, "Bcc", msg.getDisplayBCC());
+ try {
+ header(xhtml, "Recipients", msg.getRecipientEmailAddress());
+ } catch (ChunkNotFoundException e) {
+ //swallow
+ }
+ xhtml.endElement("dl");
+
+ }
+
private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
XHTMLContentHandler xhtml)
throws SAXException, IOException, TikaException {
@@ -310,9 +312,18 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml);
return;
}
-
+ if (officeParserConfig.isWriteSelectHeadersInBody()) {
+ xhtml.startElement("div", "class", "message-body");
+ _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+ xhtml.endElement("div");
+ } else {
+ _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml);
+ }
+ }
+ private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk
textChunk,
+ XHTMLContentHandler xhtml)
+ throws SAXException, IOException, TikaException {
boolean doneBody = false;
- xhtml.startElement("div", "class", "message-body");
if (htmlChunk != null) {
byte[] data = null;
if (htmlChunk instanceof ByteChunk) {
@@ -341,21 +352,19 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
MAPIRtfAttribute rtf =
new MAPIRtfAttribute(MAPIProperty.RTF_COMPRESSED,
Types.BINARY.getId(),
chunk.getValue());
- Parser rtfParser = EmbeddedDocumentUtil
+ RTFParser rtfParser = (RTFParser) EmbeddedDocumentUtil
.tryToFindExistingLeafParser(RTFParser.class,
parseContext);
if (rtfParser == null) {
rtfParser = new RTFParser();
}
-
rtfParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(),
- new EmbeddedContentHandler(new
BodyContentHandler(xhtml)), new Metadata(),
- parseContext);
+
rtfParser.parseInline(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(),
+ xhtml, new Metadata(), parseContext);
doneBody = true;
}
}
if (textChunk != null && (extractAllAlternatives || !doneBody)) {
xhtml.element("p", ((StringChunk) textChunk).getValue());
}
- xhtml.endElement("div");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index 15b4cf0fa..a87c6cb84 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -87,11 +87,11 @@ public class PSTMailItemParser implements Parser {
private void parseMailAndAttachments(PSTMessage pstMsg,
XHTMLContentHandler handler, Metadata metadata, ParseContext context,
EmbeddedDocumentExtractor
embeddedExtractor)
throws SAXException, IOException, TikaException {
+ extractMetadata(pstMsg, metadata);
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA",
pstMsg.getInternetMessageId());
handler.startElement("div", attributes);
- handler.element("h1", pstMsg.getSubject());
parseMailItem(pstMsg, handler, metadata, context);
parseMailAttachments(pstMsg, handler, metadata, context,
embeddedExtractor);
@@ -100,7 +100,7 @@ public class PSTMailItemParser implements Parser {
private void parseMailItem(PSTMessage pstMail, XHTMLContentHandler xhtml,
Metadata metadata, ParseContext context)
throws SAXException, IOException, TikaException {
- extractMetadata(pstMail, metadata);
+
//try the html first. It preserves logical paragraph markers
String htmlChunk = pstMail.getBodyHTML();
if (! StringUtils.isBlank(htmlChunk)) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java
index faa808fa1..8b17575a7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFParser.java
@@ -64,19 +64,38 @@ public class RTFParser implements Parser {
ParseContext context) throws IOException, SAXException,
TikaException {
metadata.set(Metadata.CONTENT_TYPE, "application/rtf");
TaggedInputStream tagged = new TaggedInputStream(stream);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
try {
- XHTMLContentHandler xhtmlHandler = new
XHTMLContentHandler(handler, metadata);
- RTFEmbObjHandler embObjHandler =
- new RTFEmbObjHandler(xhtmlHandler, metadata, context,
getMemoryLimitInKb());
- final TextExtractor ert = new TextExtractor(xhtmlHandler,
metadata, embObjHandler);
- ert.setIgnoreListMarkup(ignoreListMarkup);
- ert.extract(stream);
+ parseInline(stream, xhtml, metadata, context);
} catch (IOException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("Error parsing an RTF document", e);
+ } finally {
+ xhtml.endDocument();
}
}
+ /**
+ * This bypasses wrapping the handler for inline parsing (in at least the OutlookExtractor).
+ *
+ * @param is
+ * @param handler
+ * @param metadata
+ * @param context
+ * @throws TikaException
+ * @throws IOException
+ * @throws SAXException
+ */
+ public void parseInline(InputStream is, ContentHandler handler, Metadata
metadata, ParseContext context)
+ throws TikaException, IOException, SAXException {
+ RTFEmbObjHandler embObjHandler =
+ new RTFEmbObjHandler(handler, metadata, context,
getMemoryLimitInKb());
+ final TextExtractor ert = new TextExtractor(handler, metadata,
embObjHandler);
+ ert.setIgnoreListMarkup(ignoreListMarkup);
+ ert.extract(is);
+ }
+
public int getMemoryLimitInKb() {
//there's a race condition here, but it shouldn't matter.
if (USE_STATIC) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 28ca76299..83abb1ae6 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -17,6 +17,8 @@
package org.apache.tika.parser.microsoft.rtf;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
@@ -36,7 +38,9 @@ import java.util.Stack;
import java.util.TimeZone;
import org.apache.commons.io.IOUtils;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -47,7 +51,6 @@ import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.CharsetUtils;
/* Tokenizes and performs a "shallow" parse of the RTF
@@ -256,7 +259,7 @@ final class TextExtractor {
// close the group, we restore it
private final LinkedList<GroupState> groupStates = new LinkedList<>();
private final StringBuilder pendingBuffer = new StringBuilder();
- private final XHTMLContentHandler out;
+ private final ContentHandler out;
private final Metadata metadata;
private final RTFEmbObjHandler embObjHandler;
// How many next ansi chars we should skip; this
@@ -330,7 +333,7 @@ final class TextExtractor {
//to defend against DoS with memory consumption
private int maxStackSize = 1000;
- public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+ public TextExtractor(ContentHandler out, Metadata metadata,
RTFEmbObjHandler embObjHandler) {
this.metadata = metadata;
this.out = out;
@@ -464,7 +467,6 @@ final class TextExtractor {
}
private void extract(PushbackInputStream in) throws IOException,
SAXException, TikaException {
- out.startDocument();
while (true) {
final int b = in.read();
@@ -503,7 +505,6 @@ final class TextExtractor {
while (paragraphStack.size() > 0) {
end(paragraphStack.pop());
}
- out.endDocument();
}
private void parseControlToken(PushbackInputStream in)
@@ -1084,11 +1085,11 @@ final class TextExtractor {
}
private void end(String tag) throws IOException, SAXException,
TikaException {
- out.endElement(tag);
+ out.endElement(XHTML, tag, tag);
}
private void start(String tag) throws IOException, SAXException,
TikaException {
- out.startElement(tag);
+ out.startElement(XHTML, tag, tag, new AttributesImpl());
}
// Handle non-parameter control word:
@@ -1357,7 +1358,9 @@ final class TextExtractor {
} else if (equals("fldrslt") && fieldState == 2) {
assert pendingURL != null;
lazyStartParagraph();
- out.startElement("a", "href", pendingURL);
+ AttributesImpl attrs = new AttributesImpl();
+ attrs.addAttribute(XHTML, "href", "href", "CDATA", pendingURL);
+ out.startElement("", "a", "a", attrs);
pendingURL = null;
fieldState = 3;
groupState.ignore = false;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index ffd4c0e5d..686a6657c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -43,6 +43,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
/**
@@ -52,11 +53,19 @@ public class OutlookParserTest extends TikaTest {
@Test
public void testOutlookParsing() throws Exception {
+
+ //test default behavior
+ List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg",
AUTO_DETECT_PARSER,
+ BasicContentHandlerFactory.HANDLER_TYPE.BODY);
+ assertNotContained("Microsoft Outlook Express 6",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+
+ //test legacy behavior
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
try (InputStream stream =
getResourceAsStream("/test-documents/test-outlook.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, new
ParseContext());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata,
configureInjectHeaders());
}
assertEquals("application/vnd.ms-outlook",
metadata.get(Metadata.CONTENT_TYPE));
assertEquals("Microsoft Outlook Express 6",
metadata.get(TikaCoreProperties.TITLE));
@@ -98,7 +107,7 @@ public class OutlookParserTest extends TikaTest {
Metadata metadata = new Metadata();
try (InputStream stream =
getResourceAsStream("/test-documents/testMSG.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, new
ParseContext());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata,
configureInjectHeaders());
}
assertEquals("application/vnd.ms-outlook",
metadata.get(Metadata.CONTENT_TYPE));
@@ -176,7 +185,7 @@ public class OutlookParserTest extends TikaTest {
handler.setResult(new StreamResult(sw));
try (InputStream stream =
getResourceAsStream("/test-documents/testMSG_chinese.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, new
ParseContext());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata,
configureInjectHeaders());
}
// As the HTML version should have been processed, ensure
@@ -233,6 +242,12 @@ public class OutlookParserTest extends TikaTest {
@Test
public void testOutlookHTMLfromRTF() throws Exception {
+
+ //test default behavior
+ List<Metadata> metadataList =
getRecursiveMetadata("test-outlook2003.msg");
+ assertNotContained("<dd>New Outlook User</dd>",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+
+ //test legacy behavior with the configuration set
Metadata metadata = new Metadata();
// Check the HTML version
@@ -244,7 +259,7 @@ public class OutlookParserTest extends TikaTest {
handler.setResult(new StreamResult(sw));
try (InputStream stream =
getResourceAsStream("/test-documents/test-outlook2003.msg")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, new
ParseContext());
+ AUTO_DETECT_PARSER.parse(stream, handler, metadata,
configureInjectHeaders());
}
// As the HTML version should have been processed, ensure
@@ -267,6 +282,14 @@ public class OutlookParserTest extends TikaTest {
assertEquals(2, content.split("<\\/body>").length);
}
+ private ParseContext configureInjectHeaders() {
+ ParseContext parseContext = new ParseContext();
+ OfficeParserConfig officeParserConfig = new OfficeParserConfig();
+ officeParserConfig.setWriteSelectHeadersInBody(true);
+ parseContext.set(OfficeParserConfig.class, officeParserConfig);
+ return parseContext;
+ }
+
@Test
public void testMAPIMessageClasses() throws Exception {
@@ -319,4 +342,12 @@ public class OutlookParserTest extends TikaTest {
}
}
+
+ @Test
+ public void testNewlinesInRTFBody() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("test-outlook.msg",
AUTO_DETECT_PARSER,
+ BasicContentHandlerFactory.HANDLER_TYPE.BODY);
+ assertContains("annuaires\t \n" + " Synchronisation",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+ }
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index 8807b4782..6e9a6d6d1 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -51,10 +51,9 @@ public class OutlookPSTParserTest extends TikaTest {
assertTrue(output.contains("<meta name=\"Content-Type\"
content=\"application/vnd.ms-outlook-pst\""));
assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
- assertTrue(output.contains("<div class=\"embedded\"
id=\"<[email protected]>\">" + "<h1>Re: Feature
Generators</h1>"));
+ assertTrue(output.contains("<div class=\"embedded\"
id=\"<[email protected]>\">"));
assertTrue(output.contains(
- "<div class=\"embedded\"
id=\"<[email protected]" +
".bf1.yahoo.com>\"><h1>Re: init tokenizer fails: \"Bad type in " +
- "putfield/putstatic\"</h1>"));
+ "<div class=\"embedded\"
id=\"<[email protected]" +
".bf1.yahoo.com>\">"));
assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine
(pour la recherche)</h1>"));
@@ -79,6 +78,8 @@ public class OutlookPSTParserTest extends TikaTest {
assertEquals("[email protected]",
m1.get(Office.MAPI_FROM_REPRESENTING_EMAIL));
assertEquals("NOTE", m1.get(Office.MAPI_MESSAGE_CLASS));
assertEquals("/Début du fichier de données Outlook",
m1.get(PST.PST_FOLDER_PATH));
+ //test that subject is making it into the xhtml
+ assertContains("<meta name=\"dc:subject\" content=\"Re: Feature Generators\"", m1.get(TikaCoreProperties.TIKA_CONTENT));
Metadata m6 = metadataList.get(6);
assertEquals("Couchbase", m6.get(Message.MESSAGE_FROM_NAME));