This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4391 in repository https://gitbox.apache.org/repos/asf/tika.git
commit beb9a581630ec8c623f17ccf87d8f22dc436b908 Author: tallison <[email protected]> AuthorDate: Wed May 7 17:19:42 2025 -0400 TIKA-4391 -- identify "inline" images within .msg files. --- .../main/java/org/apache/tika/metadata/MAPI.java | 2 + .../java/org/apache/tika/metadata/RTFMetadata.java | 3 + .../tika/parser/microsoft/OutlookExtractor.java | 136 ++++++++++++++++----- .../tika/parser/microsoft/rtf/TextExtractor.java | 3 + .../tika/parser/microsoft/OutlookParserTest.java | 8 ++ 5 files changed, 121 insertions(+), 31 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java index 5f4ef12ae..d52b67351 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -65,6 +65,8 @@ public interface MAPI { Property PRIORTY = Property.internalInteger(PREFIX_MAPI_META + "priority"); Property IS_FLAGGED = Property.internalBoolean(PREFIX_MAPI_META + "is-flagged"); + Property BODY_TYPES_PROCESSED = Property.internalTextBag(PREFIX_MAPI_META + "body-types-processed"); + Property ATTACH_LONG_PATH_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-path-name"); Property ATTACH_LONG_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "long-file-name"); Property ATTACH_FILE_NAME = Property.internalText(PREFIX_MAPI_ATTACH_META + "file-name"); diff --git a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java index e4572e38f..22842391f 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java @@ -44,4 +44,7 @@ public interface RTFMetadata { Property EMB_ITEM = Property.internalText( PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_item"); + Property CONTAINS_ENCAPSULATED_HTML = Property.internalBoolean( + PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contains_encapsulated_html"); + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 30e1ca14a..0e219dac6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -30,11 +30,13 @@ import java.util.Calendar; import java.util.Collections; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -68,6 +70,7 @@ import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.RTFMetadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; @@ -90,6 +93,9 @@ import org.apache.tika.utils.StringUtils; */ public class OutlookExtractor extends AbstractPOIFSExtractor { static Logger LOGGER = LoggerFactory.getLogger(OutlookExtractor.class); + public enum BODY_TYPES_PROCESSED { + HTML, RTF, TEXT; + } private static final Metadata EMPTY_METADATA = new Metadata(); private static final MAPIProperty[] LITERAL_TIME_MAPI_PROPERTIES = new MAPIProperty[] { @@ -116,6 +122,10 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { private static final Map<String, String> MESSAGE_CLASSES = new LinkedHashMap<>(); + private static final Pattern IMG_TAG_PATTERN = Pattern.compile("<img ([^>]{0,1000})>"); + private static final Pattern SRC_ATTR_PATTERN = Pattern.compile("src=\"cid:([^\"]{0,1000})\""); + private static final Pattern TEXT_CID_PATTERN = Pattern.compile("\\[cid:([^]]{0,1000})]"); + static { for (MAPIProperty property : LITERAL_TIME_MAPI_PROPERTIES) { String name = property.mapiProperty.toLowerCase(Locale.ROOT); @@ -128,8 +138,6 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { loadMessageClasses(); } - - private static void loadMessageClasses() { String fName = "/org/apache/tika/parser/microsoft/msg/mapi_message_classes.properties"; try (BufferedReader r = new BufferedReader( @@ -275,41 +283,54 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { textChunk = chunk; } } - handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); + Set<String> contentIdNames = new HashSet<>(); + handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames); // Process the attachments for (AttachmentChunks attachment : msg.getAttachmentFiles()) { - Metadata metadata = new Metadata(); - updateAttachmentMetadata(attachment, metadata); + Metadata attachMetadata = new Metadata(); + updateAttachmentMetadata(attachment, attachMetadata, contentIdNames); String filename = null; - if (!StringUtils.isBlank(metadata.get(MAPI.ATTACH_LONG_FILE_NAME))) { - filename = metadata.get(MAPI.ATTACH_LONG_FILE_NAME); - } else if (!StringUtils.isBlank(metadata.get(MAPI.ATTACH_DISPLAY_NAME))) { - filename = metadata.get(MAPI.ATTACH_DISPLAY_NAME); - } else if (!StringUtils.isBlank(metadata.get(MAPI.ATTACH_FILE_NAME))) { - filename = metadata.get(MAPI.ATTACH_FILE_NAME); + if (!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_LONG_FILE_NAME))) { + filename = attachMetadata.get(MAPI.ATTACH_LONG_FILE_NAME); + } else if (!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_DISPLAY_NAME))) { + filename = attachMetadata.get(MAPI.ATTACH_DISPLAY_NAME); + } else if (!StringUtils.isBlank(attachMetadata.get(MAPI.ATTACH_FILE_NAME))) { + filename = attachMetadata.get(MAPI.ATTACH_FILE_NAME); } //this is allowed to be null; - String mimeType = metadata.get(MAPI.ATTACH_MIME); + String mimeType = attachMetadata.get(MAPI.ATTACH_MIME); if (attachment.getAttachData() != null) { handleEmbeddedResource(TikaInputStream.get(attachment .getAttachData() - .getValue()), metadata, filename, null, null, mimeType, xhtml, true); + .getValue()), attachMetadata, filename, null, null, mimeType, xhtml, true); } if (attachment.getAttachmentDirectory() != null) { handleEmbeddedOfficeDoc(attachment .getAttachmentDirectory() - .getDirectory(), metadata, filename, xhtml, true); + .getDirectory(), attachMetadata, filename, xhtml, true); } } } - private void updateAttachmentMetadata(AttachmentChunks attachment, Metadata metadata) { + private void updateAttachmentMetadata(AttachmentChunks attachment, Metadata metadata, + Set<String> contentIdNames) { + StringChunk contentIdChunk = attachment.getAttachContentId(); + if (contentIdChunk != null) { + String contentId = contentIdChunk.getValue(); + if (! StringUtils.isBlank(contentId)) { + contentId = contentId.trim(); + if (contentIdNames.contains(contentId)) { + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY, + TikaCoreProperties.EmbeddedResourceType.INLINE.name()); + } + metadata.set(MAPI.ATTACH_CONTENT_ID, contentId); + } + } addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME, attachment.getAttachLongPathName(), metadata); addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME, attachment.getAttachLongFileName(), metadata); addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME, attachment.getAttachFileName(), metadata); - addStringChunkToMetadata(MAPI.ATTACH_CONTENT_ID, attachment.getAttachContentId(), metadata); addStringChunkToMetadata(MAPI.ATTACH_CONTENT_LOCATION, attachment.getAttachContentLocation(), metadata); addStringChunkToMetadata(MAPI.ATTACH_DISPLAY_NAME, attachment.getAttachDisplayName(), metadata); addStringChunkToMetadata(MAPI.ATTACH_EXTENSION, attachment.getAttachExtension(), metadata); @@ -441,20 +462,20 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { } private void handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, - XHTMLContentHandler xhtml) + XHTMLContentHandler xhtml, Set<String> contentIdNames) throws SAXException, IOException, TikaException { if (extractAllAlternatives) { - extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml); + extractAllAlternatives(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames); return; } - _handleBodyChunks(htmlChunk, rtfChunk, textChunk, xhtml); + _handleBestBodyChunk(htmlChunk, rtfChunk, textChunk, xhtml, contentIdNames); } - private void _handleBodyChunks(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, - XHTMLContentHandler xhtml) + private void _handleBestBodyChunk(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, + XHTMLContentHandler xhtml, Set<String> contentIdNames) throws SAXException, IOException, TikaException { - boolean doneBody = false; + //try html, then rtf, then text if (htmlChunk != null) { byte[] data = null; if (htmlChunk instanceof ByteChunk) { @@ -468,13 +489,16 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { if (htmlParser == null) { htmlParser = new JSoupParser(); } + Metadata htmlMetadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(data)) { - htmlParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext); + htmlParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), htmlMetadata, parseContext); } - doneBody = true; + extractContentIdNamesFromHtml(data, htmlMetadata, contentIdNames); + parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.HTML.name()); + return; } } - if (rtfChunk != null && (extractAllAlternatives || !doneBody)) { + if (rtfChunk != null) { ByteChunk chunk = (ByteChunk) rtfChunk; //avoid buffer underflow TIKA-2530 //TODO -- would be good to find an example triggering file and @@ -488,26 +512,64 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { if (rtfParser == null) { rtfParser = new RTFParser(); } + Metadata rtfMetadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(rtf.getData())) { - rtfParser.parseInline(tis, xhtml, new Metadata(), parseContext); + rtfParser.parseInline(tis, xhtml, rtfMetadata, parseContext); } - doneBody = true; + extractContentIdNamesFromRtf(rtf.getData(), rtfMetadata, contentIdNames); + parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.RTF.name()); + parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, + rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); + return; } } - if (textChunk != null && (extractAllAlternatives || !doneBody)) { - xhtml.element("p", ((StringChunk) textChunk).getValue()); + if (textChunk != null) { + String s = ((StringChunk) textChunk).getValue(); + xhtml.element("p", s); + extractContentIdNamesFromText(s, contentIdNames); + parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.TEXT.name()); + } + + } + + private void extractContentIdNamesFromRtf(byte[] data, Metadata metadata, Set<String> contentIdNames) { + //for now, hope that there's encapsulated html + //TODO: check for encapsulated html. If it doesn't exist, handle RTF specifically + extractContentIdNamesFromHtml(data, metadata, contentIdNames); + } + + private void extractContentIdNamesFromHtml(byte[] data, Metadata metadata, Set<String> contentIdNames) { + String html = new String(data, UTF_8); + Matcher imageMatcher = IMG_TAG_PATTERN.matcher(html); + Matcher cidSrcMatcher = SRC_ATTR_PATTERN.matcher(""); + while (imageMatcher.find()) { + String imgElementContents = imageMatcher.group(1); + cidSrcMatcher.reset(imgElementContents); + while (cidSrcMatcher.find()) { + String cid = cidSrcMatcher.group(1); + cid = cid.trim(); + contentIdNames.add(cid); + } } + } + private void extractContentIdNamesFromText(String s, Set<String> contentIdNames) { + Matcher m = TEXT_CID_PATTERN.matcher(s); + while (m.find()) { + contentIdNames.add(m.group(1)); + } } private void extractAllAlternatives(Chunk htmlChunk, Chunk rtfChunk, Chunk textChunk, - XHTMLContentHandler xhtml) + XHTMLContentHandler xhtml, Set<String> contentIdNames) throws TikaException, SAXException, IOException { if (htmlChunk != null) { byte[] data = getValue(htmlChunk); if (data != null) { handleEmbeddedResource(TikaInputStream.get(data), "html-body", null, MediaType.TEXT_HTML.toString(), xhtml, true); + extractContentIdNamesFromHtml(data, new Metadata(), contentIdNames); + parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.HTML.name()); } } if (rtfChunk != null) { @@ -518,8 +580,16 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { byte[] data = rtf.getData(); if (data != null) { - handleEmbeddedResource(TikaInputStream.get(data), "rtf-body", null, + Metadata rtfMetadata = new Metadata(); + handleEmbeddedResource(TikaInputStream.get(data), rtfMetadata, + "rtf-body", null, null, "application/rtf", xhtml, true); + extractContentIdNamesFromRtf(data, rtfMetadata, contentIdNames); + //copy this info into the parent...what else should we copy? + parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.RTF.name()); + parentMetadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, + rtfMetadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); + } } if (textChunk != null) { @@ -530,6 +600,10 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { MediaType.TEXT_PLAIN.toString()); handleEmbeddedResource(TikaInputStream.get(data), chunkMetadata, null, "text-body", null, MediaType.TEXT_PLAIN.toString(), xhtml, true); + if (textChunk instanceof StringChunk) { + extractContentIdNamesFromText(((StringChunk) textChunk).getValue(), contentIdNames); + } + parentMetadata.add(MAPI.BODY_TYPES_PROCESSED, BODY_TYPES_PROCESSED.TEXT.name()); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java index 1fe173259..5612de219 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java @@ -50,6 +50,7 @@ import org.apache.tika.metadata.Office; import org.apache.tika.metadata.OfficeOpenXMLCore; import org.apache.tika.metadata.OfficeOpenXMLExtended; import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.RTFMetadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.utils.CharsetUtils; @@ -899,6 +900,8 @@ final class TextExtractor { hour = param; } else if (equals("min")) { minute = param; + } else if (equals("fromhtml") && param == 1) { + metadata.set(RTFMetadata.CONTAINS_ENCAPSULATED_HTML, true); } if (fontTableState == 1) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 6c532a84e..ba8a6c64e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -39,6 +39,7 @@ import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.RTFMetadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; @@ -240,6 +241,8 @@ public class OutlookParserTest extends TikaTest { metadata.get(MAPI.INTERNET_REFERENCES)); assertEquals("<c8508767c15dbf40a21693142739ea8d564d18f...@exvmbx018-1.exch018.msoutlookonline.net>", metadata.get(MAPI.IN_REPLY_TO_ID)); + + assertEquals("true", metadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); } @Test @@ -247,6 +250,7 @@ public class OutlookParserTest extends TikaTest { List<Metadata> metadataList = getRecursiveMetadata("testMSG_att_msg.msg"); assertEquals("/Test Attachment.msg", metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); assertEquals("/smbprn.00009008.KdcPjl.pdf", metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + assertEquals("true", metadataList.get(0).get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); } @Test @@ -289,6 +293,8 @@ public class OutlookParserTest extends TikaTest { // Make sure we don't have nested html docs assertEquals(2, content.split("<body>").length); assertEquals(2, content.split("<\\/body>").length); + + assertEquals("true", metadata.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); } @Test @@ -323,6 +329,8 @@ public class OutlookParserTest extends TikaTest { assertTrue(m.get("mapi:property:PidLidValidFlagStringProof").contains("2017-02-28T18:42")); assertEquals("0", m.get("mapi:property:PidLidAppointmentSequence")); assertEquals("false", m.get("mapi:property:PidLidRecurring")); + assertEquals("true", m.get(RTFMetadata.CONTAINS_ENCAPSULATED_HTML)); + } @Test
