This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4696-improve-inline-tagging in repository https://gitbox.apache.org/repos/asf/tika.git
commit e16183bd737844dcdd3b76860b07a9acc352affa Author: tallison <[email protected]> AuthorDate: Mon Mar 23 08:19:03 2026 -0400 improve tagging of inline images --- .../main/java/org/apache/tika/metadata/MAPI.java | 14 +++ .../tika/parser/microsoft/OutlookExtractor.java | 134 +++++++++++++++++++-- .../tika/parser/microsoft/OutlookParserTest.java | 43 +++++++ 3 files changed, 179 insertions(+), 12 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java index 613c3d3d9d..c8f81a980d 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/MAPI.java @@ -77,4 +77,18 @@ public interface MAPI { Property ATTACH_MIME = Property.internalText(PREFIX_MAPI_ATTACH_META + "mime"); Property ATTACH_LANGUAGE = Property.internalText(PREFIX_MAPI_ATTACH_META + "language"); + /** + * PidTagAttachFlags (0x3714) — indicates which body formats might reference this attachment. + * Bit 1 (0x1) = ATT_INVISIBLE_IN_HTML + * Bit 2 (0x2) = ATT_INVISIBLE_IN_RTF + * Bit 3 (0x4) = ATT_RENDERED_IN_BODY + */ + Property ATTACH_FLAGS = Property.internalInteger(PREFIX_MAPI_ATTACH_META + "flags"); + + /** + * PidTagAttachmentHidden (0x7FFE) — indicates whether this attachment is hidden from the end + * user. Inline images typically have this set to true. 
+ */ + Property ATTACH_HIDDEN = Property.internalBoolean(PREFIX_MAPI_ATTACH_META + "hidden"); + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 552c52889c..eb8fca4f47 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -20,8 +20,11 @@ import static java.nio.charset.StandardCharsets.UTF_8; import java.io.BufferedReader; import java.io.IOException; +import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; @@ -56,7 +59,10 @@ import org.apache.poi.hsmf.datatypes.RecipientChunks; import org.apache.poi.hsmf.datatypes.StringChunk; import org.apache.poi.hsmf.datatypes.Types; import org.apache.poi.hsmf.exceptions.ChunkNotFoundException; +import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; +import org.apache.poi.poifs.filesystem.DocumentEntry; +import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.util.CodePageUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -173,6 +179,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { private static Pattern HEADER_KEY_PAT = Pattern.compile("\\A([\\x21-\\x39\\x3B-\\x7E]+):(.*?)\\Z"); + private 
final DirectoryNode root; private final MAPIMessage msg; private final ParseContext parseContext; private final boolean extractAllAlternatives; @@ -181,6 +188,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { public OutlookExtractor(DirectoryNode root, Metadata metadata, ParseContext context) throws TikaException { super(context, metadata); + this.root = root; this.parseContext = context; this.extractAllAlternatives = context.get(OfficeParserConfig.class).isExtractAllAlternativesFromMSG(); @@ -317,18 +325,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { private void updateAttachmentMetadata(AttachmentChunks attachment, Metadata metadata, Set<String> contentIdNames) { - StringChunk contentIdChunk = attachment.getAttachContentId(); - if (contentIdChunk != null) { - String contentId = contentIdChunk.getValue(); - if (! StringUtils.isBlank(contentId)) { - contentId = contentId.trim(); - if (contentIdNames.contains(contentId)) { - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY, - TikaCoreProperties.EmbeddedResourceType.INLINE.name()); - } - metadata.set(MAPI.ATTACH_CONTENT_ID, contentId); - } - } + // Extract string-based metadata from POI's named chunk getters addStringChunkToMetadata(MAPI.ATTACH_LONG_PATH_NAME, attachment.getAttachLongPathName(), metadata); addStringChunkToMetadata(MAPI.ATTACH_LONG_FILE_NAME, attachment.getAttachLongFileName(), metadata); addStringChunkToMetadata(MAPI.ATTACH_FILE_NAME, attachment.getAttachFileName(), metadata); @@ -337,6 +334,119 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { addStringChunkToMetadata(MAPI.ATTACH_EXTENSION, attachment.getAttachExtension(), metadata); addStringChunkToMetadata(MAPI.ATTACH_MIME, attachment.getAttachMimeTag(), metadata); addStringChunkToMetadata(MAPI.ATTACH_LANGUAGE, attachment.getAttachLanguage(), metadata); + + // Extract fixed properties from the attachment's __properties_version1.0 stream + // POI's AttachmentChunks doesn't parse this 
stream, so we read it directly. + Map<Integer, Long> attachProps = readAttachmentProperties(attachment.getPOIFSName()); + Long attachFlags = attachProps.get(PID_TAG_ATTACH_FLAGS); + if (attachFlags != null) { + metadata.set(MAPI.ATTACH_FLAGS, attachFlags.intValue()); + } + Long attachHidden = attachProps.get(PID_TAG_ATTACHMENT_HIDDEN); + if (attachHidden != null) { + metadata.set(MAPI.ATTACH_HIDDEN, attachHidden.intValue() != 0); + } + + // Determine inline vs attachment + String contentId = null; + StringChunk contentIdChunk = attachment.getAttachContentId(); + if (contentIdChunk != null) { + String rawCid = contentIdChunk.getValue(); + if (!StringUtils.isBlank(rawCid)) { + contentId = rawCid.trim(); + metadata.set(MAPI.ATTACH_CONTENT_ID, contentId); + } + } + + if (contentId != null && contentIdNames.contains(contentId)) { + // Layer 1: CID referenced in the message body — high confidence inline + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY, + TikaCoreProperties.EmbeddedResourceType.INLINE.name()); + } else if (contentId != null + && attachFlags != null + && (attachFlags & ATT_RENDERED_IN_BODY) != 0 + && isInlineableMimeType(metadata.get(MAPI.ATTACH_MIME))) { + // Layer 2: MAPI says rendered in body + image MIME type — the CID regex + // missed it (e.g. encapsulated RTF with stripped img tags) + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY, + TikaCoreProperties.EmbeddedResourceType.INLINE.name()); + } + } + + /** + * Returns true for MIME types that are safe to label as INLINE. + * We gate on this to avoid marking PDFs, DOCX, etc. as inline — downstream + * consumers use INLINE to decide what to index separately. 
+ */ + private static boolean isInlineableMimeType(String mimeType) { + if (StringUtils.isBlank(mimeType)) { + return false; + } + String lower = mimeType.toLowerCase(Locale.ROOT).trim(); + return lower.startsWith("image/"); + } + + // PidTagAttachFlags (0x3714) — bit flags indicating which body formats reference this + private static final int PID_TAG_ATTACH_FLAGS = 0x3714; + // Bit 3 (0x4) = ATT_RENDERED_IN_BODY: this attachment is referenced by the body + private static final int ATT_RENDERED_IN_BODY = 0x4; + // PidTagAttachmentHidden (0x7FFE) — boolean, true if hidden from end user (inline images) + private static final int PID_TAG_ATTACHMENT_HIDDEN = 0x7FFE; + + /** + * Read fixed MAPI properties from the __properties_version1.0 stream inside an + * attachment storage. POI's {@link AttachmentChunks} does not parse this stream. + * + * <p>The stream format is: 8-byte header, followed by 16-byte property entries. + * Each entry: 2 bytes property type, 2 bytes property ID, 4 bytes flags, + * 8 bytes value (inline for fixed-size types).</p> + * + * @param poifsName the OLE2 directory name for this attachment + * (e.g. 
"__attach_version1.0_#00000000") + * @return map of property ID to value for fixed-size integer/boolean properties + */ + private Map<Integer, Long> readAttachmentProperties(String poifsName) { + Map<Integer, Long> result = new HashMap<>(); + try { + DirectoryEntry attachDir = (DirectoryEntry) root.getEntry(poifsName); + DocumentEntry propsEntry = + (DocumentEntry) attachDir.getEntry("__properties_version1.0"); + byte[] data; + try (InputStream dis = new DocumentInputStream(propsEntry)) { + data = dis.readAllBytes(); + } + if (data.length < 8) { + return result; + } + ByteBuffer buf = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN); + int offset = 8; // skip 8-byte header + while (offset + 16 <= data.length) { + int propType = buf.getShort(offset) & 0xFFFF; + int propId = buf.getShort(offset + 2) & 0xFFFF; + long value; + switch (propType) { + case 0x0003: // PtypInteger32 + value = buf.getInt(offset + 8) & 0xFFFFFFFFL; + result.put(propId, value); + break; + case 0x000B: // PtypBoolean + value = buf.getShort(offset + 8) & 0xFFFF; + result.put(propId, value); + break; + case 0x0014: // PtypInteger64 + value = buf.getLong(offset + 8); + result.put(propId, value); + break; + default: + // skip variable-length, binary, time and other types + break; + } + offset += 16; + } + } catch (Exception e) { + LOGGER.debug("Could not read attachment properties for {}", poifsName, e); + } + return result; } private void addStringChunkToMetadata(Property property, StringChunk stringChunk, Metadata metadata) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 20b010e7b7..eb92465dbe 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -407,4 +407,47 @@ public class OutlookParserTest extends TikaTest { assertContains("annuaires\t \n" + " Synchronisation", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT)); } + @Test + public void testAttachFlagsExtracted() throws Exception { + // test-outlook2003.msg has 11 JPEG attachments with PidTagAttachFlags=4 + // (ATT_RENDERED_IN_BODY) but no Content-ID + List<Metadata> metadataList = getRecursiveMetadata("test-outlook2003.msg"); + // first entry is the message itself, rest are attachments + assertTrue(metadataList.size() > 1, "expected attachments"); + for (int i = 1; i < metadataList.size(); i++) { + Metadata m = metadataList.get(i); + assertEquals("4", m.get(MAPI.ATTACH_FLAGS), + "attachment " + i + " should have flags=4"); + } + } + + @Test + public void testRegularAttachmentsNotMarkedInline() throws Exception { + // testMSG_att_doc.msg has regular document attachments with flags=0 + // and no Content-ID — they must NOT be marked INLINE + List<Metadata> metadataList = getRecursiveMetadata("testMSG_att_doc.msg"); + assertTrue(metadataList.size() > 1, "expected attachments"); + for (int i = 1; i < metadataList.size(); i++) { + Metadata m = metadataList.get(i); + String resourceType = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY); + assertFalse( + TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(resourceType), + "regular attachment " + i + " should not be INLINE"); + } + } + + @Test + public void testImageWithFlagsButNoCidNotInline() throws Exception { + // test-outlook2003.msg has image attachments with ATT_RENDERED_IN_BODY + // but NO Content-ID. 
Layer 2 requires CID, so these should NOT be INLINE. + List<Metadata> metadataList = getRecursiveMetadata("test-outlook2003.msg"); + for (int i = 1; i < metadataList.size(); i++) { + Metadata m = metadataList.get(i); + String resourceType = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE_KEY); + assertFalse( + TikaCoreProperties.EmbeddedResourceType.INLINE.name().equals(resourceType), + "image attachment " + i + " without CID should not be INLINE"); + } + } + }
