This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4268 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 53d3f48a8c7c9123ef33193cef24b2af7f465539 Author: tallison <[email protected]> AuthorDate: Fri Jun 7 09:46:48 2024 -0400 TIKA-4268 -- improve embedded resource paths for email and generally --- .../apache/tika/metadata/TikaCoreProperties.java | 32 +++++++++++++++++-- .../apache/tika/parser/RecursiveParserWrapper.java | 8 ++--- .../tika/sax/RecursiveParserWrapperHandler.java | 37 +++++++++++++++++++++- .../tika/parser/mail/MailContentHandler.java | 8 ++--- .../apache/tika/parser/mail/RFC822ParserTest.java | 4 +++ .../parser/microsoft/AbstractPOIFSExtractor.java | 36 +++++++++++++++++++++ .../tika/parser/microsoft/OutlookExtractor.java | 2 +- .../parser/microsoft/pst/OutlookPSTParser.java | 2 +- .../parser/microsoft/pst/PSTMailItemParser.java | 5 ++- .../tika/parser/microsoft/OutlookParserTest.java | 7 ++++ .../parser/microsoft/pst/OutlookPSTParserTest.java | 4 ++- 11 files changed, 127 insertions(+), 18 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index effa4a667..3d7d34d4e 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -53,14 +53,40 @@ public interface TikaCoreProperties { /** * This tracks the embedded file paths based on the name of embedded files - * where available. There is a small risk that there may be path collisions - * and that these paths may not be unique within a file. - * + * where available. + * <p/> + * This field should be treated with great care and should NOT + * be used for creating a directory structure to write out attachments + * because: there may be path collisions or illegal characters or other mayhem. + * <p/> * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}. */ Property EMBEDDED_RESOURCE_PATH = Property.internalText(TIKA_META_PREFIX + "embedded_resource_path"); + + /** + * This is calculated in {@link org.apache.tika.sax.RecursiveParserWrapperHandler}. + * It differs from {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} in that + * it is calculated at the end of the full parse of a file. {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} + * is calculated during the parse, and, for some parsers, an embedded file's name isn't known until + * after its child files have been parsed. + * <p/> + * Note that the unknown file count may differ between {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} + * because there should be fewer unknown files when this is calculated. More simply, + * there is no connection between "embedded-1" in this field and "embedded-1" in + * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}. + * <p/> + * This field should be treated with great care and should NOT + * be used for creating a directory structure to write out attachments + * because: there may be path collisions or illegal characters or other mayhem. + * <p/> + * + * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}. + */ + Property FINAL_EMBEDDED_RESOURCE_PATH = + Property.internalText(TIKA_META_PREFIX + "final_embedded_resource_path"); + /** * This tracks the embedded file paths based on the embedded file's * {@link TikaCoreProperties#EMBEDDED_ID}. diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 629b289ae..4e4f72dfa 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -187,7 +187,7 @@ public class RecursiveParserWrapper extends ParserDecorator { } } - private String getResourceName(Metadata metadata, ParserState state) { + public static String getResourceName(Metadata metadata, AtomicInteger counter) { String objectName = ""; if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) { objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); @@ -196,7 +196,7 @@ public class RecursiveParserWrapper extends ParserDecorator { } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) { objectName = "version-number-" + metadata.get(TikaCoreProperties.VERSION_NUMBER); } else { - objectName = "embedded-" + (++state.unknownCount); + objectName = "embedded-" + counter.incrementAndGet(); } //make sure that there isn't any path info in the objectName //some parsers can return paths, not just file names @@ -234,7 +234,7 @@ public class RecursiveParserWrapper extends ParserDecorator { return; } // Work out what this thing is - String objectName = getResourceName(metadata, parserState); + String objectName = getResourceName(metadata, parserState.unknownCount); String objectLocation = this.location + objectName; metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation); @@ -319,7 +319,7 @@ public class RecursiveParserWrapper extends ParserDecorator { */ private static class ParserState { private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler; - private int unknownCount = 0; + private AtomicInteger unknownCount = new AtomicInteger(0); private int embeddedCount = 0;//this is effectively 1-indexed private ParserState(AbstractRecursiveParserWrapperHandler handler) { this.recursiveParserWrapperHandler = handler; diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java index 7ad6f8b25..8ac7277aa 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java @@ -16,8 +16,11 @@ */ package org.apache.tika.sax; +import java.util.HashMap; import java.util.LinkedList; import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -28,6 +31,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; +import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.utils.ParserUtils; /** @@ -123,10 +127,41 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe } catch (TikaException e) { throw new SAXException(e); } - if (metadata.size() > 0) { metadataList.add(0, ParserUtils.cloneMetadata(metadata)); } + writeFinalEmbeddedPaths(); + } + + private void writeFinalEmbeddedPaths() { + //for some file types, the file's "name" is not known before + //their attachments are parsed. This goes through the id paths + //and regenerates the path for the "final embedded resource path" + Map<String, String> idToName = new HashMap<>(); + AtomicInteger unknownCount = new AtomicInteger(0); + for (Metadata metadata : metadataList) { + String id = metadata.get(TikaCoreProperties.EMBEDDED_ID); + if (id == null) { + continue; + } + String name = RecursiveParserWrapper.getResourceName(metadata, unknownCount); + idToName.put(id, name); + } + for (Metadata metadata : metadataList) { + String idPath = metadata.get(TikaCoreProperties.EMBEDDED_ID_PATH); + if (idPath == null) { + continue; + } + if (idPath.startsWith("/")) { + idPath = idPath.substring(1); + } + String[] ids = idPath.split("/"); + StringBuilder sb = new StringBuilder(); + for (String id : ids) { + sb.append("/").append(idToName.get(id)); + } + metadata.set(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH, sb.toString()); + } } /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index d75bf2991..95edcb9ff 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -353,10 +353,10 @@ class MailContentHandler implements ContentHandler { metadata.add(TikaCoreProperties.CREATOR, from); } } else if (fieldname.equalsIgnoreCase("Subject")) { - metadata.set(TikaCoreProperties.TITLE, - ((UnstructuredField) parsedField).getValue()); - metadata.set(TikaCoreProperties.SUBJECT, - ((UnstructuredField) parsedField).getValue()); + String txt = ((UnstructuredField) parsedField).getValue(); + metadata.set(TikaCoreProperties.TITLE, txt); + metadata.set(TikaCoreProperties.SUBJECT, txt); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, txt + ".eml"); } else if (fieldname.equalsIgnoreCase("To")) { processAddressList(parsedField, "To:", Metadata.MESSAGE_TO); } else if (fieldname.equalsIgnoreCase("CC")) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java index f558a7ffe..1abf88e52 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java @@ -405,6 +405,10 @@ public class RFC822ParserTest extends TikaTest { assertEquals(null, metadataList.get(1).get(Metadata.CONTENT_DISPOSITION)); assertEquals("attachment; filename=\"testPNG.png\"", metadataList.get(2).get(Metadata.CONTENT_DISPOSITION)); + assertEquals("/Test Attachment Email.eml/embedded-1", + metadataList.get(1).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH)); + assertEquals("/Test Attachment Email.eml/testPNG.png", + metadataList.get(2).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH)); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index bbebde63d..b42c0f588 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -18,6 +18,8 @@ package org.apache.tika.parser.microsoft; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.poi.hpsf.ClassID; @@ -37,6 +39,7 @@ import org.apache.tika.detect.Detector; import org.apache.tika.detect.zip.DefaultZipContainerDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -201,6 +204,21 @@ abstract class AbstractPOIFSExtractor { handleOLENative(dir, type, rName, metadata, xhtml, outputHtml); } else if (type == POIFSDocumentType.COMP_OBJ) { handleCompObj(dir, type, rName, metadata, xhtml, outputHtml); + } else if (type == POIFSDocumentType.OUTLOOK) { + //for Outlook try to use the title first so that we don't wind up with __substg1.0_37... + //if that doesn't exist, backoff to rName + //add the suffix + metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); + String name = tryToGetMsgTitle(dir, rName); + if (! StringUtils.isBlank(name)) { + if (StringUtils.isBlank(type.getExtension())) { + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + } else { + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, + name + '.' + type.getExtension()); + } + } + parseEmbedded(dir, xhtml, metadata, outputHtml); } else { metadata.set(Metadata.CONTENT_TYPE, type.getType().toString()); if (! StringUtils.isBlank(rName)) { @@ -380,4 +398,22 @@ abstract class AbstractPOIFSExtractor { embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, outputHtml); } } + + + public static String tryToGetMsgTitle(DirectoryEntry node, String defaultVal) { + + for (String entryName : new String[] {"__substg1.0_0037001F", "__substg1.0_0E1D001F", "__substg1.0_0070001F"} ) { + try { + Entry entry = node.getEntry(entryName); + if (entry instanceof DocumentEntry) { + try (InputStream is = new BoundedInputStream(1000, new DocumentInputStream((DocumentEntry) entry))) { + return org.apache.commons.io.IOUtils.toString(is, StandardCharsets.UTF_16LE); + } + } + } catch (IOException e) { + //do nothing + } + } + return defaultVal; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 58b74a54d..2453b4dc4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -283,7 +283,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { null, null, xhtml, true); } if (attachment.getAttachmentDirectory() != null) { - handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), + handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), filename, xhtml, true); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java index ded254489..8cfb938c9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java @@ -116,7 +116,7 @@ public class OutlookPSTParser implements Parser { metadata.set(PST.PST_FOLDER_PATH, folderPath); try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { tis.setOpenContainer(pstMail); - metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getSubject() + ".msg"); embeddedExtractor.parseEmbedded(tis, handler, metadata, true); } pstMail = (PSTMessage) pstFolder.getNextChild(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index f0fbd9f68..f8f412764 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -130,7 +130,7 @@ public class PSTMailItemParser implements Parser { } private void extractMetadata(PSTMessage pstMail, Metadata metadata) { - metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId()); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getSubject() + ".msg"); metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId()); metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name()); metadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId()); @@ -220,14 +220,13 @@ public class PSTMailItemParser implements Parser { TikaException, SAXException { PSTMessage attachedEmail = attachment.getEmbeddedPSTMessage(); - attachment.getAttachMethod(); //check for whether this is a binary attachment or an embedded pst msg if (attachedEmail != null) { try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { tis.setOpenContainer(attachedEmail); Metadata attachMetadata = new Metadata(); attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); - attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, attachedEmail.getInternetMessageId()); + attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, attachedEmail.getSubject() + ".msg"); attachMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name()); embeddedExtractor.parseEmbedded(tis, xhtml, attachMetadata, true); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index 5b8a7192c..ffd4c0e5d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -224,6 +224,13 @@ public class OutlookParserTest extends TikaTest { assertEquals(2, content.split("<\\/body>").length); } + @Test + public void testEmbeddedPath() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("testMSG_att_msg.msg"); + assertEquals("/Test Attachment.msg", metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + assertEquals("/smbprn.00009008.KdcPjl.pdf", metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); + } + @Test public void testOutlookHTMLfromRTF() throws Exception { Metadata metadata = new Metadata(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index c65a52758..8807b4782 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -65,6 +65,8 @@ public class OutlookPSTParserTest extends TikaTest { @Test public void testExtendedMetadata() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("testPST.pst"); + assertEquals(10, metadataList.size()); + Metadata m1 = metadataList.get(1); assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME)); assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR)); @@ -98,7 +100,7 @@ public class OutlookPSTParserTest extends TikaTest { assertEquals("[email protected]", m6.get(Message.MESSAGE_FROM_EMAIL)); Metadata m7 = metadataList.get(7); - assertEquals("/<[email protected]>/<[email protected]>/attachment.docx", + assertEquals("/ First email.msg/First email.msg/attachment.docx", m7.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)); assertEquals("/7/8/9", m7.get(TikaCoreProperties.EMBEDDED_ID_PATH)); }
