This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new dd3e7ee40 Tika-4268 -- improve embedded resource path naming, esp for
msg and email-based formats (#1804)
dd3e7ee40 is described below
commit dd3e7ee40e5060408d0830c21715564a0ab3c805
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jun 7 13:51:41 2024 -0400
Tika-4268 -- improve embedded resource path naming, esp for msg and
email-based formats (#1804)
* TIKA-4268 -- improve embedded resource paths for email and generally
---
.../apache/tika/metadata/TikaCoreProperties.java | 32 +++++++++++++++++--
.../apache/tika/parser/RecursiveParserWrapper.java | 8 ++---
.../tika/sax/RecursiveParserWrapperHandler.java | 37 +++++++++++++++++++++-
.../tika/parser/mail/MailContentHandler.java | 8 ++---
.../apache/tika/parser/mail/RFC822ParserTest.java | 4 +++
.../parser/microsoft/AbstractPOIFSExtractor.java | 36 +++++++++++++++++++++
.../tika/parser/microsoft/OutlookExtractor.java | 2 +-
.../parser/microsoft/pst/OutlookPSTParser.java | 2 +-
.../parser/microsoft/pst/PSTMailItemParser.java | 5 ++-
.../tika/parser/microsoft/OutlookParserTest.java | 7 ++++
.../parser/microsoft/pst/OutlookPSTParserTest.java | 4 ++-
.../microsoft/POIContainerExtractionTest.java | 2 +-
12 files changed, 128 insertions(+), 19 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index effa4a667..3d7d34d4e 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -53,14 +53,40 @@ public interface TikaCoreProperties {
/**
* This tracks the embedded file paths based on the name of embedded files
- * where available. There is a small risk that there may be path
collisions
- * and that these paths may not be unique within a file.
- *
+ * where available.
+ * <p/>
+ * This field should be treated with great care and should NOT
+ * be used for creating a directory structure to write out attachments
+ * because: there may be path collisions or illegal characters or other
mayhem.
+ * <p/>
* For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
*/
Property EMBEDDED_RESOURCE_PATH =
Property.internalText(TIKA_META_PREFIX + "embedded_resource_path");
+
+ /**
+ * This is calculated in {@link
org.apache.tika.sax.RecursiveParserWrapperHandler}.
+ * It differs from {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} in
that
+ * it is calculated at the end of the full parse of a file. {@link
TikaCoreProperties#EMBEDDED_RESOURCE_PATH}
+ * is calculated during the parse, and, for some parsers, an embedded
file's name isn't known until
+ * after its child files have been parsed.
+ * <p/>
+ * Note that the unknown file count may differ between this field and {@link
TikaCoreProperties#EMBEDDED_RESOURCE_PATH}
+ * because there should be fewer unknown files when this is calculated.
More simply,
+ * there is no connection between "embedded-1" in this field and
"embedded-1" in
+ * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}.
+ * <p/>
+ * This field should be treated with great care and should NOT
+ * be used for creating a directory structure to write out attachments
+ * because: there may be path collisions or illegal characters or other
mayhem.
+ * <p/>
+ *
+ * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
+ */
+ Property FINAL_EMBEDDED_RESOURCE_PATH =
+ Property.internalText(TIKA_META_PREFIX +
"final_embedded_resource_path");
+
/**
* This tracks the embedded file paths based on the embedded file's
* {@link TikaCoreProperties#EMBEDDED_ID}.
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 629b289ae..4e4f72dfa 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -187,7 +187,7 @@ public class RecursiveParserWrapper extends ParserDecorator
{
}
}
- private String getResourceName(Metadata metadata, ParserState state) {
+ public static String getResourceName(Metadata metadata, AtomicInteger
counter) {
String objectName = "";
if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
@@ -196,7 +196,7 @@ public class RecursiveParserWrapper extends ParserDecorator
{
} else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
objectName = "version-number-" +
metadata.get(TikaCoreProperties.VERSION_NUMBER);
} else {
- objectName = "embedded-" + (++state.unknownCount);
+ objectName = "embedded-" + counter.incrementAndGet();
}
//make sure that there isn't any path info in the objectName
//some parsers can return paths, not just file names
@@ -234,7 +234,7 @@ public class RecursiveParserWrapper extends ParserDecorator
{
return;
}
// Work out what this thing is
- String objectName = getResourceName(metadata, parserState);
+ String objectName = getResourceName(metadata,
parserState.unknownCount);
String objectLocation = this.location + objectName;
metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH,
objectLocation);
@@ -319,7 +319,7 @@ public class RecursiveParserWrapper extends ParserDecorator
{
*/
private static class ParserState {
private final AbstractRecursiveParserWrapperHandler
recursiveParserWrapperHandler;
- private int unknownCount = 0;
+ private AtomicInteger unknownCount = new AtomicInteger(0);
private int embeddedCount = 0;//this is effectively 1-indexed
private ParserState(AbstractRecursiveParserWrapperHandler handler) {
this.recursiveParserWrapperHandler = handler;
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 7ad6f8b25..8ac7277aa 100644
---
a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++
b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -16,8 +16,11 @@
*/
package org.apache.tika.sax;
+import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -28,6 +31,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.utils.ParserUtils;
/**
@@ -123,10 +127,41 @@ public class RecursiveParserWrapperHandler extends
AbstractRecursiveParserWrappe
} catch (TikaException e) {
throw new SAXException(e);
}
-
if (metadata.size() > 0) {
metadataList.add(0, ParserUtils.cloneMetadata(metadata));
}
+ writeFinalEmbeddedPaths();
+ }
+
+ private void writeFinalEmbeddedPaths() {
+ //for some file types, a file's "name" is not known until after
+ //its attachments are parsed. This goes through the id paths
+ //and regenerates the path for the "final embedded resource path"
+ Map<String, String> idToName = new HashMap<>();
+ AtomicInteger unknownCount = new AtomicInteger(0);
+ for (Metadata metadata : metadataList) {
+ String id = metadata.get(TikaCoreProperties.EMBEDDED_ID);
+ if (id == null) {
+ continue;
+ }
+ String name = RecursiveParserWrapper.getResourceName(metadata,
unknownCount);
+ idToName.put(id, name);
+ }
+ for (Metadata metadata : metadataList) {
+ String idPath = metadata.get(TikaCoreProperties.EMBEDDED_ID_PATH);
+ if (idPath == null) {
+ continue;
+ }
+ if (idPath.startsWith("/")) {
+ idPath = idPath.substring(1);
+ }
+ String[] ids = idPath.split("/");
+ StringBuilder sb = new StringBuilder();
+ for (String id : ids) {
+ sb.append("/").append(idToName.get(id));
+ }
+ metadata.set(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH,
sb.toString());
+ }
}
/**
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index d75bf2991..95edcb9ff 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -353,10 +353,10 @@ class MailContentHandler implements ContentHandler {
metadata.add(TikaCoreProperties.CREATOR, from);
}
} else if (fieldname.equalsIgnoreCase("Subject")) {
- metadata.set(TikaCoreProperties.TITLE,
- ((UnstructuredField) parsedField).getValue());
- metadata.set(TikaCoreProperties.SUBJECT,
- ((UnstructuredField) parsedField).getValue());
+ String txt = ((UnstructuredField) parsedField).getValue();
+ metadata.set(TikaCoreProperties.TITLE, txt);
+ metadata.set(TikaCoreProperties.SUBJECT, txt);
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, txt +
".eml");
} else if (fieldname.equalsIgnoreCase("To")) {
processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
} else if (fieldname.equalsIgnoreCase("CC")) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index f558a7ffe..1abf88e52 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -405,6 +405,10 @@ public class RFC822ParserTest extends TikaTest {
assertEquals(null,
metadataList.get(1).get(Metadata.CONTENT_DISPOSITION));
assertEquals("attachment; filename=\"testPNG.png\"",
metadataList.get(2).get(Metadata.CONTENT_DISPOSITION));
+ assertEquals("/Test Attachment Email.eml/embedded-1",
+
metadataList.get(1).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH));
+ assertEquals("/Test Attachment Email.eml/testPNG.png",
+
metadataList.get(2).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH));
}
@Test
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index bbebde63d..b42c0f588 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -18,6 +18,8 @@ package org.apache.tika.parser.microsoft;
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.hpsf.ClassID;
@@ -37,6 +39,7 @@ import org.apache.tika.detect.Detector;
import org.apache.tika.detect.zip.DefaultZipContainerDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -201,6 +204,21 @@ abstract class AbstractPOIFSExtractor {
handleOLENative(dir, type, rName, metadata, xhtml, outputHtml);
} else if (type == POIFSDocumentType.COMP_OBJ) {
handleCompObj(dir, type, rName, metadata, xhtml, outputHtml);
+ } else if (type == POIFSDocumentType.OUTLOOK) {
+ //for Outlook try to use the title first so that we don't wind up
with __substg1.0_37...
+ //if that doesn't exist, back off to rName
+ //add the suffix
+ metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+ String name = tryToGetMsgTitle(dir, rName);
+ if (! StringUtils.isBlank(name)) {
+ if (StringUtils.isBlank(type.getExtension())) {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ } else {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+ name + '.' + type.getExtension());
+ }
+ }
+ parseEmbedded(dir, xhtml, metadata, outputHtml);
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
if (! StringUtils.isBlank(rName)) {
@@ -380,4 +398,22 @@ abstract class AbstractPOIFSExtractor {
embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata,
outputHtml);
}
}
+
+
+ public static String tryToGetMsgTitle(DirectoryEntry node, String
defaultVal) {
+
+ for (String entryName : new String[] {"__substg1.0_0037001F",
"__substg1.0_0E1D001F", "__substg1.0_0070001F"} ) {
+ try {
+ Entry entry = node.getEntry(entryName);
+ if (entry instanceof DocumentEntry) {
+ try (InputStream is = new BoundedInputStream(1000, new
DocumentInputStream((DocumentEntry) entry))) {
+ return org.apache.commons.io.IOUtils.toString(is,
StandardCharsets.UTF_16LE);
+ }
+ }
+ } catch (IOException e) {
+ //do nothing
+ }
+ }
+ return defaultVal;
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 58b74a54d..2453b4dc4 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -283,7 +283,7 @@ public class OutlookExtractor extends
AbstractPOIFSExtractor {
null, null, xhtml, true);
}
if (attachment.getAttachmentDirectory() != null) {
-
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
+
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
filename,
xhtml, true);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index ded254489..8cfb938c9 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -116,7 +116,7 @@ public class OutlookPSTParser implements Parser {
metadata.set(PST.PST_FOLDER_PATH, folderPath);
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
tis.setOpenContainer(pstMail);
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
pstMail.getInternetMessageId());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
pstMail.getSubject() + ".msg");
embeddedExtractor.parseEmbedded(tis, handler, metadata,
true);
}
pstMail = (PSTMessage) pstFolder.getNextChild();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index f0fbd9f68..f8f412764 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -130,7 +130,7 @@ public class PSTMailItemParser implements Parser {
}
private void extractMetadata(PSTMessage pstMail, Metadata metadata) {
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
pstMail.getInternetMessageId());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
pstMail.getSubject() + ".msg");
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID,
pstMail.getInternetMessageId());
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
metadata.set(TikaCoreProperties.IDENTIFIER,
pstMail.getInternetMessageId());
@@ -220,14 +220,13 @@ public class PSTMailItemParser implements Parser {
TikaException, SAXException {
PSTMessage attachedEmail = attachment.getEmbeddedPSTMessage();
- attachment.getAttachMethod();
//check for whether this is a binary attachment or an embedded pst msg
if (attachedEmail != null) {
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
tis.setOpenContainer(attachedEmail);
Metadata attachMetadata = new Metadata();
attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
PSTMailItemParser.PST_MAIL_ITEM_STRING);
- attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
attachedEmail.getInternetMessageId());
+ attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
attachedEmail.getSubject() + ".msg");
attachMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
embeddedExtractor.parseEmbedded(tis, xhtml, attachMetadata,
true);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 5b8a7192c..ffd4c0e5d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -224,6 +224,13 @@ public class OutlookParserTest extends TikaTest {
assertEquals(2, content.split("<\\/body>").length);
}
+ @Test
+ public void testEmbeddedPath() throws Exception {
+ List<Metadata> metadataList =
getRecursiveMetadata("testMSG_att_msg.msg");
+ assertEquals("/Test Attachment.msg",
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/smbprn.00009008.KdcPjl.pdf",
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ }
+
@Test
public void testOutlookHTMLfromRTF() throws Exception {
Metadata metadata = new Metadata();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index c65a52758..8807b4782 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -65,6 +65,8 @@ public class OutlookPSTParserTest extends TikaTest {
@Test
public void testExtendedMetadata() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testPST.pst");
+ assertEquals(10, metadataList.size());
+
Metadata m1 = metadataList.get(1);
assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME));
assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR));
@@ -98,7 +100,7 @@ public class OutlookPSTParserTest extends TikaTest {
assertEquals("[email protected]",
m6.get(Message.MESSAGE_FROM_EMAIL));
Metadata m7 = metadataList.get(7);
-
assertEquals("/<[email protected]>/<[email protected]>/attachment.docx",
+ assertEquals("/ First email.msg/First email.msg/attachment.docx",
m7.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertEquals("/7/8/9", m7.get(TikaCoreProperties.EMBEDDED_ID_PATH));
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 783554f2c..13f411d06 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -240,7 +240,7 @@ public class POIContainerExtractionTest extends
AbstractPOIContainerExtractionTe
assertEquals(2, handler.filenames.size());
assertEquals(2, handler.mediaTypes.size());
- assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
+ assertEquals("Test Attachment.msg", handler.filenames.get(0));
assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));