This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4268
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 53d3f48a8c7c9123ef33193cef24b2af7f465539
Author: tallison <[email protected]>
AuthorDate: Fri Jun 7 09:46:48 2024 -0400

    TIKA-4268 -- improve embedded resource paths for email and generally
---
 .../apache/tika/metadata/TikaCoreProperties.java   | 32 +++++++++++++++++--
 .../apache/tika/parser/RecursiveParserWrapper.java |  8 ++---
 .../tika/sax/RecursiveParserWrapperHandler.java    | 37 +++++++++++++++++++++-
 .../tika/parser/mail/MailContentHandler.java       |  8 ++---
 .../apache/tika/parser/mail/RFC822ParserTest.java  |  4 +++
 .../parser/microsoft/AbstractPOIFSExtractor.java   | 36 +++++++++++++++++++++
 .../tika/parser/microsoft/OutlookExtractor.java    |  2 +-
 .../parser/microsoft/pst/OutlookPSTParser.java     |  2 +-
 .../parser/microsoft/pst/PSTMailItemParser.java    |  5 ++-
 .../tika/parser/microsoft/OutlookParserTest.java   |  7 ++++
 .../parser/microsoft/pst/OutlookPSTParserTest.java |  4 ++-
 11 files changed, 127 insertions(+), 18 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index effa4a667..3d7d34d4e 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -53,14 +53,40 @@ public interface TikaCoreProperties {
 
     /**
      * This tracks the embedded file paths based on the name of embedded files
-     * where available.  There is a small risk that there may be path collisions
-     * and that these paths may not be unique within a file.
-     *
+     * where available.
+     * <p/>
+     * This field should be treated with great care and should NOT
+     * be used for creating a directory structure to write out attachments
+     * because: there may be path collisions or illegal characters or other mayhem.
+     * <p/>
      * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
      */
     Property EMBEDDED_RESOURCE_PATH =
             Property.internalText(TIKA_META_PREFIX + "embedded_resource_path");
 
+
+    /**
+     * This is calculated in {@link org.apache.tika.sax.RecursiveParserWrapperHandler}.
+     * It differs from {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH} in that
+     * it is calculated at the end of the full parse of a file. {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}
+     * is calculated during the parse, and, for some parsers, an embedded file's name isn't known until
+     * after its child files have been parsed.
+     * <p/>
+     * Note that the unknown file count may differ between {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}
+     * because there should be fewer unknown files when this is calculated. More simply,
+     * there is no connection between "embedded-1" in this field and "embedded-1" in
+     * {@link TikaCoreProperties#EMBEDDED_RESOURCE_PATH}.
+     * <p/>
+     * This field should be treated with great care and should NOT
+     * be used for creating a directory structure to write out attachments
+     * because: there may be path collisions or illegal characters or other mayhem.
+     * <p/>
+     *
+     * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
+     */
+    Property FINAL_EMBEDDED_RESOURCE_PATH =
+            Property.internalText(TIKA_META_PREFIX + 
"final_embedded_resource_path");
+
     /**
      * This tracks the embedded file paths based on the embedded file's
      * {@link TikaCoreProperties#EMBEDDED_ID}.
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 629b289ae..4e4f72dfa 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -187,7 +187,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
         }
     }
 
-    private String getResourceName(Metadata metadata, ParserState state) {
+    public static String getResourceName(Metadata metadata, AtomicInteger 
counter) {
         String objectName = "";
         if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
             objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
@@ -196,7 +196,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
         } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
             objectName = "version-number-" + 
metadata.get(TikaCoreProperties.VERSION_NUMBER);
         } else {
-            objectName = "embedded-" + (++state.unknownCount);
+            objectName = "embedded-" + counter.incrementAndGet();
         }
         //make sure that there isn't any path info in the objectName
         //some parsers can return paths, not just file names
@@ -234,7 +234,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
                 return;
             }
             // Work out what this thing is
-            String objectName = getResourceName(metadata, parserState);
+            String objectName = getResourceName(metadata, 
parserState.unknownCount);
             String objectLocation = this.location + objectName;
 
             metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, 
objectLocation);
@@ -319,7 +319,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
      */
     private static class ParserState {
         private final AbstractRecursiveParserWrapperHandler 
recursiveParserWrapperHandler;
-        private int unknownCount = 0;
+        private AtomicInteger unknownCount = new AtomicInteger(0);
         private int embeddedCount = 0;//this is effectively 1-indexed
         private ParserState(AbstractRecursiveParserWrapperHandler handler) {
             this.recursiveParserWrapperHandler = handler;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 7ad6f8b25..8ac7277aa 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -16,8 +16,11 @@
  */
 package org.apache.tika.sax;
 
+import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -28,6 +31,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.filter.MetadataFilter;
 import org.apache.tika.metadata.filter.NoOpFilter;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.utils.ParserUtils;
 
 /**
@@ -123,10 +127,41 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
         } catch (TikaException e) {
             throw new SAXException(e);
         }
-
         if (metadata.size() > 0) {
             metadataList.add(0, ParserUtils.cloneMetadata(metadata));
         }
+        writeFinalEmbeddedPaths();
+    }
+
+    private void writeFinalEmbeddedPaths() {
+        //for some file types, the file's "name" is not known before
+        //their attachments are parsed. This goes through the id paths
+        //and regenerates the path for the "final embedded resource path"
+        Map<String, String> idToName = new HashMap<>();
+        AtomicInteger unknownCount = new AtomicInteger(0);
+        for (Metadata metadata : metadataList) {
+            String id = metadata.get(TikaCoreProperties.EMBEDDED_ID);
+            if (id == null) {
+                continue;
+            }
+            String name = RecursiveParserWrapper.getResourceName(metadata, 
unknownCount);
+            idToName.put(id, name);
+        }
+        for (Metadata metadata : metadataList) {
+            String idPath = metadata.get(TikaCoreProperties.EMBEDDED_ID_PATH);
+            if (idPath == null) {
+                continue;
+            }
+            if (idPath.startsWith("/")) {
+                idPath = idPath.substring(1);
+            }
+            String[] ids = idPath.split("/");
+            StringBuilder sb = new StringBuilder();
+            for (String id : ids) {
+                sb.append("/").append(idToName.get(id));
+            }
+            metadata.set(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH, 
sb.toString());
+        }
     }
 
     /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index d75bf2991..95edcb9ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -353,10 +353,10 @@ class MailContentHandler implements ContentHandler {
                     metadata.add(TikaCoreProperties.CREATOR, from);
                 }
             } else if (fieldname.equalsIgnoreCase("Subject")) {
-                metadata.set(TikaCoreProperties.TITLE,
-                        ((UnstructuredField) parsedField).getValue());
-                metadata.set(TikaCoreProperties.SUBJECT,
-                        ((UnstructuredField) parsedField).getValue());
+                String txt = ((UnstructuredField) parsedField).getValue();
+                metadata.set(TikaCoreProperties.TITLE, txt);
+                metadata.set(TikaCoreProperties.SUBJECT, txt);
+                metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, txt + 
".eml");
             } else if (fieldname.equalsIgnoreCase("To")) {
                 processAddressList(parsedField, "To:", Metadata.MESSAGE_TO);
             } else if (fieldname.equalsIgnoreCase("CC")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index f558a7ffe..1abf88e52 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -405,6 +405,10 @@ public class RFC822ParserTest extends TikaTest {
         assertEquals(null, 
metadataList.get(1).get(Metadata.CONTENT_DISPOSITION));
         assertEquals("attachment; filename=\"testPNG.png\"",
                 metadataList.get(2).get(Metadata.CONTENT_DISPOSITION));
+        assertEquals("/Test Attachment Email.eml/embedded-1",
+                
metadataList.get(1).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH));
+        assertEquals("/Test Attachment Email.eml/testPNG.png",
+                
metadataList.get(2).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index bbebde63d..b42c0f588 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -18,6 +18,8 @@ package org.apache.tika.parser.microsoft;
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
 
 import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.poi.hpsf.ClassID;
@@ -37,6 +39,7 @@ import org.apache.tika.detect.Detector;
 import org.apache.tika.detect.zip.DefaultZipContainerDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -201,6 +204,21 @@ abstract class AbstractPOIFSExtractor {
             handleOLENative(dir, type, rName, metadata, xhtml, outputHtml);
         } else if (type == POIFSDocumentType.COMP_OBJ) {
             handleCompObj(dir, type, rName, metadata, xhtml, outputHtml);
+        } else if (type == POIFSDocumentType.OUTLOOK) {
+            //for Outlook try to use the title first so that we don't wind up with __substg1.0_37...
+            //if that doesn't exist, backoff to rName
+            //add the suffix
+            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+            String name = tryToGetMsgTitle(dir, rName);
+            if (! StringUtils.isBlank(name)) {
+                if (StringUtils.isBlank(type.getExtension())) {
+                    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+                } else {
+                    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+                            name + '.' + type.getExtension());
+                }
+            }
+            parseEmbedded(dir, xhtml, metadata, outputHtml);
         } else {
             metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
             if (! StringUtils.isBlank(rName)) {
@@ -380,4 +398,22 @@ abstract class AbstractPOIFSExtractor {
             embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, 
outputHtml);
         }
     }
+
+
+    public static String tryToGetMsgTitle(DirectoryEntry node, String 
defaultVal) {
+
+        for (String entryName : new String[] {"__substg1.0_0037001F", 
"__substg1.0_0E1D001F", "__substg1.0_0070001F"} ) {
+            try {
+                Entry entry = node.getEntry(entryName);
+                if (entry instanceof DocumentEntry) {
+                    try (InputStream is = new BoundedInputStream(1000, new 
DocumentInputStream((DocumentEntry) entry))) {
+                        return org.apache.commons.io.IOUtils.toString(is, 
StandardCharsets.UTF_16LE);
+                    }
+                }
+            } catch (IOException e) {
+                //do nothing
+            }
+        }
+        return defaultVal;
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index 58b74a54d..2453b4dc4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -283,7 +283,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
                             null, null, xhtml, true);
                 }
                 if (attachment.getAttachmentDirectory() != null) {
-                    
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(),
+                    
handleEmbeddedOfficeDoc(attachment.getAttachmentDirectory().getDirectory(), 
filename,
                             xhtml, true);
                 }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index ded254489..8cfb938c9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -116,7 +116,7 @@ public class OutlookPSTParser implements Parser {
                 metadata.set(PST.PST_FOLDER_PATH, folderPath);
                 try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
                     tis.setOpenContainer(pstMail);
-                    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
pstMail.getInternetMessageId());
+                    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
pstMail.getSubject() + ".msg");
                     embeddedExtractor.parseEmbedded(tis, handler, metadata, 
true);
                 }
                 pstMail = (PSTMessage) pstFolder.getNextChild();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index f0fbd9f68..f8f412764 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -130,7 +130,7 @@ public class PSTMailItemParser implements Parser {
     }
 
     private void extractMetadata(PSTMessage pstMail, Metadata metadata) {
-        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
pstMail.getInternetMessageId());
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
pstMail.getSubject() + ".msg");
         metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, 
pstMail.getInternetMessageId());
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, 
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
         metadata.set(TikaCoreProperties.IDENTIFIER, 
pstMail.getInternetMessageId());
@@ -220,14 +220,13 @@ public class PSTMailItemParser implements Parser {
             TikaException, SAXException {
 
         PSTMessage attachedEmail = attachment.getEmbeddedPSTMessage();
-        attachment.getAttachMethod();
         //check for whether this is a binary attachment or an embedded pst msg
         if (attachedEmail != null) {
             try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
                 tis.setOpenContainer(attachedEmail);
                 Metadata attachMetadata = new Metadata();
                 
attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, 
PSTMailItemParser.PST_MAIL_ITEM_STRING);
-                attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
attachedEmail.getInternetMessageId());
+                attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
attachedEmail.getSubject() + ".msg");
                 attachMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, 
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
                 embeddedExtractor.parseEmbedded(tis, xhtml, attachMetadata, 
true);
             }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index 5b8a7192c..ffd4c0e5d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -224,6 +224,13 @@ public class OutlookParserTest extends TikaTest {
         assertEquals(2, content.split("<\\/body>").length);
     }
 
+    @Test
+    public void testEmbeddedPath() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testMSG_att_msg.msg");
+        assertEquals("/Test Attachment.msg", 
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+        assertEquals("/smbprn.00009008.KdcPjl.pdf", 
metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+    }
+
     @Test
     public void testOutlookHTMLfromRTF() throws Exception {
         Metadata metadata = new Metadata();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index c65a52758..8807b4782 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -65,6 +65,8 @@ public class OutlookPSTParserTest extends TikaTest {
     @Test
     public void testExtendedMetadata() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testPST.pst");
+        assertEquals(10, metadataList.size());
+
         Metadata m1 = metadataList.get(1);
         assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME));
         assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR));
@@ -98,7 +100,7 @@ public class OutlookPSTParserTest extends TikaTest {
         assertEquals("[email protected]", 
m6.get(Message.MESSAGE_FROM_EMAIL));
 
         Metadata m7 = metadataList.get(7);
-        
assertEquals("/<[email protected]>/<[email protected]>/attachment.docx",
+        assertEquals("/ First email.msg/First email.msg/attachment.docx",
                 m7.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
         assertEquals("/7/8/9", m7.get(TikaCoreProperties.EMBEDDED_ID_PATH));
     }

Reply via email to