This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 17826b53e TIKA-4518 (#2366)
17826b53e is described below

commit 17826b53ef1c114a426c55b4422baee4acf5043b
Author: Tim Allison <[email protected]>
AuthorDate: Wed Oct 15 13:34:36 2025 -0400

    TIKA-4518 (#2366)
    
    * TIKA-4518 -- improve pst handling with -Z option
---
 CHANGES.txt                                        |  14 ++----
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   4 +-
 .../java/org/apache/tika/cli/TikaCLIAsyncTest.java |   2 +-
 .../test/java/org/apache/tika/cli/TikaCLITest.java |  29 ++++++++++-
 tika-app/src/test/resources/test-data/testPST.pst  | Bin 0 -> 2302976 bytes
 .../extractor/DefaultEmbeddedStreamTranslator.java |  21 ++++----
 .../tika/extractor/EmbeddedStreamTranslator.java   |   8 +--
 .../apache/tika/extractor/RUnpackExtractor.java    |  36 +++++++++-----
 .../java/org/apache/tika/io/FilenameUtils.java     |  29 ++++++++++-
 .../java/org/apache/tika/io/FilenameUtilsTest.java |   8 ++-
 .../microsoft/MSEmbeddedStreamTranslator.java      |  39 ++++++---------
 .../microsoft/PSTEmailStreamTranslator.java        |  55 +++++++++++++++++++++
 ....apache.tika.extractor.EmbeddedStreamTranslator |   3 +-
 .../AbstractEmbeddedDocumentBytesHandler.java      |  37 ++------------
 .../server/core/resource/UnpackerResource.java     |  27 ++++------
 15 files changed, 194 insertions(+), 118 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6de20938a..2d91e48c2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,17 +7,11 @@ Release 4.0.0-BETA1 - ???
    * Headers are no longer injected into the body/content of MSG files 
(TIKA-4345). Please open
      a ticket if you need this behavior across email formats.
 
-   * Remove tika-batch (TIKA-4333).
+   * Removed several modules, including: tika-batch (TIKA-4333), snaps 
deployment (TIKA-4502),
+     dotnet (TIKA-4332), advanced media module (TIKA-4500), tika-dl module 
(TIKA-4499),
+     tika-fuzzing module (TIKA-4506).
 
-   * Remove snaps deployment (TIKA-4502).
-
-   * Removed the dotnet module (TIKA-4332).
-
-   * Removed the advanced media module (TIKA-4500).
-   
-   * Removed the tika-dl module (TIKA-4499).
-
-   * Removed the tika-fuzzing module (TIKA-4506).
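+   * API changes in the EmbeddedStreamTranslator: both methods now take a TikaInputStream, and translate() writes to an OutputStream instead of returning an InputStream (TIKA-4518).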
 
   OTHER CHANGES
 
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 28a9b29c7..a1db5f8bf 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -1112,9 +1112,7 @@ public class TikaCLI {
 
             try (OutputStream os = Files.newOutputStream(outputFile)) {
                 if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
-                    try (InputStream translated = 
embeddedStreamTranslator.translate(tis, metadata)) {
-                        IOUtils.copy(translated, os);
-                    }
+                    embeddedStreamTranslator.translate(tis, metadata, os);
                 } else {
                     IOUtils.copy(tis, os);
                 }
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java 
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index 096e3ce73..faacd49a2 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -121,7 +121,7 @@ public class TikaCLIAsyncTest {
                 json++;
             }
         }
-        assertEquals(20, json);
+        assertEquals(21, json);
     }
 
     private void checkForPrettyPrint(File f) throws IOException {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java 
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 93de4e409..94ccfd96c 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -26,6 +26,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
+import java.io.Reader;
 import java.net.URI;
 import java.nio.file.FileVisitResult;
 import java.nio.file.FileVisitor;
@@ -34,6 +35,7 @@ import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.jetbrains.annotations.NotNull;
@@ -44,7 +46,11 @@ import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.serialization.JsonMetadataList;
 import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Tests the Tika's cli
@@ -285,6 +291,28 @@ public class TikaCLITest {
         testRecursiveUnpack("testPDFPackage.pdf", expectedChildren, 2);
     }
 
+    @Test
+    public void testPSTRUnpack() throws Exception {
+        String[] expectedChildren = new String[]{"testPST.pst.json",
+                "testPST.pst-embed/00000007-First email.msg",
+                "testPST.pst-embed/00000001-Feature Generators.msg",
+                "testPST.pst-embed/00000008-First email.msg",
+                "testPST.pst-embed/00000004-[jira] [Resolved] (TIKA-1249) 
Vcard files detection.msg",
+                "testPST.pst-embed/00000003-Feature Generators.msg",
+                "testPST.pst-embed/00000002-putstatic\".msg",
+                "testPST.pst-embed/00000005-[jira] [Commented] (TIKA-1250) 
Process loops infintely processing a CHM file.msg",
+                "testPST.pst-embed/00000009-attachment.docx",
+                "testPST.pst-embed/00000006-[WEBINAR] - \"Introducing 
Couchbase Server 2.5\".msg"};
+        testRecursiveUnpack("testPST.pst", expectedChildren, 2);
+        try (Reader reader = 
Files.newBufferedReader(extractDir.resolve("testPST.pst.json"))) {
+            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+            for (Metadata m : metadataList) {
+                String content = m.get(TikaCoreProperties.TIKA_CONTENT);
+                assertFalse(StringUtils.isBlank(content));
+            }
+        }
+    }
+
 
     /**
      * Tests -l option of the cli
@@ -378,7 +406,6 @@ public class TikaCLITest {
                 .list();
         assertNotNull(jsonFile);
         assertEquals(expectedLength, jsonFile.length);
-        //assertEquals(fileNames.size(), expectedChildrenFileNames.length);
 
         for (String expectedChildName : expectedChildrenFileNames) {
             assertTrue(fileNames.contains(expectedChildName));
diff --git a/tika-app/src/test/resources/test-data/testPST.pst 
b/tika-app/src/test/resources/test-data/testPST.pst
new file mode 100644
index 000000000..8ccc69547
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPST.pst 
differ
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
index 537c5ffa1..b9d6985cc 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
@@ -17,18 +17,17 @@
 package org.apache.tika.extractor;
 
 import java.io.IOException;
-import java.io.InputStream;
+import java.io.OutputStream;
 import java.util.List;
 
 import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.utils.ServiceLoaderUtils;
 
 /**
  * Loads EmbeddedStreamTranslators via service loading.  Tries to run each
- * in turn and returns the first non-null value.  If no translation has 
occurred,
- * this returns the original InputStream. If a translation has occurred, the
- * translator will consume the InputStream but not close it.
+ * in turn. If a translator accepts the stream, it will do the translation but 
not close the stream.
  */
 public class DefaultEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator {
 
@@ -58,7 +57,7 @@ public class DefaultEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator
      * @throws IOException
      */
     @Override
-    public boolean shouldTranslate(InputStream inputStream, Metadata metadata) 
throws IOException {
+    public boolean shouldTranslate(TikaInputStream inputStream, Metadata 
metadata) throws IOException {
         for (EmbeddedStreamTranslator translator : translators) {
             if (translator.shouldTranslate(inputStream, metadata)) {
                 return true;
@@ -68,20 +67,20 @@ public class DefaultEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator
     }
 
     /**
-     * This will consume the InputStream and return a new stream of translated 
bytes.
+     * This will consume the InputStream and write the translated bytes to the 
output stream
      * @param inputStream
      * @param metadata
+     * @param outputStream to write to
      * @return
      * @throws IOException
      */
     @Override
-    public InputStream translate(InputStream inputStream, Metadata metadata) 
throws IOException {
+    public void translate(TikaInputStream inputStream, Metadata metadata, 
OutputStream outputStream) throws IOException {
         for (EmbeddedStreamTranslator translator : translators) {
-            InputStream translated = translator.translate(inputStream, 
metadata);
-            if (translated != null) {
-                return translated;
+            if (translator.shouldTranslate(inputStream, metadata)) {
+                translator.translate(inputStream, metadata, outputStream);
+                return;
             }
         }
-        return inputStream;
     }
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
index b2ce05db4..4a582506f 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
@@ -17,8 +17,9 @@
 package org.apache.tika.extractor;
 
 import java.io.IOException;
-import java.io.InputStream;
+import java.io.OutputStream;
 
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 
 /**
@@ -30,9 +31,8 @@ import org.apache.tika.metadata.Metadata;
  */
 public interface EmbeddedStreamTranslator {
 
-    boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws 
IOException;
+    boolean shouldTranslate(TikaInputStream inputStream, Metadata metadata) 
throws IOException;
 
-    InputStream translate(InputStream inputStream,
-                          Metadata metadata) throws IOException;
+    void translate(TikaInputStream inputStream, Metadata metadata, 
OutputStream os) throws IOException;
 
 }
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java 
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 42544dc80..70c21ffb4 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -21,6 +21,7 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
@@ -110,18 +111,27 @@ public class RUnpackExtractor extends 
ParsingEmbeddedDocumentExtractor {
         }
     }
 
-    private void parseWithBytes(TikaInputStream stream, ContentHandler 
handler, Metadata metadata)
-            throws TikaException, IOException, SAXException {
-        //TODO -- improve the efficiency of this so that we're not
-        //literally writing out a file per request
+    private void parseWithBytes(TikaInputStream tis, ContentHandler handler, 
Metadata metadata) throws TikaException, IOException, SAXException {
+
         Path tmp = Files.createTempFile("tika-tmp-", ".bin");
-        if (embeddedStreamTranslator.shouldTranslate(stream, metadata)) {
-            Files.copy(embeddedStreamTranslator.translate(stream, metadata), 
tmp, StandardCopyOption.REPLACE_EXISTING);
-        } else {
-            Files.copy(stream, tmp, StandardCopyOption.REPLACE_EXISTING);
-        }
-        try (TikaInputStream tmpTis = TikaInputStream.get(tmp)) {
-            parse(tmpTis, handler, metadata);
+        try {
+            //translate the stream or not
+            if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+                try (OutputStream os = Files.newOutputStream(tmp)) {
+                    embeddedStreamTranslator.translate(tis, metadata, os);
+                }
+            } else {
+                Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING);
+            }
+
+            //now do the parse
+            if (tis.getOpenContainer() != null) {
+                parse(tis, handler, metadata);
+            } else {
+                try (TikaInputStream tisTmp = TikaInputStream.get(tmp)) {
+                    parse(tisTmp, handler, metadata);
+                }
+            }
         } finally {
             try {
                 storeEmbeddedBytes(tmp, metadata);
@@ -139,6 +149,10 @@ public class RUnpackExtractor extends 
ParsingEmbeddedDocumentExtractor {
     }
 
     private void storeEmbeddedBytes(Path p, Metadata metadata) {
+        if (p == null) {
+            return;
+        }
+
         if (! embeddedBytesSelector.select(metadata)) {
             if (LOGGER.isDebugEnabled()) {
                 LOGGER.debug("skipping embedded bytes {} <-> {}",
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java 
b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index de57eda72..cf250e934 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -140,12 +140,16 @@ public class FilenameUtils {
 
     public static String getSanitizedEmbeddedFileName(Metadata metadata,
                                                       String defaultExtension, 
int maxLength) {
-        String path = getEmbeddedPath(metadata);
+        String path = getEmbeddedName(metadata);
         //fName could be a full path or null
         if (StringUtils.isBlank(path)) {
             return null;
         }
         path = path.replaceAll("\u0000", " ");
+        if (path.startsWith("\"") && path.endsWith("\"")) {
+            path = path.substring(1, path.length() - 1);
+        }
+
         int prefixLength = getPrefixLength(path);
         if (prefixLength > 0) {
             path = path.substring(prefixLength);
@@ -173,6 +177,7 @@ public class FilenameUtils {
         namePart = namePart.replaceAll("(\\.\\.)+", "_");
         namePart = namePart.replaceAll("[/\\\\]+", "_");
         namePart = namePart.replaceAll(":+", "_");
+        namePart = namePart.trim();
 
         if (StringUtils.isBlank(namePart)) {
             return null;
@@ -286,6 +291,7 @@ public class FilenameUtils {
         return path;
     }
 
+    //may return null
     private static String getEmbeddedPath(Metadata metadata) {
         //potentially look for other values in embedded path or original file 
name, etc...
         //maybe different fallback order?
@@ -304,6 +310,27 @@ public class FilenameUtils {
         return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
     }
 
+    //this tries for resource name first, and then backs off to path
+    private static String getEmbeddedName(Metadata metadata) {
+        //potentially look for other values in embedded path or original file 
name, etc...
+        //maybe different fallback order?
+        String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+        if (! StringUtils.isBlank(path)) {
+            return path;
+        }
+        path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+        if (! StringUtils.isBlank(path)) {
+            return path;
+        }
+
+        path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+        if (! StringUtils.isBlank(path)) {
+            return path;
+        }
+
+        return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
+    }
+
     /**
      * Calculate the extension based on the {@link Metadata#CONTENT_TYPE} 
value.
      * On parse exception or null value, return the default value.
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index c3abd4134..c670bac83 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -151,6 +151,9 @@ public class FilenameUtilsTest {
         assertEquals("brown fox.xlsx", sanitizeFilename("a:/the quick:brown 
fox.xlsx"));
         assertEquals("_the quick brown fox.xlsx", 
sanitizeFilename("C:\\a/b/c/..the quick brown fox.xlsx"));
         assertEquals("_the quick brown fox.xlsx", 
sanitizeFilename("~/a/b/c/.the quick brown fox.xlsx"));
+        assertEquals("the quick%3Ebrown fox.xlsx", sanitizeFilename("the 
quick>brown fox.xlsx"));
+        assertEquals("the quick\"brown fox.xlsx", sanitizeFilename("the 
quick\"brown fox.xlsx"));
+        assertEquals("the quick brown fox.xlsx", sanitizeFilename("\"the quick 
brown fox.xlsx\""));
 
         assertEquals("_.docx", sanitizeFilename("..................docx"));
         assertEquals("_.docx", sanitizeFilename("..docx"));
@@ -168,7 +171,7 @@ public class FilenameUtilsTest {
     @Test
     public void testEmbeddedFilePaths() throws Exception {
         String n = "the quick brown fox.docx";
-        /*assertEquals(n, sanitizePath(n));
+        assertEquals(n, sanitizePath(n));
         assertEquals(n, sanitizePath(n.substring(0, n.length() - 5),
                 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
         assertEquals(n, sanitizeFilename("the quick\u0000brown fox.docx"));
@@ -204,7 +207,7 @@ public class FilenameUtilsTest {
         assertNull(sanitizePath(""));
         assertNull(sanitizePath(null));
         assertNull(sanitizePath("/"));
-        assertNull(sanitizePath("~/"));*/
+        assertNull(sanitizePath("~/"));
         assertNull(sanitizePath("C:"));
         assertNull(sanitizePath("C:/"));
         assertNull(sanitizePath("C:\\"));
@@ -235,6 +238,7 @@ public class FilenameUtilsTest {
 
     private Metadata getMetadata(String name) {
         Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, name);
         return metadata;
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
index 24f7ec2d3..433d34a00 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -18,9 +18,10 @@ package org.apache.tika.extractor.microsoft;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.OutputStream;
 
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.CloseShieldOutputStream;
 import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -43,26 +44,22 @@ public class MSEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator {
     private static final Logger LOG = 
LoggerFactory.getLogger(MSEmbeddedStreamTranslator.class);
 
     @Override
-    public boolean shouldTranslate(InputStream inputStream, Metadata metadata) 
throws IOException {
+    public boolean shouldTranslate(TikaInputStream tis, Metadata metadata) 
throws IOException {
         String contentType = 
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
         if 
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) 
{
             return true;
-        } else if (inputStream instanceof TikaInputStream) {
-            TikaInputStream tin = (TikaInputStream) inputStream;
-            if (tin.getOpenContainer() != null &&
-                    tin.getOpenContainer() instanceof DirectoryEntry) {
-                return true;
-            }
+        } else {
+            return tis.getOpenContainer() != null &&
+                    tis.getOpenContainer() instanceof DirectoryEntry;
         }
-        return false;
     }
 
     @Override
-    public InputStream translate(InputStream inputStream, Metadata metadata) 
throws IOException {
+    public void translate(TikaInputStream tis, Metadata metadata, OutputStream 
os) throws IOException {
         String contentType = 
metadata.get(org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE);
         if 
("application/vnd.openxmlformats-officedocument.oleObject".equals(contentType)) 
{
             UnsynchronizedByteArrayOutputStream bos = 
UnsynchronizedByteArrayOutputStream.builder().get();
-            IOUtils.copy(inputStream, bos);
+            IOUtils.copy(tis, bos);
             POIFSFileSystem poifs = new POIFSFileSystem(bos.toInputStream());
             OfficeParser.POIFSDocumentType type = 
OfficeParser.POIFSDocumentType.detectType(poifs);
             String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
@@ -82,21 +79,17 @@ public class MSEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator {
                 name += '.' + type.getExtension();
             }
             metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
-            return 
UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get();
-        } else if (inputStream instanceof TikaInputStream) {
-            TikaInputStream tin = (TikaInputStream) inputStream;
-
-            if (tin.getOpenContainer() != null &&
-                    tin.getOpenContainer() instanceof DirectoryEntry) {
-                POIFSFileSystem fs = new POIFSFileSystem();
-                copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
-                try (UnsynchronizedByteArrayOutputStream bos2 = 
UnsynchronizedByteArrayOutputStream.builder().get()) {
-                    fs.writeFilesystem(bos2);
-                    return bos2.toInputStream();
+            os.write(data);
+            os.flush();
+        } else {
+            if (tis.getOpenContainer() != null &&
+                    tis.getOpenContainer() instanceof DirectoryEntry) {
+                try (POIFSFileSystem fs = new POIFSFileSystem()) {
+                    copy((DirectoryEntry) tis.getOpenContainer(), 
fs.getRoot());
+                    fs.writeFilesystem(CloseShieldOutputStream.wrap(os));
                 }
             }
         }
-        return inputStream;
     }
 
     protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
throws IOException {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
new file mode 100644
index 000000000..055072481
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor.microsoft;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+
+public class PSTEmailStreamTranslator implements EmbeddedStreamTranslator {
+    private static final String MIME_TYPE = 
MediaType.application("x-tika-pst-mail-item").toString();
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(PSTEmailStreamTranslator.class);
+    private static final AtomicLong EMAIL_ITEMS = new AtomicLong(0);
+    private static final long LOG_EVERY_X_ITEMS = 100;
+
+    @Override
+    public boolean shouldTranslate(TikaInputStream tis, Metadata metadata) 
throws IOException {
+        return 
MIME_TYPE.equals(metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE))
+                || MIME_TYPE.equals(metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Override
+    public void translate(TikaInputStream tis, Metadata metadata, OutputStream 
os) throws IOException {
+        if (!shouldTranslate(tis, metadata)) {
+            return;
+        }
+        if (EMAIL_ITEMS.getAndIncrement() % LOG_EVERY_X_ITEMS == 0) {
+            LOG.warn("Translating pst email objects to .eml or .msg is not yet 
supported. "
+                    + "Please open a ticket on our JIRA or a pull request on 
Github.");
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
index e59cba80e..509de7d95 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
@@ -12,4 +12,5 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
\ No newline at end of file
+org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
+org.apache.tika.extractor.microsoft.PSTEmailStreamTranslator
\ No newline at end of file
diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
index 80ff66984..5dd27e419 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
@@ -18,26 +18,18 @@ package org.apache.tika.pipes.core.extractor;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedDocumentBytesHandler;
 import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig;
 import org.apache.tika.utils.StringUtils;
 
 public abstract class AbstractEmbeddedDocumentBytesHandler implements 
EmbeddedDocumentBytesHandler {
 
-    private static final MimeTypes MIME_TYPES = 
TikaConfig.getDefaultConfig().getMimeRepository();
-
     List<Integer> ids = new ArrayList<>();
 
     public String getEmitKey(String containerEmitKey, int embeddedId,
@@ -56,13 +48,10 @@ public abstract class AbstractEmbeddedDocumentBytesHandler 
implements EmbeddedDo
             emitKey.append("-embed");
             emitKey.append("/");
             
emitKey.append(embeddedIdString).append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix());
-            Path p = 
Paths.get(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
-            String fName = p.getFileName().toString();
-            emitKey.append(fName);
-            if (! fName.contains(".")) {
-                appendSuffix(emitKey, metadata, embeddedDocumentBytesConfig);
+            String fName = 
FilenameUtils.getSanitizedEmbeddedFileName(metadata, ".bin", 100);
+            if (! StringUtils.isBlank(fName)) {
+                emitKey.append(fName);
             }
-
             return emitKey.toString();
         } else if (embeddedDocumentBytesConfig.getKeyBaseStrategy() ==
                 
EmbeddedDocumentBytesConfig.KEY_BASE_STRATEGY.CONTAINER_NAME_NUMBERED) {
@@ -101,25 +90,7 @@ public abstract class AbstractEmbeddedDocumentBytesHandler 
implements EmbeddedDo
             emitKey.append(suffix);
         } else if (embeddedDocumentBytesConfig.getSuffixStrategy()
                                               
.equals(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.DETECTED)) {
-            emitKey.append(getExtension(metadata));
+            emitKey.append(FilenameUtils.calculateExtension(metadata, ".bin"));
         }
     }
-
-    private String getExtension(Metadata metadata) {
-        String mime = metadata.get(Metadata.CONTENT_TYPE);
-        try {
-            String ext = MIME_TYPES
-                    .forName(mime)
-                    .getExtension();
-            if (ext == null) {
-                return ".bin";
-            } else {
-                return ext;
-            }
-        } catch (MimeTypeException e) {
-            //swallow
-        }
-        return ".bin";
-
-    }
 }
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index a2e3064d6..df8611d2c 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -43,7 +43,6 @@ import org.apache.commons.csv.CSVFormat;
 import org.apache.commons.csv.CSVPrinter;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.mutable.MutableInt;
@@ -193,12 +192,16 @@ public class UnpackerResource {
                     .builder()
                     .get();
 
-            BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes, 
tis);
-            IOUtils.copy(bis, bos);
-            if (bis.hasHitBound()) {
-                throw new IOException(new TikaMemoryLimitException(
-                        "An attachment is longer than " + "'unpackMaxBytes' 
(default=100MB, actual=" + unpackMaxBytes + "). " + "If you need to increase 
this " +
-                                "limit, add a header to your request, such as: 
unpackMaxBytes: " + "1073741824.  There is a hard limit of 2GB."));
+            if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+                embeddedStreamTranslator.translate(tis, metadata, bos);
+            } else {
+                BoundedInputStream bis = new 
BoundedInputStream(unpackMaxBytes, tis);
+                IOUtils.copy(bis, bos);
+                if (bis.hasHitBound()) {
+                    throw new IOException(new TikaMemoryLimitException(
+                            "An attachment is longer than " + 
"'unpackMaxBytes' (default=100MB, actual=" + unpackMaxBytes + "). " + "If you 
need to increase this " +
+                                    "limit, add a header to your request, such 
as: unpackMaxBytes: " + "1073741824.  There is a hard limit of 2GB."));
+                }
             }
             byte[] data = bos.toByteArray();
 
@@ -224,16 +227,6 @@ public class UnpackerResource {
                     LOG.warn("Unexpected MimeTypeException", e);
                 }
             }
-            try (TikaInputStream is = TikaInputStream.get(data)) {
-                if (embeddedStreamTranslator.shouldTranslate(is, metadata)) {
-                    InputStream translated = 
embeddedStreamTranslator.translate(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(),
 metadata);
-                    UnsynchronizedByteArrayOutputStream bos2 = 
UnsynchronizedByteArrayOutputStream
-                            .builder()
-                            .get();
-                    IOUtils.copy(translated, bos2);
-                    data = bos2.toByteArray();
-                }
-            }
 
             final String finalName = getFinalName(name, zout);
 


Reply via email to