This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4518
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ffda563bd8b4a849a85f9bfd44ce77818a1f4ecb
Author: tallison <[email protected]>
AuthorDate: Wed Oct 15 12:19:36 2025 -0400

    TIKA-4518 -- improve recursive file extraction, with focus on logging PST email issues.
---
 CHANGES.txt                                        |  14 ++----
 .../java/org/apache/tika/cli/TikaCLIAsyncTest.java |   2 +-
 .../test/java/org/apache/tika/cli/TikaCLITest.java |  29 ++++++++++-
 tika-app/src/test/resources/test-data/testPST.pst  | Bin 0 -> 2302976 bytes
 .../extractor/DefaultEmbeddedStreamTranslator.java |   7 ++-
 .../tika/extractor/EmbeddedStreamTranslator.java   |   1 -
 .../apache/tika/extractor/RUnpackExtractor.java    |  35 ++++++++-----
 .../java/org/apache/tika/io/FilenameUtils.java     |  29 ++++++++++-
 .../java/org/apache/tika/io/FilenameUtilsTest.java |   8 ++-
 .../microsoft/MSEmbeddedStreamTranslator.java      |   2 -
 .../microsoft/PSTEmailStreamTranslator.java        |  55 +++++++++++++++++++++
 ....apache.tika.extractor.EmbeddedStreamTranslator |   3 +-
 .../AbstractEmbeddedDocumentBytesHandler.java      |  37 ++------------
 13 files changed, 154 insertions(+), 68 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6de20938a..2d91e48c2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,17 +7,11 @@ Release 4.0.0-BETA1 - ???
    * Headers are no longer injected into the body/content of MSG files 
(TIKA-4345). Please open
      a ticket if you need this behavior across email formats.
 
-   * Remove tika-batch (TIKA-4333).
+   * Removed several modules, including: tika-batch (TIKA-4333), snaps 
deployment (TIKA-4502),
+     dotnet (TIKA-4332), advanced media module (TIKA-4500), tika-dl module 
(TIKA-4499),
+     tika-fuzzing module (TIKA-4506).
 
-   * Remove snaps deployment (TIKA-4502).
-
-   * Removed the dotnet module (TIKA-4332).
-
-   * Removed the advanced media module (TIKA-4500).
-   
-   * Removed the tika-dl module (TIKA-4499).
-
-   * Removed the tika-fuzzing module (TIKA-4506).
+   * API changes in the EmbeddedStreamTranslator (TIKA-4518).
 
   OTHER CHANGES
 
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java 
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
index 096e3ce73..faacd49a2 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java
@@ -121,7 +121,7 @@ public class TikaCLIAsyncTest {
                 json++;
             }
         }
-        assertEquals(20, json);
+        assertEquals(21, json);
     }
 
     private void checkForPrettyPrint(File f) throws IOException {
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java 
b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index 93de4e409..94ccfd96c 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -26,6 +26,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
+import java.io.Reader;
 import java.net.URI;
 import java.nio.file.FileVisitResult;
 import java.nio.file.FileVisitor;
@@ -34,6 +35,7 @@ import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.jetbrains.annotations.NotNull;
@@ -44,7 +46,11 @@ import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.serialization.JsonMetadataList;
 import org.apache.tika.utils.ProcessUtils;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Tests the Tika's cli
@@ -285,6 +291,28 @@ public class TikaCLITest {
         testRecursiveUnpack("testPDFPackage.pdf", expectedChildren, 2);
     }
 
+    @Test
+    public void testPSTRUnpack() throws Exception {
+        String[] expectedChildren = new String[]{"testPST.pst.json",
+                "testPST.pst-embed/00000007-First email.msg",
+                "testPST.pst-embed/00000001-Feature Generators.msg",
+                "testPST.pst-embed/00000008-First email.msg",
+                "testPST.pst-embed/00000004-[jira] [Resolved] (TIKA-1249) 
Vcard files detection.msg",
+                "testPST.pst-embed/00000003-Feature Generators.msg",
+                "testPST.pst-embed/00000002-putstatic\".msg",
+                "testPST.pst-embed/00000005-[jira] [Commented] (TIKA-1250) 
Process loops infintely processing a CHM file.msg",
+                "testPST.pst-embed/00000009-attachment.docx",
+                "testPST.pst-embed/00000006-[WEBINAR] - \"Introducing 
Couchbase Server 2.5\".msg"};
+        testRecursiveUnpack("testPST.pst", expectedChildren, 2);
+        try (Reader reader = 
Files.newBufferedReader(extractDir.resolve("testPST.pst.json"))) {
+            List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+            for (Metadata m : metadataList) {
+                String content = m.get(TikaCoreProperties.TIKA_CONTENT);
+                assertFalse(StringUtils.isBlank(content));
+            }
+        }
+    }
+
 
     /**
      * Tests -l option of the cli
@@ -378,7 +406,6 @@ public class TikaCLITest {
                 .list();
         assertNotNull(jsonFile);
         assertEquals(expectedLength, jsonFile.length);
-        //assertEquals(fileNames.size(), expectedChildrenFileNames.length);
 
         for (String expectedChildName : expectedChildrenFileNames) {
             assertTrue(fileNames.contains(expectedChildName));
diff --git a/tika-app/src/test/resources/test-data/testPST.pst 
b/tika-app/src/test/resources/test-data/testPST.pst
new file mode 100644
index 000000000..8ccc69547
Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPST.pst 
differ
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
index bf2321481..b9d6985cc 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
@@ -27,9 +27,7 @@ import org.apache.tika.utils.ServiceLoaderUtils;
 
 /**
  * Loads EmbeddedStreamTranslators via service loading.  Tries to run each
- * in turn and returns the first non-null value.  If no translation has 
occurred,
- * this returns the original InputStream. If a translation has occurred, the
- * translator will consume the InputStream but not close it.
+ * in turn. If a translator accepts the stream, it will do the translation but 
not close the stream.
  */
 public class DefaultEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator {
 
@@ -69,9 +67,10 @@ public class DefaultEmbeddedStreamTranslator implements 
EmbeddedStreamTranslator
     }
 
     /**
-     * This will consume the InputStream and return a new stream of translated 
bytes.
+     * This will consume the InputStream and write the stream to the output 
stream
      * @param inputStream
      * @param metadata
+     * @param outputStream to write to
      * @return
      * @throws IOException
      */
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
index 2391f0be5..4a582506f 100644
--- 
a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
+++ 
b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
@@ -17,7 +17,6 @@
 package org.apache.tika.extractor;
 
 import java.io.IOException;
-import java.io.InputStream;
 import java.io.OutputStream;
 
 import org.apache.tika.io.TikaInputStream;
diff --git 
a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java 
b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index c5d8185b2..70c21ffb4 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -111,20 +111,27 @@ public class RUnpackExtractor extends 
ParsingEmbeddedDocumentExtractor {
         }
     }
 
-    private void parseWithBytes(TikaInputStream tis, ContentHandler handler, 
Metadata metadata)
-            throws TikaException, IOException, SAXException {
-        //TODO -- improve the efficiency of this so that we're not
-        //literally writing out a file per request
+    private void parseWithBytes(TikaInputStream tis, ContentHandler handler, 
Metadata metadata) throws TikaException, IOException, SAXException {
+
         Path tmp = Files.createTempFile("tika-tmp-", ".bin");
-        if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
-            try (OutputStream os = Files.newOutputStream(tmp)) {
-                embeddedStreamTranslator.translate(tis, metadata, os);
+        try {
+            //translate the stream or not
+            if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) {
+                try (OutputStream os = Files.newOutputStream(tmp)) {
+                    embeddedStreamTranslator.translate(tis, metadata, os);
+                }
+            } else {
+                Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING);
+            }
+
+            //now do the parse
+            if (tis.getOpenContainer() != null) {
+                parse(tis, handler, metadata);
+            } else {
+                try (TikaInputStream tisTmp = TikaInputStream.get(tmp)) {
+                    parse(tisTmp, handler, metadata);
+                }
             }
-        } else {
-            Files.copy(tis, tmp, StandardCopyOption.REPLACE_EXISTING);
-        }
-        try (TikaInputStream tmpTis = TikaInputStream.get(tmp)) {
-            parse(tis, handler, metadata);
         } finally {
             try {
                 storeEmbeddedBytes(tmp, metadata);
@@ -142,6 +149,10 @@ public class RUnpackExtractor extends 
ParsingEmbeddedDocumentExtractor {
     }
 
     private void storeEmbeddedBytes(Path p, Metadata metadata) {
+        if (p == null) {
+            return;
+        }
+
         if (! embeddedBytesSelector.select(metadata)) {
             if (LOGGER.isDebugEnabled()) {
                 LOGGER.debug("skipping embedded bytes {} <-> {}",
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java 
b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index de57eda72..cf250e934 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -140,12 +140,16 @@ public class FilenameUtils {
 
     public static String getSanitizedEmbeddedFileName(Metadata metadata,
                                                       String defaultExtension, 
int maxLength) {
-        String path = getEmbeddedPath(metadata);
+        String path = getEmbeddedName(metadata);
         //fName could be a full path or null
         if (StringUtils.isBlank(path)) {
             return null;
         }
         path = path.replaceAll("\u0000", " ");
+        if (path.startsWith("\"") && path.endsWith("\"")) {
+            path = path.substring(1, path.length() - 1);
+        }
+
         int prefixLength = getPrefixLength(path);
         if (prefixLength > 0) {
             path = path.substring(prefixLength);
@@ -173,6 +177,7 @@ public class FilenameUtils {
         namePart = namePart.replaceAll("(\\.\\.)+", "_");
         namePart = namePart.replaceAll("[/\\\\]+", "_");
         namePart = namePart.replaceAll(":+", "_");
+        namePart = namePart.trim();
 
         if (StringUtils.isBlank(namePart)) {
             return null;
@@ -286,6 +291,7 @@ public class FilenameUtils {
         return path;
     }
 
+    //may return null
     private static String getEmbeddedPath(Metadata metadata) {
         //potentially look for other values in embedded path or original file 
name, etc...
         //maybe different fallback order?
@@ -304,6 +310,27 @@ public class FilenameUtils {
         return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
     }
 
+    //this tries for resource name first, and then backs off to path
+    private static String getEmbeddedName(Metadata metadata) {
+        //potentially look for other values in embedded path or original file 
name, etc...
+        //maybe different fallback order?
+        String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+        if (! StringUtils.isBlank(path)) {
+            return path;
+        }
+        path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+        if (! StringUtils.isBlank(path)) {
+            return path;
+        }
+
+        path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+        if (! StringUtils.isBlank(path)) {
+            return path;
+        }
+
+        return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME);
+    }
+
     /**
      * Calculate the extension based on the {@link Metadata#CONTENT_TYPE} 
value.
      * On parse exception or null value, return the default value.
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java 
b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index c3abd4134..c670bac83 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -151,6 +151,9 @@ public class FilenameUtilsTest {
         assertEquals("brown fox.xlsx", sanitizeFilename("a:/the quick:brown 
fox.xlsx"));
         assertEquals("_the quick brown fox.xlsx", 
sanitizeFilename("C:\\a/b/c/..the quick brown fox.xlsx"));
         assertEquals("_the quick brown fox.xlsx", 
sanitizeFilename("~/a/b/c/.the quick brown fox.xlsx"));
+        assertEquals("the quick%3Ebrown fox.xlsx", sanitizeFilename("the 
quick>brown fox.xlsx"));
+        assertEquals("the quick\"brown fox.xlsx", sanitizeFilename("the 
quick\"brown fox.xlsx"));
+        assertEquals("the quick brown fox.xlsx", sanitizeFilename("\"the quick 
brown fox.xlsx\""));
 
         assertEquals("_.docx", sanitizeFilename("..................docx"));
         assertEquals("_.docx", sanitizeFilename("..docx"));
@@ -168,7 +171,7 @@ public class FilenameUtilsTest {
     @Test
     public void testEmbeddedFilePaths() throws Exception {
         String n = "the quick brown fox.docx";
-        /*assertEquals(n, sanitizePath(n));
+        assertEquals(n, sanitizePath(n));
         assertEquals(n, sanitizePath(n.substring(0, n.length() - 5),
                 
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
         assertEquals(n, sanitizeFilename("the quick\u0000brown fox.docx"));
@@ -204,7 +207,7 @@ public class FilenameUtilsTest {
         assertNull(sanitizePath(""));
         assertNull(sanitizePath(null));
         assertNull(sanitizePath("/"));
-        assertNull(sanitizePath("~/"));*/
+        assertNull(sanitizePath("~/"));
         assertNull(sanitizePath("C:"));
         assertNull(sanitizePath("C:/"));
         assertNull(sanitizePath("C:\\"));
@@ -235,6 +238,7 @@ public class FilenameUtilsTest {
 
     private Metadata getMetadata(String name) {
         Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, name);
         return metadata;
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
index 3833b91da..433d34a00 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -21,8 +21,6 @@ import java.io.InputStream;
 import java.io.OutputStream;
 
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.apache.commons.io.output.CloseShieldOutputStream;
 import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
new file mode 100644
index 000000000..055072481
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/PSTEmailStreamTranslator.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor.microsoft;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.extractor.EmbeddedStreamTranslator;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+
+public class PSTEmailStreamTranslator implements EmbeddedStreamTranslator {
+    private static final String MIME_TYPE = 
MediaType.application("x-tika-pst-mail-item").toString();
+
+    private static final Logger LOG = 
LoggerFactory.getLogger(PSTEmailStreamTranslator.class);
+    private static final AtomicLong EMAIL_ITEMS = new AtomicLong(0);
+    private static final long LOG_EVERY_X_ITEMS = 100;
+
+    @Override
+    public boolean shouldTranslate(TikaInputStream tis, Metadata metadata) 
throws IOException {
+        return 
MIME_TYPE.equals(metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE))
+                || MIME_TYPE.equals(metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+    @Override
+    public void translate(TikaInputStream tis, Metadata metadata, OutputStream 
os) throws IOException {
+        if (!shouldTranslate(tis, metadata)) {
+            return;
+        }
+        if (EMAIL_ITEMS.getAndIncrement() % LOG_EVERY_X_ITEMS == 0) {
+            LOG.warn("Translating pst email objects to .eml or .msg is not yet 
supported. "
+                    + "Please open a ticket on our JIRA or a pull request on 
Github.");
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
index e59cba80e..509de7d95 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/resources/META-INF/services/org.apache.tika.extractor.EmbeddedStreamTranslator
@@ -12,4 +12,5 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
\ No newline at end of file
+org.apache.tika.extractor.microsoft.MSEmbeddedStreamTranslator
+org.apache.tika.extractor.microsoft.PSTEmailStreamTranslator
\ No newline at end of file
diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
index 80ff66984..5dd27e419 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/AbstractEmbeddedDocumentBytesHandler.java
@@ -18,26 +18,18 @@ package org.apache.tika.pipes.core.extractor;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.Path;
-import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedDocumentBytesHandler;
 import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig;
 import org.apache.tika.utils.StringUtils;
 
 public abstract class AbstractEmbeddedDocumentBytesHandler implements 
EmbeddedDocumentBytesHandler {
 
-    private static final MimeTypes MIME_TYPES = 
TikaConfig.getDefaultConfig().getMimeRepository();
-
     List<Integer> ids = new ArrayList<>();
 
     public String getEmitKey(String containerEmitKey, int embeddedId,
@@ -56,13 +48,10 @@ public abstract class AbstractEmbeddedDocumentBytesHandler 
implements EmbeddedDo
             emitKey.append("-embed");
             emitKey.append("/");
             
emitKey.append(embeddedIdString).append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix());
-            Path p = 
Paths.get(metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
-            String fName = p.getFileName().toString();
-            emitKey.append(fName);
-            if (! fName.contains(".")) {
-                appendSuffix(emitKey, metadata, embeddedDocumentBytesConfig);
+            String fName = 
FilenameUtils.getSanitizedEmbeddedFileName(metadata, ".bin", 100);
+            if (! StringUtils.isBlank(fName)) {
+                emitKey.append(fName);
             }
-
             return emitKey.toString();
         } else if (embeddedDocumentBytesConfig.getKeyBaseStrategy() ==
                 
EmbeddedDocumentBytesConfig.KEY_BASE_STRATEGY.CONTAINER_NAME_NUMBERED) {
@@ -101,25 +90,7 @@ public abstract class AbstractEmbeddedDocumentBytesHandler 
implements EmbeddedDo
             emitKey.append(suffix);
         } else if (embeddedDocumentBytesConfig.getSuffixStrategy()
                                               
.equals(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.DETECTED)) {
-            emitKey.append(getExtension(metadata));
+            emitKey.append(FilenameUtils.calculateExtension(metadata, ".bin"));
         }
     }
-
-    private String getExtension(Metadata metadata) {
-        String mime = metadata.get(Metadata.CONTENT_TYPE);
-        try {
-            String ext = MIME_TYPES
-                    .forName(mime)
-                    .getExtension();
-            if (ext == null) {
-                return ".bin";
-            } else {
-                return ext;
-            }
-        } catch (MimeTypeException e) {
-            //swallow
-        }
-        return ".bin";
-
-    }
 }

Reply via email to