This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4617 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8c5f6b2fe8e3f85b4870d179eaee9434dd91965e Author: tallison <[email protected]> AuthorDate: Thu Jan 8 20:01:03 2026 -0500 TIKA-4617 -- stream translator should change names only in rare circumstances. --- .../tika/extractor/microsoft/MSEmbeddedStreamTranslator.java | 9 +++++++-- .../org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java | 5 +++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java index 24f7ec2d30..7e0794d080 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java @@ -22,6 +22,7 @@ import java.io.InputStream; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; +import org.apache.commons.lang3.StringUtils; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DocumentEntry; import org.apache.poi.poifs.filesystem.DocumentInputStream; @@ -72,14 +73,18 @@ public class MSEmbeddedStreamTranslator implements EmbeddedStreamTranslator { try { Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(poifs); if (ole.getDataSize() > 0) { - name = ole.getLabel(); + if (StringUtils.isAllBlank(name)) { + name = ole.getLabel(); + } data = ole.getDataBuffer(); } } catch (Ole10NativeException ex) { LOG.warn("Skipping invalid part", ex); } } else { - name += '.' + type.getExtension(); + if (! StringUtils.isAllBlank(type.getExtension()) && ! StringUtils.isAllBlank(name) && !name.contains(".")) { + name += '.' + type.getExtension(); + } } metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); return UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index ffba11b0c2..7c1ed12943 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -152,6 +152,7 @@ public class OOXMLParserTest extends TikaTest { Parser parser = TikaTest.AUTO_DETECT_PARSER; Parser digestingParser = new DigestingParser(parser, new CommonsDigester(100000, "sha256"), false); List<Metadata> metadataList = getRecursiveMetadata("testMSChart-govdocs-428996.pptx", digestingParser); + assertEquals(4, metadataList.size()); for (Metadata m : metadataList) { assertNotNull(m.get("X-TIKA:digest:SHA256")); @@ -159,5 +160,9 @@ public class OOXMLParserTest extends TikaTest { //before TIKA-4607 assertNull(m.get(TikaCoreProperties.EMBEDDED_EXCEPTION)); } + + assertEquals("/oleObject1.bin", metadataList.get(2).get(TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH)); + assertEquals("application/vnd.ms-graph", metadataList.get(2).get(Metadata.CONTENT_TYPE)); + assertEquals("4cfadec808582492aeb5f1ae0f391dadbd3402affeef3e5488b4f6a07537aea5", metadataList.get(2).get("X-TIKA:digest:SHA256")); } }
