This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit e4c2fe13af94a8275ed66fd76a057da3be43be32 Author: Tim Allison <[email protected]> AuthorDate: Tue Feb 3 20:18:00 2026 -0500 TIKA-4630 -- use embedded stored filename as the "resourcename" in gz (#2582) (cherry picked from commit 9f7e4dc04a0fd152377d1de88b50cfb8f035f617) --- .../apache/tika/parser/pkg/CompressorParser.java | 14 +++++--- .../org/apache/tika/parser/pkg/GzipParserTest.java | 38 +++++++++++---------- .../src/test/resources/test-documents/bob.gz | Bin 0 -> 41 bytes 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index aeebd8881e..955880d8a6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -212,10 +212,13 @@ public class CompressorParser implements Parser { xhtml.startDocument(); try { Metadata entrydata = new Metadata(); + boolean foundName = false; if (cis instanceof GzipCompressorInputStream) { - extractGzipMetadata((GzipCompressorInputStream) cis, entrydata); + foundName = extractGzipMetadata((GzipCompressorInputStream) cis, entrydata); + } + if (! 
foundName) { + setName(metadata, entrydata); } - setName(metadata, entrydata); // Use the delegate parser to parse the compressed document EmbeddedDocumentExtractor extractor = @@ -230,16 +233,19 @@ public class CompressorParser implements Parser { xhtml.endDocument(); } - private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) { + private boolean extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) { GzipParameters gzipParameters = gzcis.getMetaData(); if (gzipParameters == null) { - return; + return false; } String name = gzipParameters.getFileName(); if (!StringUtils.isBlank(name)) { metadata.set(TikaCoreProperties.INTERNAL_PATH, name); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + return true; } //TODO: modification, OS, comment + return false; } private void setName(Metadata parentMetadata, Metadata metadata) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java index 99f12427b2..436fda8411 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java @@ -16,21 +16,23 @@ */ package org.apache.tika.parser.pkg; -import static java.nio.charset.StandardCharsets.US_ASCII; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.List; import java.io.InputStream; import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; +import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; -import 
org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.metadata.TikaCoreProperties; /** * Test case for parsing gzip files. */ -public class GzipParserTest extends AbstractPkgTest { +public class GzipParserTest extends TikaTest { /** * Tests that the ParseContext parser is correctly @@ -38,23 +40,23 @@ public class GzipParserTest extends AbstractPkgTest { */ @Test public void testEmbedded() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); + List<Metadata> metadataList = getRecursiveMetadata("test-documents.tgz"); - try (InputStream stream = getResourceAsStream("/test-documents/test-documents.tgz")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext); - } + // Container plus embedded tar contents + assertTrue(metadataList.size() > 1); - // Should find a single entry, for the (compressed) tar file - assertEquals(1, tracker.filenames.size()); - assertEquals(1, tracker.mediatypes.size()); - assertEquals(1, tracker.modifiedAts.size()); + // Embedded documents should have path through the tar file + String embeddedPath = metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); + assertTrue(embeddedPath.contains("test-documents.tar")); + } - assertEquals(null, tracker.filenames.get(0)); - assertEquals(null, tracker.mediatypes.get(0)); - assertEquals(null, tracker.modifiedAts.get(0)); + @Test + public void testGzipInternalFileName() throws Exception { + List<Metadata> metadataList = getRecursiveMetadata("bob.gz"); + assertEquals(2, metadataList.size()); - // Tar file starts with the directory name - assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII)); + Metadata m1 = metadataList.get(1); + assertEquals("alice.txt", m1.get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("alice.txt", m1.get(TikaCoreProperties.INTERNAL_PATH)); } } diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz new file mode 100644 index 0000000000..c8abfd7b67 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz differ
