This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 9f7e4dc04a TIKA-4630 -- use embedded stored filename as the "resourcename" in gz (#2582)
9f7e4dc04a is described below

commit 9f7e4dc04a0fd152377d1de88b50cfb8f035f617
Author: Tim Allison <[email protected]>
AuthorDate: Tue Feb 3 20:18:00 2026 -0500

    TIKA-4630 -- use embedded stored filename as the "resourcename" in gz (#2582)
---
 .../apache/tika/parser/pkg/CompressorParser.java   |  14 +++++---
 .../org/apache/tika/parser/pkg/GzipParserTest.java |  39 +++++++++++----------
 .../src/test/resources/test-documents/bob.gz       | Bin 0 -> 41 bytes
 3 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index d8d1d40e6b..ab9cf01a00 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -240,10 +240,13 @@ public class CompressorParser implements Parser {
         xhtml.startDocument();
         try {
             Metadata entrydata = Metadata.newInstance(context);
+            boolean foundName = false;
             if (cis instanceof GzipCompressorInputStream) {
-                extractGzipMetadata((GzipCompressorInputStream) cis, entrydata);
+                foundName = extractGzipMetadata((GzipCompressorInputStream) cis, entrydata);
+            }
+            if (! foundName) {
+                setName(metadata, entrydata);
             }
-            setName(metadata, entrydata);
 
             // Use the delegate parser to parse the compressed document
             EmbeddedDocumentExtractor extractor =
@@ -261,16 +264,19 @@ public class CompressorParser implements Parser {
         xhtml.endDocument();
     }
 
-    private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) {
+    private boolean extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) {
         GzipParameters gzipParameters = gzcis.getMetaData();
         if (gzipParameters == null) {
-            return;
+            return false;
         }
         String name = gzipParameters.getFileName();
         if (!StringUtils.isBlank(name)) {
             metadata.set(TikaCoreProperties.INTERNAL_PATH, name);
+            metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+            return true;
         }
         //TODO: modification, OS, comment
+        return false;
     }
 
     private void setName(Metadata parentMetadata, Metadata metadata) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index af092fb30a..3dd16378ab 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -16,20 +16,21 @@
  */
 package org.apache.tika.parser.pkg;
 
-import static java.nio.charset.StandardCharsets.US_ASCII;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.List;
 
 import org.junit.jupiter.api.Test;
-import org.xml.sax.ContentHandler;
 
-import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.metadata.TikaCoreProperties;
 
 /**
  * Test case for parsing gzip files.
  */
-public class GzipParserTest extends AbstractPkgTest {
+public class GzipParserTest extends TikaTest {
 
     /**
      * Tests that the ParseContext parser is correctly
@@ -37,23 +38,23 @@ public class GzipParserTest extends AbstractPkgTest {
      */
     @Test
     public void testEmbedded() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
+        List<Metadata> metadataList = getRecursiveMetadata("test-documents.tgz");
 
-        try (TikaInputStream tis = getResourceAsStream("/test-documents/test-documents.tgz")) {
-            AUTO_DETECT_PARSER.parse(tis, handler, metadata, trackingContext);
-        }
+        // Container plus embedded tar contents
+        assertTrue(metadataList.size() > 1);
 
-        // Should find a single entry, for the (compressed) tar file
-        assertEquals(1, tracker.filenames.size());
-        assertEquals(1, tracker.mediatypes.size());
-        assertEquals(1, tracker.modifiedAts.size());
+        // Embedded documents should have path through the tar file
+        String embeddedPath = metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+        assertTrue(embeddedPath.contains("test-documents.tar"));
+    }
 
-        assertEquals(null, tracker.filenames.get(0));
-        assertEquals(null, tracker.mediatypes.get(0));
-        assertEquals(null, tracker.modifiedAts.get(0));
+    @Test
+    public void testGzipInternalFileName() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("bob.gz");
+        assertEquals(2, metadataList.size());
 
-        // Tar file starts with the directory name
-        assertEquals("test-documents/", new String(tracker.lastSeenStart, 0, 15, US_ASCII));
+        Metadata m1 = metadataList.get(1);
+        assertEquals("alice.txt", m1.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("alice.txt", m1.get(TikaCoreProperties.INTERNAL_PATH));
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz
new file mode 100644
index 0000000000..c8abfd7b67
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz differ

Reply via email to