This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9f7e4dc04a TIKA-4630 -- use embedded stored filename as the
"resourcename" in gz (#2582)
9f7e4dc04a is described below
commit 9f7e4dc04a0fd152377d1de88b50cfb8f035f617
Author: Tim Allison <[email protected]>
AuthorDate: Tue Feb 3 20:18:00 2026 -0500
TIKA-4630 -- use embedded stored filename as the "resourcename" in gz
(#2582)
---
.../apache/tika/parser/pkg/CompressorParser.java | 14 +++++---
.../org/apache/tika/parser/pkg/GzipParserTest.java | 39 +++++++++++----------
.../src/test/resources/test-documents/bob.gz | Bin 0 -> 41 bytes
3 files changed, 30 insertions(+), 23 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index d8d1d40e6b..ab9cf01a00 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -240,10 +240,13 @@ public class CompressorParser implements Parser {
xhtml.startDocument();
try {
Metadata entrydata = Metadata.newInstance(context);
+ boolean foundName = false;
if (cis instanceof GzipCompressorInputStream) {
- extractGzipMetadata((GzipCompressorInputStream) cis,
entrydata);
+ foundName = extractGzipMetadata((GzipCompressorInputStream)
cis, entrydata);
+ }
+ if (! foundName) {
+ setName(metadata, entrydata);
}
- setName(metadata, entrydata);
// Use the delegate parser to parse the compressed document
EmbeddedDocumentExtractor extractor =
@@ -261,16 +264,19 @@ public class CompressorParser implements Parser {
xhtml.endDocument();
}
- private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata
metadata) {
+ private boolean extractGzipMetadata(GzipCompressorInputStream gzcis,
Metadata metadata) {
GzipParameters gzipParameters = gzcis.getMetaData();
if (gzipParameters == null) {
- return;
+ return false;
}
String name = gzipParameters.getFileName();
if (!StringUtils.isBlank(name)) {
metadata.set(TikaCoreProperties.INTERNAL_PATH, name);
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ return true;
}
//TODO: modification, OS, comment
+ return false;
}
private void setName(Metadata parentMetadata, Metadata metadata) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index af092fb30a..3dd16378ab 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -16,20 +16,21 @@
*/
package org.apache.tika.parser.pkg;
-import static java.nio.charset.StandardCharsets.US_ASCII;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.List;
import org.junit.jupiter.api.Test;
-import org.xml.sax.ContentHandler;
-import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.metadata.TikaCoreProperties;
/**
* Test case for parsing gzip files.
*/
-public class GzipParserTest extends AbstractPkgTest {
+public class GzipParserTest extends TikaTest {
/**
* Tests that the ParseContext parser is correctly
@@ -37,23 +38,23 @@ public class GzipParserTest extends AbstractPkgTest {
*/
@Test
public void testEmbedded() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
+ List<Metadata> metadataList =
getRecursiveMetadata("test-documents.tgz");
- try (TikaInputStream tis =
getResourceAsStream("/test-documents/test-documents.tgz")) {
- AUTO_DETECT_PARSER.parse(tis, handler, metadata, trackingContext);
- }
+ // Container plus embedded tar contents
+ assertTrue(metadataList.size() > 1);
- // Should find a single entry, for the (compressed) tar file
- assertEquals(1, tracker.filenames.size());
- assertEquals(1, tracker.mediatypes.size());
- assertEquals(1, tracker.modifiedAts.size());
+ // Embedded documents should have path through the tar file
+ String embeddedPath =
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+ assertTrue(embeddedPath.contains("test-documents.tar"));
+ }
- assertEquals(null, tracker.filenames.get(0));
- assertEquals(null, tracker.mediatypes.get(0));
- assertEquals(null, tracker.modifiedAts.get(0));
+ @Test
+ public void testGzipInternalFileName() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("bob.gz");
+ assertEquals(2, metadataList.size());
- // Tar file starts with the directory name
- assertEquals("test-documents/", new String(tracker.lastSeenStart, 0,
15, US_ASCII));
+ Metadata m1 = metadataList.get(1);
+ assertEquals("alice.txt",
m1.get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("alice.txt", m1.get(TikaCoreProperties.INTERNAL_PATH));
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz
new file mode 100644
index 0000000000..c8abfd7b67
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/bob.gz
differ