This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new ce700b6c8e TIKA-4221 - tmp workaround for pack200 (#2863)
ce700b6c8e is described below

commit ce700b6c8ee786c51bc722158a628f462ad9488e
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jun 4 05:47:32 2026 -0400

    TIKA-4221 - tmp workaround for pack200 (#2863)
    
    Co-authored-by: Copilot Autofix powered by AI <[email protected]>
---
 .../apache/tika/parser/pkg/CompressorParser.java   |  53 ++++++++++++++++++++-
 .../tika/parser/pkg/CompressorParserTest.java      |  22 +++++++++
 .../test/resources/test-documents/testPACK200.pack | Bin 0 -> 530 bytes
 3 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 8535d304b8..c71939220c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -34,6 +34,9 @@ import static org.apache.tika.detect.zip.CompressorConstants.ZSTD;
 import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -113,6 +116,9 @@ public class CompressorParser implements Parser {
     private static Set<MediaType> SUPPORTED_TYPES;
     private static Map<String, String> MIMES_TO_NAME;
 
+    //pack200 archives start with the 4-byte magic 0xCAFED00D
+    private static final byte[] PACK200_MAGIC = {(byte) 0xCA, (byte) 0xFE, (byte) 0xD0, (byte) 0x0D};
+
     private Config defaultConfig = new Config();
 
     static {
@@ -233,7 +239,30 @@ public class CompressorParser implements Parser {
             //trust that and go with the appropriate name
             //to avoid calling CompressorStreamFactory.detect() twice
             String name = getStreamName(metadata);
-            if (name != null) {
+            boolean pack200 = CompressorStreamFactory.PACK200.equals(name);
+            if (name == null) {
+                //No content-type hint: peek to see whether this is pack200 so we can route it
+                //through the workaround below. Anything else falls through to autodetect unchanged.
+                pack200 = isPack200(tis);
+            }
+            if (pack200) {
+                // TIKA-4221 / COMPRESS-721 workaround: commons-compress' Pack200CompressorInputStream
+                // reflects into java.io internals (FilterInputStream.in / FileInputStream.path) to
+                // bound its input, which throws InaccessibleObjectException on Java 17+. A
+                // TikaInputStream is a FilterInputStream, so it triggers this. Spool to a file and
+                // reopen via Files.newInputStream (a ChannelInputStream) -- the one input type
+                // commons-compress does not reflect into. Pack200CompressorInputStream reads its
+                // input fully in the constructor (IN_MEMORY) and then serves bytes from an in-memory
+                // buffer, so the channel stream can be closed immediately afterward. Remove this once
+                // Tika depends on a commons-compress release that contains the COMPRESS-721 fix.
+                try (InputStream packStream = Files.newInputStream(tis.getPath())) {
+                    cis = factory.createCompressorInputStream(CompressorStreamFactory.PACK200,
+                            packStream);
+                }
+                if (name == null) {
+                    metadata.set(CONTENT_TYPE, PACK.toString());
+                }
+            } else if (name != null) {
                 cis = factory.createCompressorInputStream(name, tis);
             } else {
                 cis = factory.createCompressorInputStream(tis);
@@ -248,6 +277,11 @@ public class CompressorParser implements Parser {
                 throw new TikaMemoryLimitException(e.getMessage());
             }
             throw new TikaException("Unable to uncompress document stream", e);
+        } catch (IOException e) {
+            //the pack200 workaround (getPath()/Files.newInputStream) can throw IOException;
+            //make sure the close shield is removed before propagating
+            tis.removeCloseShield();
+            throw e;
         }
 
 
@@ -328,6 +362,23 @@ public class CompressorParser implements Parser {
         return MIMES_TO_NAME.get(mimeString);
     }
 
+    /**
+     * Peeks at the stream signature to determine whether it is a pack200 archive, without
+     * consuming the stream. Used so pack200 can be routed through the COMPRESS-721 workaround in
+     * {@link #parse}.
+     *
+     * @param tis the input, which must support mark/reset (a TikaInputStream always does)
+     * @return {@code true} if the signature matches pack200
+     */
+    private static boolean isPack200(TikaInputStream tis) {
+        try {
+            byte[] sig = new byte[PACK200_MAGIC.length];
+            return tis.peek(sig) == PACK200_MAGIC.length && Arrays.equals(sig, PACK200_MAGIC);
+        } catch (IOException e) {
+            return false;
+        }
+    }
+
     public Config getDefaultConfig() {
         return defaultConfig;
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 58dcb0a0d6..f3def60901 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -17,11 +17,15 @@
 package org.apache.tika.parser.pkg;
 
 
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
@@ -31,6 +35,8 @@ import org.junit.jupiter.api.Test;
 import org.apache.tika.TikaTest;
 import org.apache.tika.detect.zip.CompressorConstants;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 
@@ -71,6 +77,22 @@ public class CompressorParserTest extends TikaTest {
         }
     }
 
+    @Test
+    public void testPack200() throws Exception {
+        //TIKA-4221: commons-compress' Pack200CompressorInputStream throws an
+        //InaccessibleObjectException on Java 17+ when handed a FilterInputStream or a
+        //FileInputStream (a TikaInputStream is a FilterInputStream). CompressorParser must route
+        //pack200 through the spool-to-file workaround so it unpacks cleanly.
+        //testPACK200.pack is borrowed from Apache Commons Compress (HelloWorld.pack).
+        List<Metadata> metadataList = getRecursiveMetadata("testPACK200.pack");
+        assertEquals("application/x-java-pack200", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+        assertNull(metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION),
+                "pack200 should unpack without an exception");
+        //the pack200 archive must have been unpacked into at least one embedded document
+        assertTrue(metadataList.size() > 1,
+                "pack200 should have been unpacked into embedded content");
+    }
+
     @Test
     public void testQuineXHTML() throws Exception {
         //Anti-virus can surreptitiously remove this file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack
new file mode 100644
index 0000000000..7445d85a49
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack differ

Reply via email to