This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new ce700b6c8e TIKA-4221 - tmp workaround for pack200 (#2863)
ce700b6c8e is described below
commit ce700b6c8ee786c51bc722158a628f462ad9488e
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jun 4 05:47:32 2026 -0400
TIKA-4221 - tmp workaround for pack200 (#2863)
Co-authored-by: Copilot Autofix powered by AI <[email protected]>
---
.../apache/tika/parser/pkg/CompressorParser.java | 53 ++++++++++++++++++++-
.../tika/parser/pkg/CompressorParserTest.java | 22 +++++++++
.../test/resources/test-documents/testPACK200.pack | Bin 0 -> 530 bytes
3 files changed, 74 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 8535d304b8..c71939220c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -34,6 +34,9 @@ import static org.apache.tika.detect.zip.CompressorConstants.ZSTD;
import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -113,6 +116,9 @@ public class CompressorParser implements Parser {
private static Set<MediaType> SUPPORTED_TYPES;
private static Map<String, String> MIMES_TO_NAME;
+ //pack200 archives start with the 4-byte magic 0xCAFED00D
+ private static final byte[] PACK200_MAGIC = {(byte) 0xCA, (byte) 0xFE, (byte) 0xD0, (byte) 0x0D};
+
private Config defaultConfig = new Config();
static {
@@ -233,7 +239,30 @@ public class CompressorParser implements Parser {
//trust that and go with the appropriate name
//to avoid calling CompressorStreamFactory.detect() twice
String name = getStreamName(metadata);
- if (name != null) {
+ boolean pack200 = CompressorStreamFactory.PACK200.equals(name);
+ if (name == null) {
+ //No content-type hint: peek to see whether this is pack200 so we can route it
+ //through the workaround below. Anything else falls through to autodetect unchanged.
+ pack200 = isPack200(tis);
+ }
+ if (pack200) {
+ // TIKA-4221 / COMPRESS-721 workaround: commons-compress' Pack200CompressorInputStream
+ // reflects into java.io internals (FilterInputStream.in / FileInputStream.path) to
+ // bound its input, which throws InaccessibleObjectException on Java 17+. A
+ // TikaInputStream is a FilterInputStream, so it triggers this. Spool to a file and
+ // reopen via Files.newInputStream (a ChannelInputStream) -- the one input type
+ // commons-compress does not reflect into. Pack200CompressorInputStream reads its
+ // input fully in the constructor (IN_MEMORY) and then serves bytes from an in-memory
+ // buffer, so the channel stream can be closed immediately afterward. Remove this once
+ // Tika depends on a commons-compress release that contains the COMPRESS-721 fix.
+ try (InputStream packStream = Files.newInputStream(tis.getPath())) {
+ cis = factory.createCompressorInputStream(CompressorStreamFactory.PACK200,
+ packStream);
+ }
+ if (name == null) {
+ metadata.set(CONTENT_TYPE, PACK.toString());
+ }
+ } else if (name != null) {
cis = factory.createCompressorInputStream(name, tis);
} else {
cis = factory.createCompressorInputStream(tis);
@@ -248,6 +277,11 @@ public class CompressorParser implements Parser {
throw new TikaMemoryLimitException(e.getMessage());
}
throw new TikaException("Unable to uncompress document stream", e);
+ } catch (IOException e) {
+ //the pack200 workaround (getPath()/Files.newInputStream) can throw IOException;
+ //make sure the close shield is removed before propagating
+ tis.removeCloseShield();
+ throw e;
}
@@ -328,6 +362,23 @@ public class CompressorParser implements Parser {
return MIMES_TO_NAME.get(mimeString);
}
+ /**
+ * Peeks at the stream signature to determine whether it is a pack200 archive, without
+ * consuming the stream. Used so pack200 can be routed through the COMPRESS-721 workaround in
+ * {@link #parse}.
+ *
+ * @param tis the input, which must support mark/reset (a TikaInputStream always does)
+ * @return {@code true} if the signature matches pack200
+ */
+ private static boolean isPack200(TikaInputStream tis) {
+ try {
+ byte[] sig = new byte[PACK200_MAGIC.length];
+ return tis.peek(sig) == PACK200_MAGIC.length && Arrays.equals(sig, PACK200_MAGIC);
+ } catch (IOException e) {
+ return false;
+ }
+ }
+
public Config getDefaultConfig() {
return defaultConfig;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 58dcb0a0d6..f3def60901 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -17,11 +17,15 @@
package org.apache.tika.parser.pkg;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
@@ -31,6 +35,8 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.detect.zip.CompressorConstants;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
@@ -71,6 +77,22 @@ public class CompressorParserTest extends TikaTest {
}
}
+ @Test
+ public void testPack200() throws Exception {
+ //TIKA-4221: commons-compress' Pack200CompressorInputStream throws an
+ //InaccessibleObjectException on Java 17+ when handed a FilterInputStream or a
+ //FileInputStream (a TikaInputStream is a FilterInputStream). CompressorParser must route
+ //pack200 through the spool-to-file workaround so it unpacks cleanly.
+ //testPACK200.pack is borrowed from Apache Commons Compress (HelloWorld.pack).
+ List<Metadata> metadataList = getRecursiveMetadata("testPACK200.pack");
+ assertEquals("application/x-java-pack200", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+ assertNull(metadataList.get(0).get(TikaCoreProperties.CONTAINER_EXCEPTION),
+ "pack200 should unpack without an exception");
+ //the pack200 archive must have been unpacked into at least one embedded document
+ assertTrue(metadataList.size() > 1,
+ "pack200 should have been unpacked into embedded content");
+ }
+
@Test
public void testQuineXHTML() throws Exception {
//Anti-virus can surreptitiously remove this file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack
new file mode 100644
index 0000000000..7445d85a49
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testPACK200.pack differ