This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4441 in repository https://gitbox.apache.org/repos/asf/tika.git
commit b58fbb301b1284fbd911dcc6e81f769a29df2571 Author: tallison <[email protected]> AuthorDate: Wed Jun 25 14:02:18 2025 -0400 TIKA-4441 -- revert markLimit and add unit tests --- CHANGES.txt | 5 +- .../detect/microsoft/POIFSContainerDetector.java | 37 +++++++++- .../tika/detect/TestContainerAwareDetector.java | 81 ++++++++++++++++++++++ .../src/test/resources/configs/tika-4441-120.xml | 36 ++++++++++ .../test/resources/configs/tika-4441-12000000.xml | 36 ++++++++++ .../src/test/resources/configs/tika-4441-neg1.xml | 36 ++++++++++ 6 files changed, 228 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index e95b5504a..d32c5af05 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,7 @@ -Release 3.2.1 - 06/20/2025 +Release 3.2.1 - 6/25/2025 + + * Fix POIFSContainerDetector regression when wrapping an InputStream in + a TikaInputStream (TIKA-4441). * Important bug fix for zip-based detection on a non-TikaInputStream (TIKA-4424). diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java index f0605a78d..eed7d36f0 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java @@ -32,6 +32,7 @@ import java.util.Set; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.poi.hssf.model.InternalWorkbook; import org.apache.poi.poifs.filesystem.DirectoryEntry; import org.apache.poi.poifs.filesystem.DirectoryNode; @@ -44,6 +45,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.Field; import org.apache.tika.detect.Detector; +import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -254,7 +256,7 @@ public class POIFSContainerDetector implements Detector { @Field - private int markLimit = -1; + private int markLimit = 128 * 1024 * 1024; /** * Internal detection of the specific kind of OLE2 document, based on the @@ -608,11 +610,42 @@ public class POIFSContainerDetector implements Detector { return handleTikaStream(tis, metadata); } if (isOleHeader(input)) { - return OLE; + if (markLimit < 0) { + return OLE; + } + return handleInputStream(input, metadata); } return MediaType.OCTET_STREAM; } + private MediaType handleInputStream(InputStream input, Metadata metadata) throws IOException { + if (markLimit < 0) { + return OLE; + } + BoundedInputStream bis = null; + try { + bis = new BoundedInputStream(markLimit, CloseShieldInputStream.wrap(input)); + bis.mark(markLimit); + try (POIFSFileSystem poifs = new POIFSFileSystem(CloseShieldInputStream.wrap(bis))) { + if (bis.hasHitBound()) { + return OLE; + } + Set<String> names = getTopLevelNames(poifs.getRoot()); + return detect(names, poifs.getRoot()); + } catch (SecurityException e) { + throw e; + } catch (IOException | RuntimeException e) { + //swallow + return OLE; + } + } finally { + if (bis != null) { + bis.reset(); + bis.close(); + } + } + } + private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata) throws IOException { //try for an open container Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index cb71b925e..c292802b3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -30,10 +30,14 @@ import java.util.List; import java.util.Objects; import java.util.Random; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; +import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; +import org.xml.sax.SAXException; import org.apache.tika.MultiThreadedTikaTest; import org.apache.tika.Tika; @@ -606,4 +610,81 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest { assertTypeByData("testWEBARCHIVE.webarchive", "application/x-bplist-webarchive"); assertTypeByData("testBPList.bplist", "application/x-bplist-itunes"); } + + @Test + public void testPOIFSContainerDetector() throws Exception { + UnsynchronizedByteArrayOutputStream baos = UnsynchronizedByteArrayOutputStream.builder().get(); + try (InputStream is = getResourceAsStream("/test-documents/testWORD.doc")) { + IOUtils.copy(is, baos); + } + byte[] bytes = baos.toByteArray(); + long len = bytes.length; + + //test default + Detector detector = TikaConfig.getDefaultConfig().getDetector(); + try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) { + assertEquals("application/msword", + detector.detect(is, new Metadata()).toString()); + assertEquals(len, countBytes(is)); + } + + detector = loadDetector("tika-4441-neg1.xml"); + try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) { + assertEquals("application/x-tika-msoffice", + detector.detect(is, new Metadata()).toString()); + assertEquals(len, countBytes(is)); + } + + detector = loadDetector("tika-4441-120.xml"); + try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) { + assertEquals("application/x-tika-msoffice", + detector.detect(is, new Metadata()).toString()); + assertEquals(len, countBytes(is)); + } + + detector = loadDetector("tika-4441-12000000.xml"); + try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) { + assertEquals("application/msword", + detector.detect(is, new Metadata()).toString()); + assertEquals(len, countBytes(is)); + } + + //now try wrapping in a TikaInputStream + detector = loadDetector("tika-4441-neg1.xml"); + try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) { + assertEquals("application/msword", + detector.detect(is, new Metadata()).toString()); + assertEquals(len, countBytes(is)); + } + + detector = loadDetector("tika-4441-120.xml"); + try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) { + assertEquals("application/x-tika-msoffice", + detector.detect(is, new Metadata()).toString()); + assertEquals(len, countBytes(is)); + } + + detector = loadDetector("tika-4441-12000000.xml"); + try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) { + assertEquals("application/msword", + detector.detect(is, new Metadata()).toString()); + assertEquals(len, countBytes(is)); + } + } + + private long countBytes(InputStream is) throws IOException { + int b = is.read(); + long len = 0; + while (b > -1) { + len++; + b = is.read(); + } + return len; + } + + private Detector loadDetector(String tikaConfigName) throws IOException, TikaException, SAXException { + try (InputStream is = TestContainerAwareDetector.class.getResourceAsStream("/configs/" + tikaConfigName)) { + return new TikaConfig(is).getDetector(); + } + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml new file mode 100644 index 000000000..6e9bf3517 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <detectors> + <detector class="org.gagravarr.tika.OggDetector"/> + <detector class="org.apache.tika.detect.apple.BPListDetector"/> + <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/> + <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector"> + <params> + <param name="markLimit" type="int">120</param> + </params> + </detector> + <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/> + <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector"> + <params> + <param name="markLimit" type="int">16777216</param> + </params> + </detector> + <detector class="org.apache.tika.mime.MimeTypes"/> + </detectors> +</properties> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml new file mode 100644 index 000000000..a438b5c63 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <detectors> + <detector class="org.gagravarr.tika.OggDetector"/> + <detector class="org.apache.tika.detect.apple.BPListDetector"/> + <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/> + <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector"> + <params> + <param name="markLimit" type="int">12000000</param> + </params> + </detector> + <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/> + <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector"> + <params> + <param name="markLimit" type="int">16777216</param> + </params> + </detector> + <detector class="org.apache.tika.mime.MimeTypes"/> + </detectors> +</properties> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml new file mode 100644 index 000000000..74c011268 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml @@ -0,0 +1,36 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <detectors> + <detector class="org.gagravarr.tika.OggDetector"/> + <detector class="org.apache.tika.detect.apple.BPListDetector"/> + <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/> + <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector"> + <params> + <param name="markLimit" type="int">-1</param> + </params> + </detector> + <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/> + <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector"> + <params> + <param name="markLimit" type="int">16777216</param> + </params> + </detector> + <detector class="org.apache.tika.mime.MimeTypes"/> + </detectors> +</properties> \ No newline at end of file
