This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new f8869e351 TIKA-4441 -- revert markLimit and add unit tests (#2261)
f8869e351 is described below

commit f8869e3516da2d45ae8e02cf509d29a6bec0d9c0
Author: Tim Allison <[email protected]>
AuthorDate: Wed Jun 25 14:26:31 2025 -0400

    TIKA-4441 -- revert markLimit and add unit tests (#2261)
---
 CHANGES.txt                                        |  5 +-
 .../detect/microsoft/POIFSContainerDetector.java   | 37 +++++++++-
 .../tika/detect/TestContainerAwareDetector.java    | 81 ++++++++++++++++++++++
 .../src/test/resources/configs/tika-4441-120.xml   | 36 ++++++++++
 .../test/resources/configs/tika-4441-12000000.xml  | 36 ++++++++++
 .../src/test/resources/configs/tika-4441-neg1.xml  | 36 ++++++++++
 6 files changed, 228 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index e95b5504a..d32c5af05 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,7 @@
-Release 3.2.1 - 06/20/2025
+Release 3.2.1 - 6/25/2025
+
+  * Fix POIFSContainerDetector regression when wrapping an InputStream in
+    a TikaInputStream (TIKA-4441).
 
   * Important bug fix for zip-based detection on a non-TikaInputStream (TIKA-4424).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index f0605a78d..eed7d36f0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -32,6 +32,7 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.hssf.model.InternalWorkbook;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
@@ -44,6 +45,7 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -254,7 +256,7 @@ public class POIFSContainerDetector implements Detector {
 
 
     @Field
-    private int markLimit = -1;
+    private int markLimit = 128 * 1024 * 1024;
 
     /**
      * Internal detection of the specific kind of OLE2 document, based on the
@@ -608,11 +610,42 @@ public class POIFSContainerDetector implements Detector {
             return handleTikaStream(tis, metadata);
         }
         if (isOleHeader(input)) {
-            return OLE;
+            if (markLimit < 0) {
+                return OLE;
+            }
+            return handleInputStream(input, metadata);
         }
         return MediaType.OCTET_STREAM;
     }
 
+    private MediaType handleInputStream(InputStream input, Metadata metadata) throws IOException {
+        if (markLimit < 0) {
+            return OLE;
+        }
+        BoundedInputStream bis = null;
+        try {
+            bis = new BoundedInputStream(markLimit, CloseShieldInputStream.wrap(input));
+            bis.mark(markLimit);
+            try (POIFSFileSystem poifs = new POIFSFileSystem(CloseShieldInputStream.wrap(bis))) {
+                if (bis.hasHitBound()) {
+                    return OLE;
+                }
+                Set<String> names = getTopLevelNames(poifs.getRoot());
+                return detect(names, poifs.getRoot());
+            } catch (SecurityException e) {
+                throw e;
+            } catch (IOException | RuntimeException e) {
+                //swallow
+                return OLE;
+            }
+        } finally {
+            if (bis != null) {
+                bis.reset();
+                bis.close();
+            }
+        }
+    }
+
     private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata) throws IOException {
         //try for an open container
         Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index cb71b925e..c292802b3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -30,10 +30,14 @@ import java.util.List;
 import java.util.Objects;
 import java.util.Random;
 
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;
 
 import org.apache.tika.MultiThreadedTikaTest;
 import org.apache.tika.Tika;
@@ -606,4 +610,81 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
         assertTypeByData("testWEBARCHIVE.webarchive", "application/x-bplist-webarchive");
         assertTypeByData("testBPList.bplist", "application/x-bplist-itunes");
     }
+
+    @Test
+    public void testPOIFSContainerDetector() throws Exception {
+        UnsynchronizedByteArrayOutputStream baos = UnsynchronizedByteArrayOutputStream.builder().get();
+        try (InputStream is = getResourceAsStream("/test-documents/testWORD.doc")) {
+            IOUtils.copy(is, baos);
+        }
+        byte[] bytes = baos.toByteArray();
+        long len = bytes.length;
+
+        //test default
+        Detector detector = TikaConfig.getDefaultConfig().getDetector();
+        try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+            assertEquals("application/msword",
+                    detector.detect(is, new Metadata()).toString());
+            assertEquals(len, countBytes(is));
+        }
+
+        detector = loadDetector("tika-4441-neg1.xml");
+        try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+            assertEquals("application/x-tika-msoffice",
+                    detector.detect(is, new Metadata()).toString());
+            assertEquals(len, countBytes(is));
+        }
+
+        detector = loadDetector("tika-4441-120.xml");
+        try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+            assertEquals("application/x-tika-msoffice",
+                    detector.detect(is, new Metadata()).toString());
+            assertEquals(len, countBytes(is));
+        }
+
+        detector = loadDetector("tika-4441-12000000.xml");
+        try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+            assertEquals("application/msword",
+                    detector.detect(is, new Metadata()).toString());
+            assertEquals(len, countBytes(is));
+        }
+
+        //now try wrapping in a TikaInputStream
+        detector = loadDetector("tika-4441-neg1.xml");
+        try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
+            assertEquals("application/msword",
+                    detector.detect(is, new Metadata()).toString());
+            assertEquals(len, countBytes(is));
+        }
+
+        detector = loadDetector("tika-4441-120.xml");
+        try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
+            assertEquals("application/x-tika-msoffice",
+                    detector.detect(is, new Metadata()).toString());
+            assertEquals(len, countBytes(is));
+        }
+
+        detector = loadDetector("tika-4441-12000000.xml");
+        try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
+            assertEquals("application/msword",
+                    detector.detect(is, new Metadata()).toString());
+            assertEquals(len, countBytes(is));
+        }
+    }
+
+    private long countBytes(InputStream is) throws IOException {
+        int b = is.read();
+        long len = 0;
+        while (b > -1) {
+            len++;
+            b = is.read();
+        }
+        return len;
+    }
+
+    private Detector loadDetector(String tikaConfigName) throws IOException, TikaException, SAXException {
+        try (InputStream is = TestContainerAwareDetector.class.getResourceAsStream("/configs/" + tikaConfigName)) {
+            return new TikaConfig(is).getDetector();
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml
new file mode 100644
index 000000000..6e9bf3517
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <detectors>
+    <detector class="org.gagravarr.tika.OggDetector"/>
+    <detector class="org.apache.tika.detect.apple.BPListDetector"/>
+    <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
+    <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
+      <params>
+        <param name="markLimit" type="int">120</param>
+      </params>
+    </detector>
+    <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
+    <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
+      <params>
+        <param name="markLimit" type="int">16777216</param>
+      </params>
+    </detector>
+    <detector class="org.apache.tika.mime.MimeTypes"/>
+  </detectors>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml
new file mode 100644
index 000000000..a438b5c63
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <detectors>
+    <detector class="org.gagravarr.tika.OggDetector"/>
+    <detector class="org.apache.tika.detect.apple.BPListDetector"/>
+    <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
+    <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
+      <params>
+        <param name="markLimit" type="int">12000000</param>
+      </params>
+    </detector>
+    <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
+    <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
+      <params>
+        <param name="markLimit" type="int">16777216</param>
+      </params>
+    </detector>
+    <detector class="org.apache.tika.mime.MimeTypes"/>
+  </detectors>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml
new file mode 100644
index 000000000..74c011268
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <detectors>
+    <detector class="org.gagravarr.tika.OggDetector"/>
+    <detector class="org.apache.tika.detect.apple.BPListDetector"/>
+    <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
+    <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
+      <params>
+        <param name="markLimit" type="int">-1</param>
+      </params>
+    </detector>
+    <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
+    <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
+      <params>
+        <param name="markLimit" type="int">16777216</param>
+      </params>
+    </detector>
+    <detector class="org.apache.tika.mime.MimeTypes"/>
+  </detectors>
+</properties>
\ No newline at end of file

Reply via email to