This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new fbac9f217 TIKA-4441 -- revert markLimit and add unit tests (#2261)
fbac9f217 is described below
commit fbac9f217088a06360fedc2017fe656d2c7a4c49
Author: Tim Allison <[email protected]>
AuthorDate: Wed Jun 25 14:26:31 2025 -0400
TIKA-4441 -- revert markLimit and add unit tests (#2261)
# Conflicts:
# CHANGES.txt
---
CHANGES.txt | 3 +
.../detect/microsoft/POIFSContainerDetector.java | 35 +++++++++-
.../tika/detect/TestContainerAwareDetector.java | 81 ++++++++++++++++++++++
.../src/test/resources/configs/tika-4441-120.xml | 36 ++++++++++
.../test/resources/configs/tika-4441-12000000.xml | 36 ++++++++++
.../src/test/resources/configs/tika-4441-neg1.xml | 36 ++++++++++
6 files changed, 226 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index d4de220e6..6c506e88b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -11,6 +11,9 @@ Release 4.0.0-BETA1 - ???
Release 3.2.1 - ???
+ * Fix POIFSContainerDetector regression when wrapping an InputStream in
+ a TikaInputStream (TIKA-4441).
+
* Important bug fix for zip-based detection on a non-TikaInputStream
(TIKA-4424).
* Improve text extraction from EMF (TIKA-4432).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index f0605a78d..ef36eaed3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -32,6 +32,7 @@ import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
@@ -44,6 +45,7 @@ import org.slf4j.LoggerFactory;
import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
+import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -608,11 +610,42 @@ public class POIFSContainerDetector implements Detector {
return handleTikaStream(tis, metadata);
}
if (isOleHeader(input)) {
- return OLE;
+ if (markLimit < 0) {
+ return OLE;
+ }
+ return handleInputStream(input, metadata);
}
return MediaType.OCTET_STREAM;
}
+ private MediaType handleInputStream(InputStream input, Metadata metadata) throws IOException {
+ if (markLimit < 0) {
+ return OLE;
+ }
+ BoundedInputStream bis = null;
+ try {
+ bis = new BoundedInputStream(markLimit, CloseShieldInputStream.wrap(input));
+ bis.mark(markLimit);
+ try (POIFSFileSystem poifs = new POIFSFileSystem(CloseShieldInputStream.wrap(bis))) {
+ if (bis.hasHitBound()) {
+ return OLE;
+ }
+ Set<String> names = getTopLevelNames(poifs.getRoot());
+ return detect(names, poifs.getRoot());
+ } catch (SecurityException e) {
+ throw e;
+ } catch (IOException | RuntimeException e) {
+ //swallow
+ return OLE;
+ }
+ } finally {
+ if (bis != null) {
+ bis.reset();
+ bis.close();
+ }
+ }
+ }
+
private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata)
throws IOException {
//try for an open container
Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index cb71b925e..04e431361 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -30,10 +30,14 @@ import java.util.List;
import java.util.Objects;
import java.util.Random;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
@@ -606,4 +610,81 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
assertTypeByData("testWEBARCHIVE.webarchive",
"application/x-bplist-webarchive");
assertTypeByData("testBPList.bplist", "application/x-bplist-itunes");
}
+
+ @Test
+ public void testPOIFSContainerDetector() throws Exception {
+ UnsynchronizedByteArrayOutputStream baos = UnsynchronizedByteArrayOutputStream.builder().get();
+ try (InputStream is = getResourceAsStream("/test-documents/testWORD.doc")) {
+ IOUtils.copy(is, baos);
+ }
+ byte[] bytes = baos.toByteArray();
+ long len = bytes.length;
+
+ //test default
+ Detector detector = TikaConfig.getDefaultConfig().getDetector();
+ try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+ assertEquals("application/x-tika-msoffice",
+ detector.detect(is, new Metadata()).toString());
+ assertEquals(len, countBytes(is));
+ }
+
+ detector = loadDetector("tika-4441-neg1.xml");
+ try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+ assertEquals("application/x-tika-msoffice",
+ detector.detect(is, new Metadata()).toString());
+ assertEquals(len, countBytes(is));
+ }
+
+ detector = loadDetector("tika-4441-120.xml");
+ try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+ assertEquals("application/x-tika-msoffice",
+ detector.detect(is, new Metadata()).toString());
+ assertEquals(len, countBytes(is));
+ }
+
+ detector = loadDetector("tika-4441-12000000.xml");
+ try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
+ assertEquals("application/msword",
+ detector.detect(is, new Metadata()).toString());
+ assertEquals(len, countBytes(is));
+ }
+
+ //now try wrapping in a TikaInputStream
+ detector = loadDetector("tika-4441-neg1.xml");
+ try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
+ assertEquals("application/msword",
+ detector.detect(is, new Metadata()).toString());
+ assertEquals(len, countBytes(is));
+ }
+
+ detector = loadDetector("tika-4441-120.xml");
+ try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
+ assertEquals("application/x-tika-msoffice",
+ detector.detect(is, new Metadata()).toString());
+ assertEquals(len, countBytes(is));
+ }
+
+ detector = loadDetector("tika-4441-12000000.xml");
+ try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
+ assertEquals("application/msword",
+ detector.detect(is, new Metadata()).toString());
+ assertEquals(len, countBytes(is));
+ }
+ }
+
+ private long countBytes(InputStream is) throws IOException {
+ int b = is.read();
+ long len = 0;
+ while (b > -1) {
+ len++;
+ b = is.read();
+ }
+ return len;
+ }
+
+ private Detector loadDetector(String tikaConfigName) throws IOException, TikaException, SAXException {
+ try (InputStream is = TestContainerAwareDetector.class.getResourceAsStream("/configs/" + tikaConfigName)) {
+ return new TikaConfig(is).getDetector();
+ }
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml
new file mode 100644
index 000000000..6e9bf3517
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-120.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <detectors>
+ <detector class="org.gagravarr.tika.OggDetector"/>
+ <detector class="org.apache.tika.detect.apple.BPListDetector"/>
+ <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
+ <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
+ <params>
+ <param name="markLimit" type="int">120</param>
+ </params>
+ </detector>
+ <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
+ <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
+ <params>
+ <param name="markLimit" type="int">16777216</param>
+ </params>
+ </detector>
+ <detector class="org.apache.tika.mime.MimeTypes"/>
+ </detectors>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml
new file mode 100644
index 000000000..a438b5c63
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-12000000.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <detectors>
+ <detector class="org.gagravarr.tika.OggDetector"/>
+ <detector class="org.apache.tika.detect.apple.BPListDetector"/>
+ <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
+ <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
+ <params>
+ <param name="markLimit" type="int">12000000</param>
+ </params>
+ </detector>
+ <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
+ <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
+ <params>
+ <param name="markLimit" type="int">16777216</param>
+ </params>
+ </detector>
+ <detector class="org.apache.tika.mime.MimeTypes"/>
+ </detectors>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml
new file mode 100644
index 000000000..74c011268
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4441-neg1.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <detectors>
+ <detector class="org.gagravarr.tika.OggDetector"/>
+ <detector class="org.apache.tika.detect.apple.BPListDetector"/>
+ <detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
+ <detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
+ <params>
+ <param name="markLimit" type="int">-1</param>
+ </params>
+ </detector>
+ <detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
+ <detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
+ <params>
+ <param name="markLimit" type="int">16777216</param>
+ </params>
+ </detector>
+ <detector class="org.apache.tika.mime.MimeTypes"/>
+ </detectors>
+</properties>
\ No newline at end of file