This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4395 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9f99c9a64bca53fd145fe632f890de234a097cc0 Author: tallison <[email protected]> AuthorDate: Thu Apr 10 08:17:47 2025 -0400 TIKA-4395 -- improve handling logging of container detection --- .../org/apache/tika/MultiThreadedTikaTest.java | 1 - .../detect/microsoft/POIFSContainerDetector.java | 23 ++++++++++++++-------- .../src/test/resources/log4j2.xml | 3 +++ .../detect/zip/DefaultZipContainerDetector.java | 4 ++-- .../tika/config/TikaConfigSerializerTest.java | 2 +- .../tika/detect/TestContainerAwareDetector.java | 4 ++++ 6 files changed, 25 insertions(+), 12 deletions(-) diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java index fd3f381d4..ee87f9bf7 100644 --- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java @@ -109,7 +109,6 @@ public class MultiThreadedTikaTest extends TikaTest { baseline.put(f, new Extract(metadataList)); } catch (Exception e) { - e.printStackTrace(); //swallow } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java index 2285630fb..f0605a78d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java @@ -16,6 +16,7 @@ */ package org.apache.tika.detect.microsoft; +import static org.apache.tika.mime.MediaType.OCTET_STREAM; import static org.apache.tika.mime.MediaType.application; import static org.apache.tika.mime.MediaType.image; @@ -602,19 +603,26 @@ public class POIFSContainerDetector implements Detector { return MediaType.OCTET_STREAM; } - if (! isOleHeader(input)) { - return MediaType.OCTET_STREAM; - } - TikaInputStream tis = TikaInputStream.cast(input); - if (tis == null) { - LOG.warn("POIFSContainerDetector requires a TikaInputStream for precise detection."); + if (tis != null) { + return handleTikaStream(tis, metadata); + } + if (isOleHeader(input)) { return OLE; } + return MediaType.OCTET_STREAM; + } + private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata) throws IOException { + //try for an open container Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata); - // We can only detect the exact type when given a TikaInputStream + //if that didn't work, confirm the bytes are OLE + if (names == null && ! isOleHeader(tis)) { + return OCTET_STREAM; + } + + // If OLE, spool to disk if (names == null) { // spool to disk and try detection names = getTopLevelNames(tis); @@ -625,7 +633,6 @@ public class POIFSContainerDetector implements Detector { tis.getOpenContainer() instanceof POIFSFileSystem) { return detect(names, ((POIFSFileSystem) tis.getOpenContainer()).getRoot()); } else { - //can we actually get here? return detect(names, null); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml index 1e9327e01..d609d7631 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/log4j2.xml @@ -36,5 +36,8 @@ <Logger name="org.apache.poi" level="ERROR" additivity="false"> <AppenderRef ref="Console"/> </Logger> + <Logger name="org.apache.tika.detect.microsoft.POIFSContainerDetector" level="ERROR" additivity="false"> + <AppenderRef ref="Console"/> + </Logger> </Loggers> </Configuration> \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java index 5b6567308..2c2669b85 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/DefaultZipContainerDetector.java @@ -262,9 +262,9 @@ public class DefaultZipContainerDetector implements Detector { } //problem opening zip file (truncated?) try (InputStream is = new BufferedInputStream(Files.newInputStream(tis.getPath()))) { - return detectStreaming(is, metadata); + return detectStreaming(is, metadata, false); } catch (IOException e) { - //swallow + //swallow } return MediaType.APPLICATION_ZIP; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java index d7313db6f..9ba4b4ab6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java @@ -46,7 +46,7 @@ public class TikaConfigSerializerTest { assertContains(encodingNeedle, xml); String detectorNeedle = "<detector class=\"org.apache.tika.detect.zip.DefaultZipContainerDetector\">" + - " <params> <param name=\"markLimit\" type=\"int\">16777216</param> </params>"; + " <params> <param name=\"markLimit\" type=\"int\">-1</param> </params>"; assertContains(detectorNeedle, xml); String parserNeedle = "<parser class=\"org.apache.tika.parser.pdf.PDFParser\">" + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java index d35df67bf..cb71b925e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java @@ -65,6 +65,10 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest { private final StreamingZipContainerDetector streamingZipDetector = new StreamingZipContainerDetector(); + TestContainerAwareDetector() { + streamingZipDetector.setMarkLimit(128 * 1024 * 1024); + } + @AfterEach public void tearDown() throws TikaException { //make sure to reset pool size because it is being randomly resized during the tests
