This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-3976 in repository https://gitbox.apache.org/repos/asf/tika.git
commit e6c389c5452ed7792c1fff5dcfad9de527d32833 Author: tallison <[email protected]> AuthorDate: Fri Feb 17 11:32:40 2023 -0500 TIKA-3976 -- allow users to turn off exception on zero-byte files --- .../org/apache/tika/parser/AutoDetectParser.java | 10 ++++++---- .../apache/tika/parser/AutoDetectParserConfig.java | 13 +++++++++++- .../src/test/java/org/apache/tika/TikaTest.java | 18 ++++++++++------- .../tika/parser/AutoDetectParserConfigTest.java | 23 +++++++++++++++++++++- .../test/resources/configs/tika-config-digests.xml | 1 + 5 files changed, 52 insertions(+), 13 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 12c0e82ae..491ad572e 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -177,11 +177,13 @@ public class AutoDetectParser extends CompositeParser { } //check for zero-byte inputstream if (tis.getOpenContainer() == null) { - tis.mark(1); - if (tis.read() == -1) { - throw new ZeroByteFileException("InputStream must have > 0 bytes"); + if (autoDetectParserConfig.getThrowOnZeroBytes()) { + tis.mark(1); + if (tis.read() == -1) { + throw new ZeroByteFileException("InputStream must have > 0 bytes"); + } + tis.reset(); } - tis.reset(); } handler = decorateHandler(handler, metadata, context, autoDetectParserConfig); // TIKA-216: Zip bomb prevention diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java index 215b0bc32..d5a1567e1 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java @@ -100,6 +100,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable { private DigestingParser.DigesterFactory digesterFactory = null; + private boolean throwOnZeroBytes = true; + /** * Creates a SecureContentHandlerConfig using the passed in parameters. * @@ -198,6 +200,14 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable { return this.digesterFactory; } + public void setThrowOnZeroBytes(boolean throwOnZeroBytes) { + this.throwOnZeroBytes = throwOnZeroBytes; + } + + public boolean getThrowOnZeroBytes() { + return throwOnZeroBytes; + } + @Override public String toString() { return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" + @@ -206,7 +216,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable { maximumPackageEntryDepth + ", metadataWriteFilterFactory=" + metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" + embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" + - contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + '}'; + contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + + ", throwOnZeroBytes=" + throwOnZeroBytes + '}'; } } diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index fa112ca4c..a00d7b2b0 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -342,29 +342,33 @@ public abstract class TikaTest { protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception { - try (TikaInputStream tis = TikaInputStream.get(path)) { - return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, new Metadata(), context, + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { + return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata, context, suppressException); } } protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception { - try (TikaInputStream tis = TikaInputStream.get(path)) { - return getRecursiveMetadata(tis, parser, new Metadata(), new ParseContext(), + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(path, metadata)) { + return getRecursiveMetadata(tis, parser, metadata, new ParseContext(), suppressException); } } protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception { - try (TikaInputStream tis = TikaInputStream.get(p)) { - return getRecursiveMetadata(tis, new Metadata(), new ParseContext(), suppressException); + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(p, metadata)) { + return getRecursiveMetadata(tis, metadata, new ParseContext(), suppressException); } } protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception { - try (TikaInputStream tis = TikaInputStream.get(filePath)) { + Metadata metadata = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(filePath, metadata)) { return getRecursiveMetadata(tis, true); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java index 2a5dbf2b9..7ef747157 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java @@ -19,6 +19,8 @@ package org.apache.tika.parser; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import org.junit.jupiter.api.Test; @@ -104,7 +106,7 @@ public class AutoDetectParserConfigTest extends TikaTest { //test to make sure that the decorator is only applied once for //legacy (e.g. not RecursiveParserWrapperHandler) parsing TikaConfig tikaConfig = null; - try (InputStream is = OOXMLParserTest.class.getResourceAsStream( + try (InputStream is = AutoDetectParserConfigTest.class.getResourceAsStream( "/configs/tika-config-digests.xml")) { tikaConfig = new TikaConfig(is); } @@ -138,4 +140,23 @@ public class AutoDetectParserConfigTest extends TikaTest { assertEquals("org.apache.tika.parser.EmptyParser", metadataList.get(0).get("X-TIKA:Parsed-By")); } + + @Test + public void testContainerZeroBytes() throws Exception { + Path tmp = Files.createTempFile("tika-test", ""); + try { + TikaConfig tikaConfig = null; + try (InputStream is = AutoDetectParserConfigTest.class.getResourceAsStream( + "/configs/tika-config-digests.xml")) { + tikaConfig = new TikaConfig(is); + } + Parser p = new AutoDetectParser(tikaConfig); + List<Metadata> metadataList = getRecursiveMetadata(tmp, p, true); + assertEquals("d41d8cd98f00b204e9800998ecf8427e", + metadataList.get(0).get("X-TIKA:digest:MD5")); + assertEquals("0", metadataList.get(0).get(Metadata.CONTENT_LENGTH)); + } finally { + Files.delete(tmp); + } + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml index 0ec913d50..c1fbb7b48 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml @@ -27,5 +27,6 @@ <markLimit>100000</markLimit> <algorithmString>sha256:32,md5</algorithmString> </digesterFactory> + <throwOnZeroBytes>false</throwOnZeroBytes> </autoDetectParserConfig> </properties>
