This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3976
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e6c389c5452ed7792c1fff5dcfad9de527d32833
Author: tallison <[email protected]>
AuthorDate: Fri Feb 17 11:32:40 2023 -0500

    TIKA-3976 -- allow users to turn off exception on zero-byte files
---
 .../org/apache/tika/parser/AutoDetectParser.java   | 10 ++++++----
 .../apache/tika/parser/AutoDetectParserConfig.java | 13 +++++++++++-
 .../src/test/java/org/apache/tika/TikaTest.java    | 18 ++++++++++-------
 .../tika/parser/AutoDetectParserConfigTest.java    | 23 +++++++++++++++++++++-
 .../test/resources/configs/tika-config-digests.xml |  1 +
 5 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 12c0e82ae..491ad572e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -177,11 +177,13 @@ public class AutoDetectParser extends CompositeParser {
             }
             //check for zero-byte inputstream
             if (tis.getOpenContainer() == null) {
-                tis.mark(1);
-                if (tis.read() == -1) {
-                    throw new ZeroByteFileException("InputStream must have > 0 bytes");
+                if (autoDetectParserConfig.getThrowOnZeroBytes()) {
+                    tis.mark(1);
+                    if (tis.read() == -1) {
+                        throw new ZeroByteFileException("InputStream must have > 0 bytes");
+                    }
+                    tis.reset();
                 }
-                tis.reset();
             }
             handler = decorateHandler(handler, metadata, context, autoDetectParserConfig);
             // TIKA-216: Zip bomb prevention
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 215b0bc32..d5a1567e1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -100,6 +100,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
 
     private DigestingParser.DigesterFactory digesterFactory = null;
 
+    private boolean throwOnZeroBytes = true;
+
     /**
      * Creates a SecureContentHandlerConfig using the passed in parameters.
      *
@@ -198,6 +200,14 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
         return this.digesterFactory;
     }
 
+    public void setThrowOnZeroBytes(boolean throwOnZeroBytes) {
+        this.throwOnZeroBytes = throwOnZeroBytes;
+    }
+
+    public boolean getThrowOnZeroBytes() {
+        return throwOnZeroBytes;
+    }
+
     @Override
     public String toString() {
         return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" +
@@ -206,7 +216,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
                 maximumPackageEntryDepth + ", metadataWriteFilterFactory=" +
                 metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" +
                 embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" +
-                contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + '}';
+                contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory +
+                ", throwOnZeroBytes=" + throwOnZeroBytes + '}';
     }
 }
 
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index fa112ca4c..a00d7b2b0 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -342,29 +342,33 @@ public abstract class TikaTest {
 
     protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context,
                                                   boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, new Metadata(), context,
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata, context,
                     suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
                                                   boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, parser, new Metadata(), new ParseContext(),
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, parser, metadata, new ParseContext(),
                     suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException)
             throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(p)) {
-            return getRecursiveMetadata(tis, new Metadata(), new ParseContext(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+            return getRecursiveMetadata(tis, metadata, new ParseContext(), suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(filePath)) {
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(filePath, metadata)) {
             return getRecursiveMetadata(tis, true);
         }
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 2a5dbf2b9..7ef747157 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -19,6 +19,8 @@ package org.apache.tika.parser;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.List;
 
 import org.junit.jupiter.api.Test;
@@ -104,7 +106,7 @@ public class AutoDetectParserConfigTest extends TikaTest {
         //test to make sure that the decorator is only applied once for
         //legacy (e.g. not RecursiveParserWrapperHandler) parsing
         TikaConfig tikaConfig = null;
-        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+        try (InputStream is = AutoDetectParserConfigTest.class.getResourceAsStream(
                 "/configs/tika-config-digests.xml")) {
             tikaConfig = new TikaConfig(is);
         }
@@ -138,4 +140,23 @@ public class AutoDetectParserConfigTest extends TikaTest {
         assertEquals("org.apache.tika.parser.EmptyParser",
                 metadataList.get(0).get("X-TIKA:Parsed-By"));
     }
+
+    @Test
+    public void testContainerZeroBytes() throws Exception {
+        Path tmp = Files.createTempFile("tika-test", "");
+        try {
+            TikaConfig tikaConfig = null;
+            try (InputStream is = AutoDetectParserConfigTest.class.getResourceAsStream(
+                    "/configs/tika-config-digests.xml")) {
+                tikaConfig = new TikaConfig(is);
+            }
+            Parser p = new AutoDetectParser(tikaConfig);
+            List<Metadata> metadataList = getRecursiveMetadata(tmp, p, true);
+            assertEquals("d41d8cd98f00b204e9800998ecf8427e",
+                    metadataList.get(0).get("X-TIKA:digest:MD5"));
+            assertEquals("0", metadataList.get(0).get(Metadata.CONTENT_LENGTH));
+        } finally {
+            Files.delete(tmp);
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
index 0ec913d50..c1fbb7b48 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
@@ -27,5 +27,6 @@
       <markLimit>100000</markLimit>
       <algorithmString>sha256:32,md5</algorithmString>
     </digesterFactory>
+    <throwOnZeroBytes>false</throwOnZeroBytes>
   </autoDetectParserConfig>
 </properties>

Reply via email to