This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e48b10fe9 TIKA-3976 (#972)
e48b10fe9 is described below
commit e48b10fe917b47cb7660227e558c8be4e15a84dd
Author: Tim Allison <[email protected]>
AuthorDate: Fri Feb 17 13:34:58 2023 -0500
TIKA-3976 (#972)
* TIKA-3976 -- allow users to turn off exception on zero-byte files
---
CHANGES.txt | 3 +++
.../org/apache/tika/parser/AutoDetectParser.java | 10 ++++++----
.../apache/tika/parser/AutoDetectParserConfig.java | 13 +++++++++++-
.../src/test/java/org/apache/tika/TikaTest.java | 18 ++++++++++-------
.../tika/parser/AutoDetectParserConfigTest.java | 23 +++++++++++++++++++++-
.../test/resources/configs/tika-config-digests.xml | 1 +
6 files changed, 55 insertions(+), 13 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 5c48a2efd..87671965c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.7.1 - ???
+ * Users may now avoid the ZeroByteFileException via a
+ setting on the AutoDetectParserConfig (TIKA-3976).
+
* Fix bug in closing <a> elements in the presence of <b> elements
in RTF files (TIKA-3972).
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 12c0e82ae..491ad572e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -177,11 +177,13 @@ public class AutoDetectParser extends CompositeParser {
}
//check for zero-byte inputstream
if (tis.getOpenContainer() == null) {
- tis.mark(1);
- if (tis.read() == -1) {
- throw new ZeroByteFileException("InputStream must have > 0 bytes");
+ if (autoDetectParserConfig.getThrowOnZeroBytes()) {
+ tis.mark(1);
+ if (tis.read() == -1) {
+ throw new ZeroByteFileException("InputStream must have > 0 bytes");
+ }
+ tis.reset();
}
- tis.reset();
}
handler = decorateHandler(handler, metadata, context,
autoDetectParserConfig);
// TIKA-216: Zip bomb prevention
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 215b0bc32..d5a1567e1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -100,6 +100,8 @@ public class AutoDetectParserConfig extends ConfigBase
implements Serializable {
private DigestingParser.DigesterFactory digesterFactory = null;
+ private boolean throwOnZeroBytes = true;
+
/**
* Creates a SecureContentHandlerConfig using the passed in parameters.
*
@@ -198,6 +200,14 @@ public class AutoDetectParserConfig extends ConfigBase
implements Serializable {
return this.digesterFactory;
}
+ public void setThrowOnZeroBytes(boolean throwOnZeroBytes) {
+ this.throwOnZeroBytes = throwOnZeroBytes;
+ }
+
+ public boolean getThrowOnZeroBytes() {
+ return throwOnZeroBytes;
+ }
+
@Override
public String toString() {
return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" +
@@ -206,7 +216,8 @@ public class AutoDetectParserConfig extends ConfigBase
implements Serializable {
maximumPackageEntryDepth + ", metadataWriteFilterFactory=" +
metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" +
embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" +
- contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + '}';
+ contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory +
+ ", throwOnZeroBytes=" + throwOnZeroBytes + '}';
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index fa112ca4c..a00d7b2b0 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -342,29 +342,33 @@ public abstract class TikaTest {
protected List<Metadata> getRecursiveMetadata(Path path, ParseContext
context,
boolean suppressException)
throws Exception {
- try (TikaInputStream tis = TikaInputStream.get(path)) {
- return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, new
Metadata(), context,
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+ return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata,
context,
suppressException);
}
}
protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
boolean suppressException)
throws Exception {
- try (TikaInputStream tis = TikaInputStream.get(path)) {
- return getRecursiveMetadata(tis, parser, new Metadata(), new
ParseContext(),
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+ return getRecursiveMetadata(tis, parser, metadata, new
ParseContext(),
suppressException);
}
}
protected List<Metadata> getRecursiveMetadata(Path p, boolean
suppressException)
throws Exception {
- try (TikaInputStream tis = TikaInputStream.get(p)) {
- return getRecursiveMetadata(tis, new Metadata(), new
ParseContext(), suppressException);
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+ return getRecursiveMetadata(tis, metadata, new ParseContext(),
suppressException);
}
}
protected List<Metadata> getRecursiveMetadata(Path filePath) throws
Exception {
- try (TikaInputStream tis = TikaInputStream.get(filePath)) {
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(filePath, metadata)) {
return getRecursiveMetadata(tis, true);
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 2a5dbf2b9..7ef747157 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -19,6 +19,8 @@ package org.apache.tika.parser;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
@@ -104,7 +106,7 @@ public class AutoDetectParserConfigTest extends TikaTest {
//test to make sure that the decorator is only applied once for
//legacy (e.g. not RecursiveParserWrapperHandler) parsing
TikaConfig tikaConfig = null;
- try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+ try (InputStream is =
AutoDetectParserConfigTest.class.getResourceAsStream(
"/configs/tika-config-digests.xml")) {
tikaConfig = new TikaConfig(is);
}
@@ -138,4 +140,23 @@ public class AutoDetectParserConfigTest extends TikaTest {
assertEquals("org.apache.tika.parser.EmptyParser",
metadataList.get(0).get("X-TIKA:Parsed-By"));
}
+
+ @Test
+ public void testContainerZeroBytes() throws Exception {
+ Path tmp = Files.createTempFile("tika-test", "");
+ try {
+ TikaConfig tikaConfig = null;
+ try (InputStream is =
AutoDetectParserConfigTest.class.getResourceAsStream(
+ "/configs/tika-config-digests.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(tikaConfig);
+ List<Metadata> metadataList = getRecursiveMetadata(tmp, p, true);
+ assertEquals("d41d8cd98f00b204e9800998ecf8427e",
+ metadataList.get(0).get("X-TIKA:digest:MD5"));
+ assertEquals("0",
metadataList.get(0).get(Metadata.CONTENT_LENGTH));
+ } finally {
+ Files.delete(tmp);
+ }
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
index 0ec913d50..c1fbb7b48 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
@@ -27,5 +27,6 @@
<markLimit>100000</markLimit>
<algorithmString>sha256:32,md5</algorithmString>
</digesterFactory>
+ <throwOnZeroBytes>false</throwOnZeroBytes>
</autoDetectParserConfig>
</properties>