This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new a153397 TIKA-2450 -- AutoDetectParser should throw a
ZeroByteFileException for zero-byte files after detection on the file extension.
a153397 is described below
commit a1533977852307c5095efaebfcc5a896d914a57c
Author: tballison <[email protected]>
AuthorDate: Wed Aug 30 13:16:15 2017 -0400
TIKA-2450 -- AutoDetectParser should throw a ZeroByteFileException for
zero-byte files after detection on the file extension.
---
CHANGES.txt | 3 +
.../tika/exception/ZeroByteFileException.java | 11 ++++
.../org/apache/tika/parser/AutoDetectParser.java | 10 +++-
.../apache/tika/parser/AutoDetectParserTest.java | 64 ++++++++++++++++------
4 files changed, 71 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 17573b1..a66b15c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.17 - ???
+ * AutoDetectParser throws ZeroByteFileException for zero-byte files after
+ detection on the file extension (TIKA-2450).
+
* Extract phonetic runs in docx with experimental SAX parser (TIKA-2448).
* Extract phonetic runs from xls and allow users to turn off extraction
diff --git a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
new file mode 100644
index 0000000..b497f1d
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
@@ -0,0 +1,11 @@
+package org.apache.tika.exception;
+
+/**
+ * Exception thrown by the AutoDetectParser when the input stream contains zero bytes.
+ */
+public class ZeroByteFileException extends TikaException {
+
+ public ZeroByteFileException(String msg) {
+ super(msg);
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index fe1c659..a42c7ca 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -23,6 +23,7 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TemporaryResources;
@@ -114,7 +115,14 @@ public class AutoDetectParser extends CompositeParser {
// Automatically detect the MIME type of the document
MediaType type = detector.detect(tis, metadata);
metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
+ //check for zero-byte inputstream
+ if (tis.getOpenContainer() == null) {
+ tis.mark(1);
+ if (tis.read() == -1) {
+ throw new ZeroByteFileException("InputStream must have > 0 bytes");
+ }
+ tis.reset();
+ }
// TIKA-216: Zip bomb prevention
SecureContentHandler sch =
handler != null ? new SecureContentHandler(handler, tis) : null;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 817308f..a46564e 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -16,24 +16,12 @@
*/
package org.apache.tika.parser;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipOutputStream;
-
+import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.XMPDM;
@@ -45,7 +33,22 @@ import org.gagravarr.tika.VorbisParser;
import org.junit.Test;
import org.xml.sax.ContentHandler;
-public class AutoDetectParserTest {
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class AutoDetectParserTest extends TikaTest {
private TikaConfig tika = TikaConfig.getDefaultConfig();
// Easy to read constants for the MIME types:
@@ -380,6 +383,35 @@ public class AutoDetectParserTest {
assertEquals("value", metadata.get("MyParser"));
}
+ @Test
+ public void testZeroByteFileException() throws Exception {
+ String[] exts = new String[]{ // extensions appended to "file." to form the resource name
+ "xls",
+ "doc",
+ "pdf",
+ "rtf"
+ };
+
+ String[] mimes = new String[]{ // content type expected for the extension at the same index
+ EXCEL,
+ WORD,
+ PDF,
+ RTF
+ };
+
+ for (int i = 0; i < exts.length; i++) {
+ Metadata m = new Metadata();
+ m.set(Metadata.RESOURCE_NAME_KEY, "file." + exts[i]);
+ try {
+ getXML(TikaInputStream.get(new byte[0]), new
AutoDetectParser(), m);
+ fail("should have thrown zero byte exception");
+ } catch (ZeroByteFileException e) {
+ // expected: parsing a zero-length stream must raise ZeroByteFileException
+ }
+ assertEquals(mimes[i], m.get(Metadata.CONTENT_TYPE)); // content type must be set even though the parse threw
+ }
+ }
+
private static final MediaType MY_MEDIA_TYPE = new
MediaType("application", "x-myparser");
/**
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].