This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new a153397  TIKA-2450 -- AutoDetectParser should throw a ZeroByteFileException for zero-byte files after detection on the file extension.
a153397 is described below

commit a1533977852307c5095efaebfcc5a896d914a57c
Author: tballison <[email protected]>
AuthorDate: Wed Aug 30 13:16:15 2017 -0400

    TIKA-2450 -- AutoDetectParser should throw a ZeroByteFileException for zero-byte files after detection on the file extension.
---
 CHANGES.txt                                        |  3 +
 .../tika/exception/ZeroByteFileException.java      | 11 ++++
 .../org/apache/tika/parser/AutoDetectParser.java   | 10 +++-
 .../apache/tika/parser/AutoDetectParserTest.java   | 64 ++++++++++++++++------
 4 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 17573b1..a66b15c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.17 - ???
 
+  * AutoDetectParser throws ZeroByteFileException for zero-byte files after
+    detection on the file extension (TIKA-2450).
+
   * Extract phonetic runs in docx with experimental SAX parser (TIKA-2448).
 
   * Extract phonetic runs from xls and allow users to turn off extraction
diff --git a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
new file mode 100644
index 0000000..b497f1d
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
@@ -0,0 +1,11 @@
+package org.apache.tika.exception;
+
+/**
+ * Exception thrown by the AutoDetectParser when a file contains zero-bytes.
+ */
+public class ZeroByteFileException extends TikaException {
+
+    public ZeroByteFileException(String msg) {
+        super(msg);
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index fe1c659..a42c7ca 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -23,6 +23,7 @@ import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TemporaryResources;
@@ -114,7 +115,14 @@ public class AutoDetectParser extends CompositeParser {
             // Automatically detect the MIME type of the document
             MediaType type = detector.detect(tis, metadata);
             metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
+            //check for zero-byte inputstream
+            if (tis.getOpenContainer() == null) {
+                tis.mark(1);
+                if (tis.read() == -1) {
+                    throw new ZeroByteFileException("InputStream must have > 0 bytes");
+                }
+                tis.reset();
+            }
             // TIKA-216: Zip bomb prevention
             SecureContentHandler sch = 
                 handler != null ? new SecureContentHandler(handler, tis) : null;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index 817308f..a46564e 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -16,24 +16,12 @@
  */
 package org.apache.tika.parser;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipOutputStream;
-
+import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.XMPDM;
@@ -45,7 +33,22 @@ import org.gagravarr.tika.VorbisParser;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
-public class AutoDetectParserTest {
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class AutoDetectParserTest extends TikaTest {
     private TikaConfig tika = TikaConfig.getDefaultConfig();
 
     // Easy to read constants for the MIME types:
@@ -380,6 +383,35 @@ public class AutoDetectParserTest {
         assertEquals("value", metadata.get("MyParser"));
     }
 
+    @Test
+    public void testZeroByteFileException() throws Exception {
+        String[] exts = new String[]{
+                "xls",
+                "doc",
+                "pdf",
+                "rtf"
+        };
+
+        String[] mimes = new String[]{
+                EXCEL,
+                WORD,
+                PDF,
+                RTF
+        };
+
+        for (int i = 0; i < exts.length; i++) {
+            Metadata m = new Metadata();
+            m.set(Metadata.RESOURCE_NAME_KEY, "file." + exts[i]);
+            try {
+                getXML(TikaInputStream.get(new byte[0]), new AutoDetectParser(), m);
+                fail("should have thrown zero byte exception");
+            } catch (ZeroByteFileException e) {
+
+            }
+            assertEquals(mimes[i], m.get(Metadata.CONTENT_TYPE));
+        }
+    }
+
     private static final MediaType MY_MEDIA_TYPE = new MediaType("application", "x-myparser");
     
     /**

-- 
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].

Reply via email to