Repository: tika
Updated Branches:
  refs/heads/master 0a4b0e80b -> 9b497d1fe


TIKA-2098 small clean up.  Check for WriteLimitReachedException on each catchable 
IOException.  Many thanks to Alexander Kazakov for finding this and submitting 
https://github.com/apache/tika/pull/134


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9b497d1f
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9b497d1f
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9b497d1f

Branch: refs/heads/master
Commit: 9b497d1fef2fe183b2099f1a835113dade8a0227
Parents: 0a4b0e8
Author: tballison <[email protected]>
Authored: Mon Sep 26 16:01:16 2016 -0400
Committer: tballison <[email protected]>
Committed: Mon Sep 26 16:01:16 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 7 +++++++
 .../src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java | 9 ++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/9b497d1f/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 4b46ce9..ba430ab 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -235,6 +235,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     void handleCatchableIOE(IOException e) throws IOException {
         if (config.isCatchIntermediateIOExceptions()) {
+            if (e.getCause() instanceof SAXException && 
e.getCause().getMessage() != null &&
+                    e.getCause().getMessage().contains("Your document 
contained more than")) {
+                //TODO -- is there a cleaner way of checking for:
+                // WriteOutContentHandler.WriteLimitReachedException?
+                throw e;
+            }
+
             String msg = e.getMessage();
             if (msg == null) {
                 msg = "IOException, no message";

http://git-wip-us.apache.org/repos/asf/tika/blob/9b497d1f/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 5dd0680..e9c27eb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -121,11 +121,6 @@ class PDF2XHTML extends AbstractPDF2XHTML {
                 public void close() {
                 }
             });
-
-            if (pdf2XHTML.exceptions.size() > 0) {
-                //throw the first
-                throw pdf2XHTML.exceptions.get(0);
-            }
         } catch (IOException e) {
             if (e.getCause() instanceof SAXException) {
                 throw (SAXException) e.getCause();
@@ -133,6 +128,10 @@ class PDF2XHTML extends AbstractPDF2XHTML {
                 throw new TikaException("Unable to extract PDF content", e);
             }
         }
+        if (pdf2XHTML.exceptions.size() > 0) {
+            //throw the first
+            throw new TikaException("Unable to extract PDF content", 
pdf2XHTML.exceptions.get(0));
+        }
     }
 
 

Reply via email to