Repository: tika Updated Branches: refs/heads/master 0a4b0e80b -> 9b497d1fe
TIKA-2098 small clean up. Test for writelimitreached for each catchable IOException. Many thanks to Alexander Kazakov for finding this and submitting https://github.com/apache/tika/pull/134 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9b497d1f Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9b497d1f Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9b497d1f Branch: refs/heads/master Commit: 9b497d1fef2fe183b2099f1a835113dade8a0227 Parents: 0a4b0e8 Author: tballison <[email protected]> Authored: Mon Sep 26 16:01:16 2016 -0400 Committer: tballison <[email protected]> Committed: Mon Sep 26 16:01:16 2016 -0400 ---------------------------------------------------------------------- .../java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 7 +++++++ .../src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java | 9 ++++----- 2 files changed, 11 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/9b497d1f/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 4b46ce9..ba430ab 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -235,6 +235,13 @@ class AbstractPDF2XHTML extends PDFTextStripper { void handleCatchableIOE(IOException e) throws IOException { if (config.isCatchIntermediateIOExceptions()) { + if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null && + e.getCause().getMessage().contains("Your document contained more than")) { + //TODO -- is there a cleaner way of checking for: + // WriteOutContentHandler.WriteLimitReachedException? + throw e; + } + String msg = e.getMessage(); if (msg == null) { msg = "IOException, no message"; http://git-wip-us.apache.org/repos/asf/tika/blob/9b497d1f/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index 5dd0680..e9c27eb 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -121,11 +121,6 @@ class PDF2XHTML extends AbstractPDF2XHTML { public void close() { } }); - - if (pdf2XHTML.exceptions.size() > 0) { - //throw the first - throw pdf2XHTML.exceptions.get(0); - } } catch (IOException e) { if (e.getCause() instanceof SAXException) { throw (SAXException) e.getCause(); @@ -133,6 +128,10 @@ class PDF2XHTML extends AbstractPDF2XHTML { throw new TikaException("Unable to extract PDF content", e); } } + if (pdf2XHTML.exceptions.size() > 0) { + //throw the first + throw new TikaException("Unable to extract PDF content", pdf2XHTML.exceptions.get(0)); + } }
