This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new ba72c86 TIKA-3382 -- improve writelimitreached handling in numerous
parsers -- clean up PDFParser
ba72c86 is described below
commit ba72c8625ee7e9cc3956e2b141c84ee7319c4f78
Author: tallison <[email protected]>
AuthorDate: Mon May 3 18:06:01 2021 -0400
TIKA-3382 -- improve writelimitreached handling in numerous parsers --
clean up PDFParser
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 20 ++------------------
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 11 -----------
2 files changed, 2 insertions(+), 29 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2cc610e..1a4f768 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -88,6 +88,7 @@ import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
@@ -426,8 +427,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
void handleCatchableIOE(IOException e) throws IOException {
if (config.getCatchIntermediateIOExceptions()) {
-
- if (isWriteLimitReached(e, 0)) {
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
throw e;
}
@@ -442,22 +442,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- boolean isWriteLimitReached(Throwable t, int depth) {
- if (depth > MAX_RECURSION_DEPTH) {
- return false;
- }
- if (t == null) {
- return false;
- }
- if (t instanceof SAXException) {
-
- String msg = t.getMessage();
- if (msg != null && msg.contains("Your document contained more than")) {
- return true;
- }
- }
- return isWriteLimitReached(t.getCause(), depth + 1);
- }
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
if (config.getOcrStrategy().equals(NO_OCR)) {
return;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index f92fdfd..5f9e4b9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -116,22 +116,11 @@ class PDF2XHTML extends AbstractPDF2XHTML {
}
}
if (pdf2XHTML.exceptions.size() > 0) {
- tryWriteLimitReached(pdf2XHTML.exceptions);
//throw the first
throw new TikaException("Unable to extract PDF content",
pdf2XHTML.exceptions.get(0));
}
}
- private static void tryWriteLimitReached(List<IOException> exceptions) {
- WriteOutContentHandler tmp = new WriteOutContentHandler();
- for (IOException e : exceptions) {
- if (tmp.isWriteLimitReached(e)) {
-
- }
-
- }
- }
-
@Override
public void processPage(PDPage page) throws IOException {
try {