Repository: tika Updated Branches: refs/heads/2.x 4392681af -> cde4c0aa8
TIKA-2098 small clean up. Test for writelimitreached for each catchable IOException. Many thanks to Alexander Kazakov for finding this and submitting https://github.com/apache/tika/pull/134 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/cde4c0aa Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/cde4c0aa Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/cde4c0aa Branch: refs/heads/2.x Commit: cde4c0aa8b668e0964f2b83fab67588292ffc993 Parents: 4392681 Author: tballison <[email protected]> Authored: Mon Sep 26 16:07:18 2016 -0400 Committer: tballison <[email protected]> Committed: Mon Sep 26 16:07:18 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 13 +++++++++---- .../java/org/apache/tika/parser/pdf/PDFParserTest.java | 9 +++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/cde4c0aa/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 832b06e..44e7032 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -16,6 +16,9 @@ */ package org.apache.tika.parser.pdf; +import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; + +import javax.xml.stream.XMLStreamException; import java.awt.image.BufferedImage; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; @@ -33,9 +36,6 @@ import java.util.Locale; import java.util.Map; import java.util.TreeMap; -import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; - -import javax.xml.stream.XMLStreamException; import org.apache.commons.io.IOExceptionWithCause; import org.apache.commons.io.IOUtils; import org.apache.pdfbox.pdmodel.PDDocument; @@ -79,7 +79,6 @@ import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; -import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR; class AbstractPDF2XHTML extends PDFTextStripper { @@ -236,6 +235,12 @@ class AbstractPDF2XHTML extends PDFTextStripper { void handleCatchableIOE(IOException e) throws IOException { if (config.isCatchIntermediateIOExceptions()) { + if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null && + e.getCause().getMessage().contains("Your document contained more than")) { + //TODO -- is there a cleaner way of checking for: + // WriteOutContentHandler.WriteLimitReachedException? + throw e; + } String msg = e.getMessage(); if (msg == null) { msg = "IOException, no message"; http://git-wip-us.apache.org/repos/asf/tika/blob/cde4c0aa/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 9621d32..ff74e50 100644 --- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -32,6 +32,7 @@ import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.log4j.Level; import org.apache.log4j.Logger; +import org.apache.tika.Tika; import org.apache.tika.TikaTest; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; @@ -1204,6 +1205,14 @@ public class PDFParserTest extends TikaTest { assertEquals("Sample Title", m.get(TikaCoreProperties.TITLE)); } + @Test + public void testMaxLength() throws Exception { + InputStream is = getResourceAsStream("/test-documents/testPDF.pdf"); + String content = new Tika().parseToString(is, new Metadata(), 100); + + assertTrue(content.length() <= 100); + } + private void assertException(String path, Parser parser, ParseContext context, Class expected) { boolean noEx = false; InputStream is = getResourceAsStream(path);
