Hi All, with regard to the thread "[PDFParser] - read limited number of characters" from Mar 29, I would like to propose the attached patch. I noticed that in Tika 1.6 there has been some work on better handling of the WriteLimitReachedException condition, but I believe it could be improved even further.
What do you think? Ste
Index: tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java =================================================================== --- tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (revision 1627940) +++ tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java (working copy) @@ -50,6 +50,11 @@ private int writeCount = 0; /** + * Flag to mark if the limit has been reached + */ + private boolean writeLimitReached = false; + + /** * Creates a content handler that writes content up to the given * write limit to the given content handler. * @@ -138,6 +143,7 @@ } else { super.characters(ch, start, writeLimit - writeCount); writeCount = writeLimit; + writeLimitReached = true; throw new WriteLimitReachedException( "Your document contained more than " + writeLimit + " characters, and so your requested limit has been" @@ -156,6 +162,7 @@ } else { super.ignorableWhitespace(ch, start, writeLimit - writeCount); writeCount = writeLimit; + writeLimitReached = true; throw new WriteLimitReachedException( "Your document contained more than " + writeLimit + " characters, and so your requested limit has been" @@ -173,31 +180,26 @@ * @param t throwable * @return <code>true</code> if the write limit was reached, * <code>false</code> otherwise + * + * Deprecated in Tika 1.6, use isWriteLimitReached(); the current + * implementation ignores the given Throwable and is equivalent to + * isWriteLimitReached() + * */ + @Deprecated public boolean isWriteLimitReached(Throwable t) { - if (t instanceof WriteLimitReachedException) { - return tag.equals(((WriteLimitReachedException) t).tag); - } else { - return t.getCause() != null && isWriteLimitReached(t.getCause()); - } + return isWriteLimitReached(); } - + /** - * The exception used as a signal when the write limit has been reached. + * Returns true if the limit has been reached, false otherwise. 
+ * + * @since Apache Tika 1.6 + * @return <code>true</code> if the write limit was reached, + * <code>false</code> otherwise */ - private static class WriteLimitReachedException extends SAXException { - - /** Serial version UID */ - private static final long serialVersionUID = -1850581945459429943L; - - /** Serializable tag of the handler that caused this exception */ - private final Serializable tag; - - public WriteLimitReachedException(String message, Serializable tag) { - super(message); - this.tag = tag; - } - + public boolean isWriteLimitReached() { + return writeLimitReached; } } Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java =================================================================== --- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (revision 1627940) +++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (working copy) @@ -52,6 +52,7 @@ import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.PasswordProvider; +import org.apache.tika.sax.WriteLimitReachedException; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -157,7 +158,13 @@ metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); if (handler != null) { - PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); + try { + PDF2Text.process(pdfDocument, handler, context, metadata, localConfig); + } catch (WriteLimitReachedException x) { + // + // This is a valid condition; just ignoring the exception + // + } } } finally { Index: tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java =================================================================== --- tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (revision 1627940) +++ tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (working copy) @@ -144,21 +144,21 @@ * * 
@param pdf2XHTML */ - public void configure(PDF2XHTML pdf2XHTML) { - pdf2XHTML.setForceParsing(true); - pdf2XHTML.setSortByPosition(getSortByPosition()); + public void configure(PDF2Text pdf2text) { + pdf2text.setForceParsing(true); + pdf2text.setSortByPosition(getSortByPosition()); if (getEnableAutoSpace()) { - pdf2XHTML.setWordSeparator(" "); + pdf2text.setWordSeparator(" "); } else { - pdf2XHTML.setWordSeparator(""); + pdf2text.setWordSeparator(""); } if (getAverageCharTolerance() != null) { - pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance()); + pdf2text.setAverageCharTolerance(getAverageCharTolerance()); } if (getSpacingTolerance() != null) { - pdf2XHTML.setSpacingTolerance(getSpacingTolerance()); + pdf2text.setSpacingTolerance(getSpacingTolerance()); } - pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText()); + pdf2text.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText()); } Index: tika-parsers/src/test/java/org/apache/tika/TikaTest.java =================================================================== --- tika-parsers/src/test/java/org/apache/tika/TikaTest.java (revision 1627940) +++ tika-parsers/src/test/java/org/apache/tika/TikaTest.java (working copy) @@ -16,9 +16,7 @@ */ package org.apache.tika; -import org.apache.tika.extractor.EmbeddedResourceHandler; -import org.apache.tika.io.IOUtils; -import org.apache.tika.io.TikaInputStream; + import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; @@ -38,6 +36,9 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import org.apache.tika.extractor.EmbeddedResourceHandler; +import org.apache.tika.io.IOUtils; +import org.apache.tika.io.TikaInputStream; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; @@ -102,6 +103,10 @@ protected XMLResult getXML(String filePath) throws Exception { return 
getXML(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata()); } + + protected String getText(String filePath) throws Exception { + return getText(getResourceAsStream("/test-documents/" + filePath), new AutoDetectParser(), new Metadata()); + } protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata) throws Exception { ParseContext context = new ParseContext(); Index: tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java =================================================================== --- tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (revision 1627940) +++ tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (working copy) @@ -16,6 +16,7 @@ */ package org.apache.tika.parser.pdf; + import org.apache.tika.TikaTest; import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.DocumentSelector; @@ -910,7 +911,30 @@ } - + @Test + public void testLimitTextToParse() throws Exception { + ContentHandler handler = new BodyContentHandler(); + + new PDFParser().parse( + getResourceAsStream("/test-documents/testPDF.pdf"), + handler, + new Metadata(), + new ParseContext() + ); + + assertEquals(1067, handler.toString().length()); + + handler = new BodyContentHandler(500); + + new PDFParser().parse( + getResourceAsStream("/test-documents/testPDF.pdf"), + handler, + new Metadata(), + new ParseContext() + ); + + assertEquals(500, handler.toString().length()); + } @Test public void testInlineConfig() throws Exception { Index: tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java =================================================================== --- tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (revision 1627940) +++ tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java (working copy) @@ -91,7 +91,7 @@ @Test public void testUmlautSpacesExtraction2() throws Exception { - 
String content = getText("testRTFUmlautSpaces2.rtf"); + String content = getResultText("testRTFUmlautSpaces2.rtf"); content = content.replaceAll("\\s+", ""); assertEquals("\u00DCbersicht", content); } @@ -98,7 +98,7 @@ @Test public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception { - String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf"); + String content = getResultText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf"); assertContains("\u5E74", content); assertContains("\u5ff5", content); @@ -109,13 +109,13 @@ @Test public void testHexEscapeInsideWord() throws Exception { - String content = getText("testRTFHexEscapeInsideWord.rtf"); + String content = getResultText("testRTFHexEscapeInsideWord.rtf"); assertContains("ESP\u00cdRITO", content); } @Test public void testWindowsCodepage1250() throws Exception { - String content = getText("testRTFWindowsCodepage1250.rtf"); + String content = getResultText("testRTFWindowsCodepage1250.rtf"); assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content); assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content); } @@ -131,7 +131,7 @@ @Test public void testTableCellSeparation2() throws Exception { - String content = getText("testRTFTableCellSeparation2.rtf"); + String content = getResultText("testRTFTableCellSeparation2.rtf"); // TODO: why do we insert extra whitespace...? 
content = content.replaceAll("\\s+"," "); assertContains("Station Fax", content); @@ -175,7 +175,7 @@ @Test public void testGothic() throws Exception { - String content = getText("testRTFUnicodeGothic.rtf"); + String content = getResultText("testRTFUnicodeGothic.rtf"); assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content); } @@ -231,7 +231,7 @@ @Test public void testTextWithCurlyBraces() throws Exception { - String content = getText("testRTFWithCurlyBraces.rtf"); + String content = getResultText("testRTFWithCurlyBraces.rtf"); assertContains("{ some text inside curly brackets }", content); } @@ -597,7 +597,8 @@ return new Result(content, metadata); } - private String getText(String filename) throws Exception { + private String getResultText(String filename) throws Exception { return getResult(filename).text; } } + \ No newline at end of file