This is an automated email from the ASF dual-hosted git repository.
tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new b723055144 Add maxPages option to PDFParserConfig to limit page processing (#2803)
b723055144 is described below
commit b723055144eae4bce977c221380a5adcbe88a861
Author: Julien Nioche <[email protected]>
AuthorDate: Fri May 8 13:34:37 2026 +0100
Add maxPages option to PDFParserConfig to limit page processing (#2803)
Signed-off-by: Julien Nioche <[email protected]>
---
.../resources/config-examples/pdf-parser-full.json | 1 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 +++++
.../apache/tika/parser/pdf/PDFParserConfig.java | 24 +++++++++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 28 ++++++++++++++++++++++
4 files changed, 59 insertions(+)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
index 9f455918de..b5446871fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
@@ -25,6 +25,7 @@
"imageStrategy": "NONE",
"maxIncrementalUpdates": 10,
"maxMainMemoryBytes": 536870912,
+ "maxPages": -1,
"ocr": {
"dpi": 300,
// Options: PNG, TIFF, JPEG
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index f2c21ee743..ec68f8c2bd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1486,9 +1486,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
*/
@Override
protected void processPages(PDPageTree pages) throws IOException {
+ int maxPages = config.getMaxPages();
+ int pagesProcessed = 0;
for (PDPage page : pages) {
+ if (maxPages > 0 && pagesProcessed >= maxPages) {
+ break;
+ }
if (getCurrentPageNo() >= getStartPage() && getCurrentPageNo() <= getEndPage()) {
processPage(page);
+ pagesProcessed++;
}
pageIndex++;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 8679605eb7..96b3faa572 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -147,6 +147,8 @@ public class PDFParserConfig implements Serializable {
int maxIncrementalUpdates = 10;
+ private int maxPages = -1;
+
private boolean throwOnEncryptedPayload = false;
/**
@@ -770,6 +772,28 @@ public class PDFParserConfig implements Serializable {
this.maxIncrementalUpdates = maxIncrementalUpdates;
}
+ /**
+ * @return maximum number of pages to process, or -1 for no limit
+ */
+ public int getMaxPages() {
+ return maxPages;
+ }
+
+ /**
+ * Set the maximum number of pages to process per document.
+ * Use -1 (the default) for no limit.
+ *
+ * @param maxPages must be -1 or >= 1
+ * @throws IllegalArgumentException if the value is 0 or less than -1
+ */
+ public void setMaxPages(int maxPages) {
+ if (maxPages != -1 && maxPages < 1) {
+ throw new IllegalArgumentException(
+ "maxPages must be -1 (no limit) or >= 1, got: " + maxPages);
+ }
+ this.maxPages = maxPages;
+ }
+
public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
this.throwOnEncryptedPayload = throwOnEncryptedPayload;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 653b0e8d62..6ce771c6f9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1509,6 +1509,34 @@ public class PDFParserTest extends TikaTest {
assertArrayEquals(expectedSubjectVals, m.getValues(TikaCoreProperties.SUBJECT));
}
+ @Test
+ public void testMaxPages() throws Exception {
+ PDFParser parser = new PDFParser();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setMaxPages(3);
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+
+ // testJournalParser.pdf has 10 pages; limiting to 3 must process only the first 3 pages
+ String truncated = getText("testJournalParser.pdf", parser, new Metadata(), context);
+ String full = getText("testJournalParser.pdf", parser);
+ assertTrue(full.length() > truncated.length(),
+ "Full parse should yield more content than a 3-page-limited parse");
+ assertTrue(truncated.contains("Scalability of Controlling"),
+ "Content from page 1 should be present in truncated output");
+ assertFalse(truncated.contains("CONCLUSION"),
+ "Content from page 10 should not be present in truncated output");
+ }
+
+ @Test
+ public void testMaxPagesInvalidValue() {
+ PDFParserConfig config = new PDFParserConfig();
+ assertThrows(IllegalArgumentException.class, () -> config.setMaxPages(0));
+ assertThrows(IllegalArgumentException.class, () -> config.setMaxPages(-2));
+ config.setMaxPages(-1);
+ config.setMaxPages(1);
+ }
+
/**
@Test
public void testWriteLimit() throws Exception {