This is an automated email from the ASF dual-hosted git repository.

tballison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new b723055144 Add maxPages option to PDFParserConfig to limit page processing (#2803)
b723055144 is described below

commit b723055144eae4bce977c221380a5adcbe88a861
Author: Julien Nioche <[email protected]>
AuthorDate: Fri May 8 13:34:37 2026 +0100

    Add maxPages option to PDFParserConfig to limit page processing (#2803)
    
    Signed-off-by: Julien Nioche <[email protected]>
---
 .../resources/config-examples/pdf-parser-full.json |  1 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  6 +++++
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 24 +++++++++++++++++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 28 ++++++++++++++++++++++
 4 files changed, 59 insertions(+)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
index 9f455918de..b5446871fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-integration-tests/src/test/resources/config-examples/pdf-parser-full.json
@@ -25,6 +25,7 @@
         "imageStrategy": "NONE",
         "maxIncrementalUpdates": 10,
         "maxMainMemoryBytes": 536870912,
+        "maxPages": -1,
         "ocr": {
           "dpi": 300,
           // Options: PNG, TIFF, JPEG
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index f2c21ee743..ec68f8c2bd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1486,9 +1486,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
      */
     @Override
     protected void processPages(PDPageTree pages) throws IOException {
+        int maxPages = config.getMaxPages();
+        int pagesProcessed = 0;
         for (PDPage page : pages) {
+            if (maxPages > 0 && pagesProcessed >= maxPages) {
+                break;
+            }
             if (getCurrentPageNo() >= getStartPage() && getCurrentPageNo() <= getEndPage()) {
                 processPage(page);
+                pagesProcessed++;
             }
             pageIndex++;
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 8679605eb7..96b3faa572 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -147,6 +147,8 @@ public class PDFParserConfig implements Serializable {
 
     int maxIncrementalUpdates = 10;
 
+    private int maxPages = -1;
+
     private boolean throwOnEncryptedPayload = false;
 
     /**
@@ -770,6 +772,28 @@ public class PDFParserConfig implements Serializable {
         this.maxIncrementalUpdates = maxIncrementalUpdates;
     }
 
+    /**
+     * @return maximum number of pages to process, or -1 for no limit
+     */
+    public int getMaxPages() {
+        return maxPages;
+    }
+
+    /**
+     * Set the maximum number of pages to process per document.
+     * Use -1 (the default) for no limit.
+     *
+     * @param maxPages must be -1 or &gt;= 1
+     * @throws IllegalArgumentException if the value is 0 or less than -1
+     */
+    public void setMaxPages(int maxPages) {
+        if (maxPages != -1 && maxPages < 1) {
+            throw new IllegalArgumentException(
+                    "maxPages must be -1 (no limit) or >= 1, got: " + maxPages);
+        }
+        this.maxPages = maxPages;
+    }
+
     public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
         this.throwOnEncryptedPayload = throwOnEncryptedPayload;
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 653b0e8d62..6ce771c6f9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1509,6 +1509,34 @@ public class PDFParserTest extends TikaTest {
         assertArrayEquals(expectedSubjectVals, m.getValues(TikaCoreProperties.SUBJECT));
     }
 
+    @Test
+    public void testMaxPages() throws Exception {
+        PDFParser parser = new PDFParser();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setMaxPages(3);
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+
+        // testJournalParser.pdf has 10 pages; limiting to 3 must process only the first 3 pages
+        String truncated = getText("testJournalParser.pdf", parser, new Metadata(), context);
+        String full = getText("testJournalParser.pdf", parser);
+        assertTrue(full.length() > truncated.length(),
+                "Full parse should yield more content than a 3-page-limited parse");
+        assertTrue(truncated.contains("Scalability of Controlling"),
+                "Content from page 1 should be present in truncated output");
+        assertFalse(truncated.contains("CONCLUSION"),
+                "Content from page 10 should not be present in truncated output");
+    }
+
+    @Test
+    public void testMaxPagesInvalidValue() {
+        PDFParserConfig config = new PDFParserConfig();
+        assertThrows(IllegalArgumentException.class, () -> config.setMaxPages(0));
+        assertThrows(IllegalArgumentException.class, () -> config.setMaxPages(-2));
+        config.setMaxPages(-1);
+        config.setMaxPages(1);
+    }
+
     /**
     @Test
     public void testWriteLimit() throws Exception {

Reply via email to