This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8918c66384 TIKA-4664 - add Poppler renderer, replace MuPDF, add OCR
safety limits (#2612)
8918c66384 is described below
commit 8918c66384678100e9214c5546e35d01b7aea6e2
Author: Tim Allison <[email protected]>
AuthorDate: Tue Feb 17 16:03:11 2026 -0500
TIKA-4664 - add Poppler renderer, replace MuPDF, add OCR safety limits
(#2612)
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 27 ++
.../java/org/apache/tika/parser/pdf/OcrConfig.java | 60 +++++
.../apache/tika/parser/pdf/PDFParserConfig.java | 33 +++
.../tika/renderer/pdf/mutool/MuPDFRenderer.java | 150 -----------
.../tika/renderer/pdf/poppler/PopplerRenderer.java | 293 +++++++++++++++++++++
.../renderer/pdf/poppler/PopplerRendererTest.java | 167 ++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 21 +-
...fig.json => tika-rendering-poppler-config.json} | 2 +-
8 files changed, 590 insertions(+), 163 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index b2dc1f8a36..ea63c47a42 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -62,6 +62,7 @@ import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
import
org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
@@ -541,6 +542,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (c != null) {
c.increment();
}
+
+ // Enforce maxPagesToOcr limit
+ int maxPagesToOcr = config.getOcrMaxPagesToOcr();
+ if (maxPagesToOcr > 0 && c != null && c.getCount() > maxPagesToOcr) {
+ return;
+ }
MediaType ocrImageMediaType = MediaType.image("ocr-" +
config.getOcrImageFormat().getFormatName());
if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType))
{
if (ocrStrategy == OCR_ONLY || ocrStrategy ==
OCR_AND_TEXT_EXTRACTION) {
@@ -672,6 +679,26 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int id = renderingTracker.getNextId();
try {
+ // Check estimated pixel dimensions before rendering to
+ // prevent OOM on pathologically large pages
+ long maxPixels = config.getOcrMaxImagePixels();
+ if (maxPixels > 0) {
+ PDPage currentPage = pdDocument.getPage(pageIndex);
+ PDRectangle mediaBox = currentPage.getMediaBox();
+ long estWidth = (long) Math.ceil(mediaBox.getWidth() / 72.0 *
dpi);
+ long estHeight = (long) Math.ceil(mediaBox.getHeight() / 72.0
* dpi);
+ long estPixels = estWidth * estHeight;
+ if (estPixels > maxPixels) {
+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
+ "Skipping OCR for page " + (pageIndex + 1)
+ + ": estimated " + estPixels
+ + " pixels exceeds maxImagePixels="
+ + maxPixels);
+ return new RenderResult(RenderResult.STATUS.EXCEPTION,
+ id, null, pageMetadata);
+ }
+ }
+
BufferedImage image =
renderer.renderImageWithDPI(pageIndex, dpi,
config.getOcrImageType().getPdfBoxImageType());
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java
index f0c56198c9..45454b7275 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java
@@ -123,6 +123,31 @@ public class OcrConfig implements Serializable {
private ImageFormat imageFormat = ImageFormat.PNG;
private float imageQuality = 1.0f;
+ /**
+ * Maximum total pixels (width × height) allowed for a rendered
+ * page image before OCR is skipped for that page. This prevents OOM
+ * from rendering pathologically large PDF pages (e.g., architectural
+ * drawings, maps) via PDFBox's in-process renderer.
+ * <p>
+ * When using the Poppler renderer, prefer {@code maxScaleTo} on
+ * {@code PopplerRenderer} instead — it prevents the large image from
+ * ever being created. This limit is the safety net for the PDFBox
+ * rendering path.
+ * <p>
+ * Default is 100,000,000 (100 megapixels, roughly 10,000 ×
+ * 10,000). Set to {@code -1} for no limit (not recommended).
+ * {@link #setMaxImagePixels(long)} accepts only {@code -1} or values
+ * of at least {@code 1}.
+ */
+ private long maxImagePixels = 100_000_000L;
+
+ /**
+ * Maximum number of pages to OCR per document. Pages beyond this
+ * limit are processed for text extraction only (if applicable)
+ * but not rendered or sent to OCR.
+ * <p>
+ * Default is {@code -1} (no limit — all pages are eligible for OCR).
+ * {@link #setMaxPagesToOcr(int)} accepts only {@code -1} or values
+ * of at least {@code 1}.
+ */
+ private int maxPagesToOcr = -1;
+
public Strategy getStrategy() {
return strategy;
}
@@ -178,4 +203,39 @@ public class OcrConfig implements Serializable {
public void setImageQuality(float imageQuality) {
this.imageQuality = imageQuality;
}
+
+    /**
+     * @return the configured pixel budget (width × height) for a rendered
+     *         page image, or {@code -1} when no limit is set
+     */
+    public long getMaxImagePixels() {
+        return this.maxImagePixels;
+    }
+
+ /**
+ * Set the maximum total pixels (width × height) for a rendered
+ * page image. Pages exceeding this limit are skipped for OCR.
+ * Default is 100,000,000. Set to {@code -1} for no limit (not
recommended).
+ */
+ public void setMaxImagePixels(long maxImagePixels) {
+ if (maxImagePixels < 1 && maxImagePixels != -1) {
+ throw new IllegalArgumentException(
+ "maxImagePixels must be -1 (no limit) or at least 1, got: "
+ + maxImagePixels);
+ }
+ this.maxImagePixels = maxImagePixels;
+ }
+
+    /**
+     * @return the configured maximum number of pages to OCR per document,
+     *         or {@code -1} when no limit is set
+     */
+    public int getMaxPagesToOcr() {
+        return this.maxPagesToOcr;
+    }
+
+ /**
+ * Set the maximum number of pages to OCR per document.
+ * Default is {@code -1} (no limit). Must be {@code -1} or at least {@code
1}.
+ */
+ public void setMaxPagesToOcr(int maxPagesToOcr) {
+ if (maxPagesToOcr < 1 && maxPagesToOcr != -1) {
+ throw new IllegalArgumentException(
+ "maxPagesToOcr must be -1 (no limit) or at least 1, got: "
+ + maxPagesToOcr);
+ }
+ this.maxPagesToOcr = maxPagesToOcr;
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index c87acefcd3..8679605eb7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -604,6 +604,39 @@ public class PDFParserConfig implements Serializable {
ocr.setImageQuality(ocrImageQuality);
}
+    /**
+     * Reads the rendered-image pixel budget from the OCR configuration.
+     *
+     * @return maximum total pixels (width × height) allowed for a
+     *         rendered page image before OCR is skipped
+     */
+    public long getOcrMaxImagePixels() {
+        final long maxImagePixels = ocr.getMaxImagePixels();
+        return maxImagePixels;
+    }
+
+    /**
+     * Forwards the rendered-image pixel budget (width × height) to the
+     * OCR configuration. Pages exceeding this limit are skipped for OCR.
+     * Default is 100,000,000 (100 megapixels).
+     *
+     * @param ocrMaxImagePixels the new limit, passed through unchanged
+     */
+    public void setOcrMaxImagePixels(long ocrMaxImagePixels) {
+        this.ocr.setMaxImagePixels(ocrMaxImagePixels);
+    }
+
+    /**
+     * Reads the per-document OCR page cap from the OCR configuration.
+     *
+     * @return maximum number of pages to OCR per document, or {@code -1}
+     *         for no limit
+     */
+    public int getOcrMaxPagesToOcr() {
+        final int maxPagesToOcr = ocr.getMaxPagesToOcr();
+        return maxPagesToOcr;
+    }
+
+    /**
+     * Forwards the per-document OCR page cap to the OCR configuration.
+     * Default is {@code -1} (no limit).
+     *
+     * @param ocrMaxPagesToOcr the new cap, passed through unchanged
+     */
+    public void setOcrMaxPagesToOcr(int ocrMaxPagesToOcr) {
+        this.ocr.setMaxPagesToOcr(ocrMaxPagesToOcr);
+    }
+
/**
* @return whether or not to extract PDActions
* @see #setExtractActions(boolean)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
deleted file mode 100644
index 73bad664ed..0000000000
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.renderer.pdf.mutool;
-
-import java.io.Closeable;
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.config.TikaComponent;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.TikaPagedText;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.renderer.PageBasedRenderResults;
-import org.apache.tika.renderer.PageRangeRequest;
-import org.apache.tika.renderer.RenderRequest;
-import org.apache.tika.renderer.RenderResult;
-import org.apache.tika.renderer.RenderResults;
-import org.apache.tika.renderer.Renderer;
-import org.apache.tika.renderer.RenderingTracker;
-import org.apache.tika.utils.FileProcessResult;
-import org.apache.tika.utils.ProcessUtils;
-
-@TikaComponent(name = "mupdf-renderer", spi = false)
-public class MuPDFRenderer implements Renderer {
-
- Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("pdf"));
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public RenderResults render(TikaInputStream tis, Metadata metadata,
ParseContext parseContext,
- RenderRequest... requests) throws IOException,
TikaException {
- TemporaryResources tmp = new TemporaryResources();
- PageBasedRenderResults results = new PageBasedRenderResults(tmp);
- Path path = tis.getPath();
- for (RenderRequest request : requests) {
- renderRequest(path, metadata, parseContext, request, results, tmp);
- }
- return results;
- }
-
- private RenderResults renderRequest(Path pdf, Metadata metadata,
ParseContext parseContext,
- RenderRequest request, RenderResults
results,
- TemporaryResources tmp) throws
TikaException, IOException {
- if (! (request instanceof PageRangeRequest)) {
- throw new TikaException("I regret that this renderer can only
handle " +
- "PageRangeRequests, not " + request.getClass());
- }
- PageRangeRequest rangeRequest = (PageRangeRequest)request;
- RenderingTracker tracker = parseContext.get(RenderingTracker.class);
- if (tracker == null) {
- tracker = new RenderingTracker();
- parseContext.set(RenderingTracker.class, tracker);
- }
-
- Path dir = Files.createTempDirectory("tika-render-");
- //TODO -- this assumes files have been deleted first
- //do something smarter
- tmp.addResource(new Closeable() {
- @Override
- public void close() throws IOException {
- Files.delete(dir);
- }
- });
- //TODO -- run mutool pages to get page sizes
- //and then use that information in the -O to get proper scaling
- //etc.
- // This would also allow us to run on a single page at a time if
that's of any interest
- String[] args = createCommandLine(pdf, dir, rangeRequest);
-
- ProcessBuilder builder = new ProcessBuilder();
- builder.command(args);
- //TODO: parameterize timeout
- FileProcessResult result = ProcessUtils.execute(builder, 60000, 10,
1000);
- if (result.getExitValue() != 0) {
- throw new TikaException(result.getStderr());
- }
- //TODO -- fix this
- Matcher m =
Pattern.compile("tika-mutool-render-(\\d+)\\.png").matcher("");
- for (File f : dir.toFile().listFiles()) {
- String n = f.getName();
- if (m.reset(n).find()) {
- int pageIndex = Integer.parseInt(m.group(1));
- Metadata renderMetadata = Metadata.newInstance(parseContext);
- renderMetadata.set(TikaPagedText.PAGE_NUMBER, pageIndex);
- renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-
TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
- results.add(new RenderResult(RenderResult.STATUS.SUCCESS,
tracker.getNextId(),
- TikaInputStream.get(f.toPath()), renderMetadata));
- }
- }
-
- return results;
- }
-
- private String[] createCommandLine(Path pdf, Path dir, PageRangeRequest
request) {
- //TODO parameterize all the things; mutool path, colorspace and size
and format and...
- List<String> args = new ArrayList<>();
- args.add("mutool");
- args.add("convert");
- args.add("-O colorspace=gray");
- args.add("-o");
- args.add(
- ProcessUtils.escapeCommandLine(
- dir.toAbsolutePath().toString() + "/" +
"tika-mutool-render-%d.png"));
-
args.add(ProcessUtils.escapeCommandLine(pdf.toAbsolutePath().toString()));
- if (request != PageRangeRequest.RENDER_ALL) {
- StringBuilder sb = new StringBuilder();
- int cnt = 0;
- for (int i = request.getFrom(); i <= request.getTo(); i++) {
- if (cnt++ > 0) {
- sb.append(",");
- }
- sb.append(i);
- }
- args.add(sb.toString());
- }
- return args.toArray(new String[0]);
- }
-}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java
new file mode 100644
index 0000000000..bdaec24f29
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java
@@ -0,0 +1,293 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf.poppler;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.config.TikaComponent;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageBasedRenderResults;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Renderer that uses Poppler's {@code pdftoppm} command to convert PDF
+ * pages to PNG images.
+ * <p>
+ * Poppler is pre-installed on most Linux distributions and is the
+ * fastest widely-available PDF renderer. On macOS it can be installed
+ * via {@code brew install poppler}; on Windows via MSYS2 or Chocolatey.
+ * <p>
+ * Configuration key: {@code "poppler-renderer"}
+ *
+ * @since Apache Tika 4.0
+ */
+@TikaComponent(name = "poppler-renderer", spi = false)
+public class PopplerRenderer implements Renderer {
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("pdf"));
+
+ /**
+ * Matches the Poppler output pattern: {@code prefix-01.png},
+ * {@code prefix-02.png}, etc.
+ */
+ private static final Pattern PAGE_FILE_PATTERN =
+ Pattern.compile("tika-poppler-(\\d+)\\.png");
+
+ private String pdftoppmPath = "pdftoppm";
+ private int dpi = 300;
+ private boolean gray = true;
+ private int timeoutMs = 120000;
+
+ /**
+ * Maximum pixel dimension (in pixels) for the longest edge of a rendered
+ * page image. Maps to pdftoppm's {@code -scale-to} flag.
+ * <p>
+ * If a PDF page would render larger than this value (in pixels) at the
+ * configured DPI, pdftoppm scales the output image down so that its
+ * longest edge equals {@code maxScaleTo} pixels, preserving the aspect
+ * ratio. For example, with {@code maxScaleTo=4096}, a landscape page
+ * that would normally render to 6000×4000 pixels is scaled to
+ * 4096×2731 pixels instead.
+ * <p>
+ * If the rendered image is already smaller than {@code maxScaleTo}
+ * on both edges, no scaling is applied — the image is not enlarged.
+ * <p>
+ * This is the primary defense against pathologically large PDF pages
+ * (e.g., architectural drawings, maps, posters) that would otherwise
+ * produce multi-gigabyte images and cause OOM.
+ * <p>
+ * Default is 4096 pixels. Set to {@code -1} to disable scaling
+ * (not recommended).
+ */
+ private int maxScaleTo = 4096;
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public RenderResults render(TikaInputStream tis, Metadata metadata,
+ ParseContext parseContext,
+ RenderRequest... requests)
+ throws IOException, TikaException {
+ TemporaryResources tmp = new TemporaryResources();
+ PageBasedRenderResults results = new PageBasedRenderResults(tmp);
+ Path path = tis.getPath();
+ for (RenderRequest request : requests) {
+ renderRequest(path, metadata, parseContext, request, results, tmp);
+ }
+ return results;
+ }
+
+ private void renderRequest(Path pdf, Metadata metadata,
+ ParseContext parseContext,
+ RenderRequest request,
+ PageBasedRenderResults results,
+ TemporaryResources tmp)
+ throws TikaException, IOException {
+ if (!(request instanceof PageRangeRequest)) {
+ throw new TikaException(
+ "I regret that this renderer can only handle "
+ + "PageRangeRequests, not " + request.getClass());
+ }
+ PageRangeRequest rangeRequest = (PageRangeRequest) request;
+
+ RenderingTracker tracker = parseContext.get(RenderingTracker.class);
+ if (tracker == null) {
+ tracker = new RenderingTracker();
+ parseContext.set(RenderingTracker.class, tracker);
+ }
+
+ Path dir = Files.createTempDirectory("tika-render-");
+ tmp.addResource(new Closeable() {
+ @Override
+ public void close() throws IOException {
+ Files.delete(dir);
+ }
+ });
+
+ String[] args = createCommandLine(pdf, dir, rangeRequest);
+
+ ProcessBuilder builder = new ProcessBuilder();
+ builder.command(args);
+ FileProcessResult result = ProcessUtils.execute(
+ builder, timeoutMs, 10, 1000);
+ if (result.getExitValue() != 0) {
+ throw new TikaException(
+ "pdftoppm failed (exit " + result.getExitValue()
+ + "): " + result.getStderr());
+ }
+
+ Matcher m = PAGE_FILE_PATTERN.matcher("");
+ File[] files = dir.toFile().listFiles();
+ if (files == null) {
+ return;
+ }
+ for (File f : files) {
+ if (m.reset(f.getName()).find()) {
+ int pageNumber = Integer.parseInt(m.group(1));
+ Metadata renderMetadata = Metadata.newInstance(parseContext);
+ renderMetadata.set(TikaPagedText.PAGE_NUMBER, pageNumber);
+ renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.RENDERING
+ .name());
+ results.add(new RenderResult(
+ RenderResult.STATUS.SUCCESS,
+ tracker.getNextId(),
+ f.toPath(),
+ renderMetadata));
+ }
+ }
+ }
+
+ String[] createCommandLine(Path pdf, Path dir,
+ PageRangeRequest request) {
+ List<String> args = new ArrayList<>();
+ args.add(pdftoppmPath);
+
+ // Output format
+ args.add("-png");
+
+ // Resolution
+ args.add("-r");
+ args.add(String.valueOf(dpi));
+
+ // Scale cap — prevents OOM on huge pages
+ if (maxScaleTo > 0) {
+ args.add("-scale-to");
+ args.add(String.valueOf(maxScaleTo));
+ }
+
+ // Colorspace
+ if (gray) {
+ args.add("-gray");
+ }
+
+ // Page range
+ if (request != PageRangeRequest.RENDER_ALL) {
+ args.add("-f");
+ args.add(String.valueOf(request.getFrom()));
+ args.add("-l");
+ args.add(String.valueOf(request.getTo()));
+ }
+
+ // Input PDF
+ args.add(ProcessUtils.escapeCommandLine(
+ pdf.toAbsolutePath().toString()));
+
+ // Output prefix (pdftoppm appends -NN.png)
+ args.add(ProcessUtils.escapeCommandLine(
+ dir.toAbsolutePath().toString() + "/tika-poppler"));
+
+ return args.toArray(new String[0]);
+ }
+
+ // ---- config getters/setters -------------------------------------------
+
+ public String getPdftoppmPath() {
+ return pdftoppmPath;
+ }
+
+ /**
+ * Set the path to the {@code pdftoppm} executable. Defaults to
+ * {@code "pdftoppm"} (assumes it is on the system path).
+ */
+ public void setPdftoppmPath(String pdftoppmPath) {
+ this.pdftoppmPath = pdftoppmPath;
+ }
+
+ public int getDpi() {
+ return dpi;
+ }
+
+ /**
+ * Set the rendering resolution in DPI. Defaults to 300.
+ */
+ public void setDpi(int dpi) {
+ this.dpi = dpi;
+ }
+
+ public boolean isGray() {
+ return gray;
+ }
+
+ /**
+ * If true (the default), render in grayscale. Set to false for
+ * full-color rendering.
+ */
+ public void setGray(boolean gray) {
+ this.gray = gray;
+ }
+
+ public int getTimeoutMs() {
+ return timeoutMs;
+ }
+
+ /**
+ * Set the timeout in milliseconds for the pdftoppm process.
+ * Defaults to 120000 (2 minutes).
+ */
+ public void setTimeoutMs(int timeoutMs) {
+ this.timeoutMs = timeoutMs;
+ }
+
+ public int getMaxScaleTo() {
+ return maxScaleTo;
+ }
+
+ /**
+ * Set the maximum pixel dimension (in pixels) for the longest edge
+ * of rendered page images. Maps to pdftoppm's {@code -scale-to} flag.
+ * Pages that would render smaller than this are not enlarged.
+ * <p>
+ * Default is 4096 pixels. Set to {@code -1} to disable (not recommended).
+ */
+ public void setMaxScaleTo(int maxScaleTo) {
+ if (maxScaleTo < 1 && maxScaleTo != -1) {
+ throw new IllegalArgumentException(
+ "maxScaleTo must be -1 (disabled) or at least 1, got: "
+ + maxScaleTo);
+ }
+ this.maxScaleTo = maxScaleTo;
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
new file mode 100644
index 0000000000..69316a74a8
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf.poppler;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.renderer.PageBasedRenderResults;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderResult;
+
+public class PopplerRendererTest {
+
+ private static boolean hasPoppler;
+
+ @BeforeAll
+ static void checkPoppler() {
+ hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"});
+ }
+
+ @Test
+ void testRenderAllPages() throws Exception {
+ assumeTrue(hasPoppler, "pdftoppm not available");
+
+ PopplerRenderer renderer = new PopplerRenderer();
+
+ try (InputStream is = getClass().getResourceAsStream(
+ "/test-documents/testPDF_bookmarks.pdf")) {
+ assertNotNull(is, "test PDF not found");
+
+ try (TikaInputStream tis = TikaInputStream.get(is)) {
+ PageBasedRenderResults results =
+ (PageBasedRenderResults) renderer.render(
+ tis, new Metadata(), new ParseContext(),
+ PageRangeRequest.RENDER_ALL);
+
+ List<RenderResult> allResults = results.getResults();
+ assertEquals(2, allResults.size(),
+ "testPDF_bookmarks.pdf has 2 pages");
+
+ Set<Integer> pageNumbers = new HashSet<>();
+ for (RenderResult rr : allResults) {
+ assertEquals(RenderResult.STATUS.SUCCESS, rr.getStatus());
+
+ Metadata rm = rr.getMetadata();
+ Integer page = rm.getInt(TikaPagedText.PAGE_NUMBER);
+ assertNotNull(page, "page number should be set");
+ pageNumbers.add(page);
+
+ assertEquals(
+ TikaCoreProperties.EmbeddedResourceType.RENDERING
+ .name(),
+ rm.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+
+ // Verify we can actually read the rendered image bytes
+ try (TikaInputStream imageTis = rr.getInputStream()) {
+ byte[] imageBytes = imageTis.readAllBytes();
+ assertTrue(imageBytes.length > 100,
+ "rendered page should be a non-trivial PNG");
+ // PNG magic bytes
+ assertEquals((byte) 0x89, imageBytes[0]);
+ assertEquals((byte) 'P', imageBytes[1]);
+ assertEquals((byte) 'N', imageBytes[2]);
+ assertEquals((byte) 'G', imageBytes[3]);
+ }
+ }
+
+ assertEquals(Set.of(1, 2), pageNumbers,
+ "should have pages 1 and 2");
+
+ results.close();
+ }
+ }
+ }
+
+ @Test
+ void testRenderPageRange() throws Exception {
+ assumeTrue(hasPoppler, "pdftoppm not available");
+
+ PopplerRenderer renderer = new PopplerRenderer();
+
+ try (InputStream is = getClass().getResourceAsStream(
+ "/test-documents/testPDF_bookmarks.pdf")) {
+ assertNotNull(is, "test PDF not found");
+
+ try (TikaInputStream tis = TikaInputStream.get(is)) {
+ // Render only page 2
+ PageBasedRenderResults results =
+ (PageBasedRenderResults) renderer.render(
+ tis, new Metadata(), new ParseContext(),
+ new PageRangeRequest(2, 2));
+
+ List<RenderResult> allResults = results.getResults();
+ assertEquals(1, allResults.size(),
+ "should render exactly 1 page");
+
+ assertEquals(2,
+ allResults.get(0).getMetadata()
+ .getInt(TikaPagedText.PAGE_NUMBER));
+
+ results.close();
+ }
+ }
+ }
+
+ @Test
+ void testCustomDpi() throws Exception {
+ assumeTrue(hasPoppler, "pdftoppm not available");
+
+ PopplerRenderer renderer = new PopplerRenderer();
+ renderer.setDpi(72);
+ renderer.setGray(false);
+
+ try (InputStream is = getClass().getResourceAsStream(
+ "/test-documents/testPDF_bookmarks.pdf")) {
+ assertNotNull(is, "test PDF not found");
+
+ try (TikaInputStream tis = TikaInputStream.get(is)) {
+ PageBasedRenderResults results =
+ (PageBasedRenderResults) renderer.render(
+ tis, new Metadata(), new ParseContext(),
+ PageRangeRequest.RENDER_ALL);
+
+ assertEquals(2, results.getResults().size());
+
+ // 72 DPI should produce smaller images than 300 DPI
+ try (TikaInputStream imageTis =
+ results.getResults().get(0).getInputStream()) {
+ byte[] imageBytes = imageTis.readAllBytes();
+ assertTrue(imageBytes.length > 0);
+ }
+
+ results.close();
+ }
+ }
+ }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8dfaac15cc..d5ce0bde8a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -82,7 +82,7 @@ public class PDFParserTest extends TikaTest {
public static Level PDFBOX_LOG_LEVEL = Level.INFO;
private static Boolean hasTesseract = null;
- private static Boolean hasMuPDF = null;
+ private static Boolean hasPoppler = null;
public static boolean canRunOCR() throws TikaConfigException {
if (hasTesseract != null) {
@@ -92,12 +92,12 @@ public class PDFParserTest extends TikaTest {
return hasTesseract;
}
- public static boolean hasMuPDF() throws TikaConfigException {
- if (hasMuPDF != null) {
- return hasMuPDF;
+ public static boolean hasPoppler() throws TikaConfigException {
+ if (hasPoppler != null) {
+ return hasPoppler;
}
- hasMuPDF = ExternalParser.check(new String[]{"mutool", "-v"});
- return hasMuPDF;
+ hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"});
+ return hasPoppler;
}
@BeforeAll
@@ -459,13 +459,10 @@ public class PDFParserTest extends TikaTest {
}
@Test
- @Disabled("there's a subtle problem in setting the bytes in the
TikaInputStream that needs to be fixed")
- public void testMuPDFInOCR() throws Exception {
- //TODO -- need to add "rendered by" to confirm that mutool was
actually called
- //and that there wasn't some backoff to PDFBox the PDFParser
+ public void testPopplerInOCR() throws Exception {
assumeTrue(canRunOCR(), "can't run OCR");
- assumeTrue(hasMuPDF(), "does not have mupdf");
- Parser p =
TikaLoaderHelper.getLoader("tika-rendering-mupdf-config.json").loadAutoDetectParser();
+ assumeTrue(hasPoppler(), "does not have poppler (pdftoppm)");
+ Parser p =
TikaLoaderHelper.getLoader("tika-rendering-poppler-config.json").loadAutoDetectParser();
String text =
getText(getResourceAsStream("/test-documents/testOCR.pdf"), p);
assertContains("Happy", text.trim());
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.json
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-poppler-config.json
similarity index 85%
rename from
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.json
rename to
tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-poppler-config.json
index 3084474439..a4810ae111 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.json
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-poppler-config.json
@@ -11,7 +11,7 @@
],
"renderers": [
{
- "mupdf-renderer": {}
+ "poppler-renderer": {}
}
]
}