This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4664-poppler-renderer in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4b1362ab29f0afdd845b79ffcea671cffbd9b605 Author: tballison <[email protected]> AuthorDate: Tue Feb 17 14:30:35 2026 -0500 TIKA-4664 - add Poppler renderer, replace MuPDF, add OCR safety limits - Add PopplerRenderer that shells out to pdftoppm for PDF page rendering, with maxScaleTo (default 4096px) to prevent OOM on huge pages. - Remove MuPDFRenderer (replaced by Poppler). - Add maxImagePixels (default 100MP) to OcrConfig to skip OCR on pathologically large rendered page images in the PDFBox rendering path. - Add maxPagesToOcr to OcrConfig to cap per-document OCR page count. - Expose new OcrConfig limits through PDFParserConfig. - Update PDFParserTest to use Poppler and re-enable the rendering test. Co-authored-by: Cursor <[email protected]> --- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 27 ++ .../java/org/apache/tika/parser/pdf/OcrConfig.java | 60 +++++ .../apache/tika/parser/pdf/PDFParserConfig.java | 33 +++ .../tika/renderer/pdf/mutool/MuPDFRenderer.java | 150 ----------- .../tika/renderer/pdf/poppler/PopplerRenderer.java | 293 +++++++++++++++++++++ .../renderer/pdf/poppler/PopplerRendererTest.java | 167 ++++++++++++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 21 +- ...fig.json => tika-rendering-poppler-config.json} | 2 +- 8 files changed, 590 insertions(+), 163 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index b2dc1f8a36..ea63c47a42 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -62,6 +62,7 @@ import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.common.COSObjectable; import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction; import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; +import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification; @@ -541,6 +542,12 @@ class AbstractPDF2XHTML extends PDFTextStripper { if (c != null) { c.increment(); } + + // Enforce maxPagesToOcr limit + int maxPagesToOcr = config.getOcrMaxPagesToOcr(); + if (maxPagesToOcr > 0 && c != null && c.getCount() > maxPagesToOcr) { + return; + } MediaType ocrImageMediaType = MediaType.image("ocr-" + config.getOcrImageFormat().getFormatName()); if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType)) { if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) { @@ -672,6 +679,26 @@ class AbstractPDF2XHTML extends PDFTextStripper { int id = renderingTracker.getNextId(); try { + // Check estimated pixel dimensions before rendering to + // prevent OOM on pathologically large pages + long maxPixels = config.getOcrMaxImagePixels(); + if (maxPixels > 0) { + PDPage currentPage = pdDocument.getPage(pageIndex); + PDRectangle mediaBox = currentPage.getMediaBox(); + long estWidth = (long) Math.ceil(mediaBox.getWidth() / 72.0 * dpi); + long estHeight = (long) Math.ceil(mediaBox.getHeight() / 72.0 * dpi); + long estPixels = estWidth * estHeight; + if (estPixels > maxPixels) { + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM, + "Skipping OCR for page " + (pageIndex + 1) + + ": estimated " + estPixels + + " pixels exceeds maxImagePixels=" + + maxPixels); + return new RenderResult(RenderResult.STATUS.EXCEPTION, + id, null, pageMetadata); + } + } + BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType().getPdfBoxImageType()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java index f0c56198c9..45454b7275 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OcrConfig.java @@ -123,6 +123,31 @@ public class OcrConfig implements Serializable { private ImageFormat imageFormat = ImageFormat.PNG; private float imageQuality = 1.0f; + /** + * Maximum total pixels (width × height) allowed for a rendered + * page image before OCR is skipped for that page. This prevents OOM + * from rendering pathologically large PDF pages (e.g., architectural + * drawings, maps) via PDFBox's in-process renderer. + * <p> + * When using the Poppler renderer, prefer {@code maxScaleTo} on + * {@code PopplerRenderer} instead — it prevents the large image from + * ever being created. This limit is the safety net for the PDFBox + * rendering path. + * <p> + * Default is 100,000,000 (100 megapixels, roughly 10,000 × + * 10,000). Set to {@code -1} for no limit (not recommended). + */ + private long maxImagePixels = 100_000_000L; + + /** + * Maximum number of pages to OCR per document. Pages beyond this + * limit are processed for text extraction only (if applicable) + * but not rendered or sent to OCR. + * <p> + * Default is {@code -1} (no limit — all pages are eligible for OCR). + */ + private int maxPagesToOcr = -1; + public Strategy getStrategy() { return strategy; } @@ -178,4 +203,39 @@ public class OcrConfig implements Serializable { public void setImageQuality(float imageQuality) { this.imageQuality = imageQuality; } + + public long getMaxImagePixels() { + return maxImagePixels; + } + + /** + * Set the maximum total pixels (width × height) for a rendered + * page image. Pages exceeding this limit are skipped for OCR. + * Default is 100,000,000. Set to {@code -1} for no limit (not recommended). + */ + public void setMaxImagePixels(long maxImagePixels) { + if (maxImagePixels < 1 && maxImagePixels != -1) { + throw new IllegalArgumentException( + "maxImagePixels must be -1 (no limit) or at least 1, got: " + + maxImagePixels); + } + this.maxImagePixels = maxImagePixels; + } + + public int getMaxPagesToOcr() { + return maxPagesToOcr; + } + + /** + * Set the maximum number of pages to OCR per document. + * Default is {@code -1} (no limit). Must be {@code -1} or at least {@code 1}. + */ + public void setMaxPagesToOcr(int maxPagesToOcr) { + if (maxPagesToOcr < 1 && maxPagesToOcr != -1) { + throw new IllegalArgumentException( + "maxPagesToOcr must be -1 (no limit) or at least 1, got: " + + maxPagesToOcr); + } + this.maxPagesToOcr = maxPagesToOcr; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index c87acefcd3..8679605eb7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -604,6 +604,39 @@ public class PDFParserConfig implements Serializable { ocr.setImageQuality(ocrImageQuality); } + /** + * @return maximum total pixels (width × height) allowed for a + * rendered page image before OCR is skipped + */ + public long getOcrMaxImagePixels() { + return ocr.getMaxImagePixels(); + } + + /** + * Set the maximum total pixels (width × height) for a rendered + * page image. Pages exceeding this limit are skipped for OCR. + * Default is 100,000,000 (100 megapixels). + */ + public void setOcrMaxImagePixels(long ocrMaxImagePixels) { + ocr.setMaxImagePixels(ocrMaxImagePixels); + } + + /** + * @return maximum number of pages to OCR per document, or {@code -1} + * for no limit + */ + public int getOcrMaxPagesToOcr() { + return ocr.getMaxPagesToOcr(); + } + + /** + * Set the maximum number of pages to OCR per document. + * Default is {@code -1} (no limit). + */ + public void setOcrMaxPagesToOcr(int ocrMaxPagesToOcr) { + ocr.setMaxPagesToOcr(ocrMaxPagesToOcr); + } + /** * @return whether or not to extract PDActions * @see #setExtractActions(boolean) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java deleted file mode 100644 index 73bad664ed..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.renderer.pdf.mutool; - -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.tika.config.TikaComponent; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.TikaPagedText; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.renderer.PageBasedRenderResults; -import org.apache.tika.renderer.PageRangeRequest; -import org.apache.tika.renderer.RenderRequest; -import org.apache.tika.renderer.RenderResult; -import org.apache.tika.renderer.RenderResults; -import org.apache.tika.renderer.Renderer; -import org.apache.tika.renderer.RenderingTracker; -import org.apache.tika.utils.FileProcessResult; -import org.apache.tika.utils.ProcessUtils; - -@TikaComponent(name = "mupdf-renderer", spi = false) -public class MuPDFRenderer implements Renderer { - - Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("pdf")); - - @Override - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } - - @Override - public RenderResults render(TikaInputStream tis, Metadata metadata, ParseContext parseContext, - RenderRequest... requests) throws IOException, TikaException { - TemporaryResources tmp = new TemporaryResources(); - PageBasedRenderResults results = new PageBasedRenderResults(tmp); - Path path = tis.getPath(); - for (RenderRequest request : requests) { - renderRequest(path, metadata, parseContext, request, results, tmp); - } - return results; - } - - private RenderResults renderRequest(Path pdf, Metadata metadata, ParseContext parseContext, - RenderRequest request, RenderResults results, - TemporaryResources tmp) throws TikaException, IOException { - if (! (request instanceof PageRangeRequest)) { - throw new TikaException("I regret that this renderer can only handle " + - "PageRangeRequests, not " + request.getClass()); - } - PageRangeRequest rangeRequest = (PageRangeRequest)request; - RenderingTracker tracker = parseContext.get(RenderingTracker.class); - if (tracker == null) { - tracker = new RenderingTracker(); - parseContext.set(RenderingTracker.class, tracker); - } - - Path dir = Files.createTempDirectory("tika-render-"); - //TODO -- this assumes files have been deleted first - //do something smarter - tmp.addResource(new Closeable() { - @Override - public void close() throws IOException { - Files.delete(dir); - } - }); - //TODO -- run mutool pages to get page sizes - //and then use that information in the -O to get proper scaling - //etc. - // This would also allow us to run on a single page at a time if that's of any interest - String[] args = createCommandLine(pdf, dir, rangeRequest); - - ProcessBuilder builder = new ProcessBuilder(); - builder.command(args); - //TODO: parameterize timeout - FileProcessResult result = ProcessUtils.execute(builder, 60000, 10, 1000); - if (result.getExitValue() != 0) { - throw new TikaException(result.getStderr()); - } - //TODO -- fix this - Matcher m = Pattern.compile("tika-mutool-render-(\\d+)\\.png").matcher(""); - for (File f : dir.toFile().listFiles()) { - String n = f.getName(); - if (m.reset(n).find()) { - int pageIndex = Integer.parseInt(m.group(1)); - Metadata renderMetadata = Metadata.newInstance(parseContext); - renderMetadata.set(TikaPagedText.PAGE_NUMBER, pageIndex); - renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.RENDERING.name()); - results.add(new RenderResult(RenderResult.STATUS.SUCCESS, tracker.getNextId(), - TikaInputStream.get(f.toPath()), renderMetadata)); - } - } - - return results; - } - - private String[] createCommandLine(Path pdf, Path dir, PageRangeRequest request) { - //TODO parameterize all the things; mutool path, colorspace and size and format and... - List<String> args = new ArrayList<>(); - args.add("mutool"); - args.add("convert"); - args.add("-O colorspace=gray"); - args.add("-o"); - args.add( - ProcessUtils.escapeCommandLine( - dir.toAbsolutePath().toString() + "/" + "tika-mutool-render-%d.png")); - args.add(ProcessUtils.escapeCommandLine(pdf.toAbsolutePath().toString())); - if (request != PageRangeRequest.RENDER_ALL) { - StringBuilder sb = new StringBuilder(); - int cnt = 0; - for (int i = request.getFrom(); i <= request.getTo(); i++) { - if (cnt++ > 0) { - sb.append(","); - } - sb.append(i); - } - args.add(sb.toString()); - } - return args.toArray(new String[0]); - } -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java new file mode 100644 index 0000000000..bdaec24f29 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/poppler/PopplerRenderer.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.renderer.pdf.poppler; + +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.TikaPagedText; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.renderer.PageBasedRenderResults; +import org.apache.tika.renderer.PageRangeRequest; +import org.apache.tika.renderer.RenderRequest; +import org.apache.tika.renderer.RenderResult; +import org.apache.tika.renderer.RenderResults; +import org.apache.tika.renderer.Renderer; +import org.apache.tika.renderer.RenderingTracker; +import org.apache.tika.utils.FileProcessResult; +import org.apache.tika.utils.ProcessUtils; + +/** + * Renderer that uses Poppler's {@code pdftoppm} command to convert PDF + * pages to PNG images. + * <p> + * Poppler is pre-installed on most Linux distributions and is the + * fastest widely-available PDF renderer. On macOS it can be installed + * via {@code brew install poppler}; on Windows via MSYS2 or Chocolatey. + * <p> + * Configuration key: {@code "poppler-renderer"} + * + * @since Apache Tika 4.0 + */ +@TikaComponent(name = "poppler-renderer", spi = false) +public class PopplerRenderer implements Renderer { + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("pdf")); + + /** + * Matches the Poppler output pattern: {@code prefix-01.png}, + * {@code prefix-02.png}, etc. + */ + private static final Pattern PAGE_FILE_PATTERN = + Pattern.compile("tika-poppler-(\\d+)\\.png"); + + private String pdftoppmPath = "pdftoppm"; + private int dpi = 300; + private boolean gray = true; + private int timeoutMs = 120000; + + /** + * Maximum pixel dimension (in pixels) for the longest edge of a rendered + * page image. Maps to pdftoppm's {@code -scale-to} flag. + * <p> + * If a PDF page would render larger than this value (in pixels) at the + * configured DPI, pdftoppm scales the output image down so that its + * longest edge equals {@code maxScaleTo} pixels, preserving the aspect + * ratio. For example, with {@code maxScaleTo=4096}, a landscape page + * that would normally render to 6000×4000 pixels is scaled to + * 4096×2731 pixels instead. + * <p> + * If the rendered image is already smaller than {@code maxScaleTo} + * on both edges, no scaling is applied — the image is not enlarged. + * <p> + * This is the primary defense against pathologically large PDF pages + * (e.g., architectural drawings, maps, posters) that would otherwise + * produce multi-gigabyte images and cause OOM. + * <p> + * Default is 4096 pixels. Set to {@code -1} to disable scaling + * (not recommended). + */ + private int maxScaleTo = 4096; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public RenderResults render(TikaInputStream tis, Metadata metadata, + ParseContext parseContext, + RenderRequest... requests) + throws IOException, TikaException { + TemporaryResources tmp = new TemporaryResources(); + PageBasedRenderResults results = new PageBasedRenderResults(tmp); + Path path = tis.getPath(); + for (RenderRequest request : requests) { + renderRequest(path, metadata, parseContext, request, results, tmp); + } + return results; + } + + private void renderRequest(Path pdf, Metadata metadata, + ParseContext parseContext, + RenderRequest request, + PageBasedRenderResults results, + TemporaryResources tmp) + throws TikaException, IOException { + if (!(request instanceof PageRangeRequest)) { + throw new TikaException( + "I regret that this renderer can only handle " + + "PageRangeRequests, not " + request.getClass()); + } + PageRangeRequest rangeRequest = (PageRangeRequest) request; + + RenderingTracker tracker = parseContext.get(RenderingTracker.class); + if (tracker == null) { + tracker = new RenderingTracker(); + parseContext.set(RenderingTracker.class, tracker); + } + + Path dir = Files.createTempDirectory("tika-render-"); + tmp.addResource(new Closeable() { + @Override + public void close() throws IOException { + Files.delete(dir); + } + }); + + String[] args = createCommandLine(pdf, dir, rangeRequest); + + ProcessBuilder builder = new ProcessBuilder(); + builder.command(args); + FileProcessResult result = ProcessUtils.execute( + builder, timeoutMs, 10, 1000); + if (result.getExitValue() != 0) { + throw new TikaException( + "pdftoppm failed (exit " + result.getExitValue() + + "): " + result.getStderr()); + } + + Matcher m = PAGE_FILE_PATTERN.matcher(""); + File[] files = dir.toFile().listFiles(); + if (files == null) { + return; + } + for (File f : files) { + if (m.reset(f.getName()).find()) { + int pageNumber = Integer.parseInt(m.group(1)); + Metadata renderMetadata = Metadata.newInstance(parseContext); + renderMetadata.set(TikaPagedText.PAGE_NUMBER, pageNumber); + renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.RENDERING + .name()); + results.add(new RenderResult( + RenderResult.STATUS.SUCCESS, + tracker.getNextId(), + f.toPath(), + renderMetadata)); + } + } + } + + String[] createCommandLine(Path pdf, Path dir, + PageRangeRequest request) { + List<String> args = new ArrayList<>(); + args.add(pdftoppmPath); + + // Output format + args.add("-png"); + + // Resolution + args.add("-r"); + args.add(String.valueOf(dpi)); + + // Scale cap — prevents OOM on huge pages + if (maxScaleTo > 0) { + args.add("-scale-to"); + args.add(String.valueOf(maxScaleTo)); + } + + // Colorspace + if (gray) { + args.add("-gray"); + } + + // Page range + if (request != PageRangeRequest.RENDER_ALL) { + args.add("-f"); + args.add(String.valueOf(request.getFrom())); + args.add("-l"); + args.add(String.valueOf(request.getTo())); + } + + // Input PDF + args.add(ProcessUtils.escapeCommandLine( + pdf.toAbsolutePath().toString())); + + // Output prefix (pdftoppm appends -NN.png) + args.add(ProcessUtils.escapeCommandLine( + dir.toAbsolutePath().toString() + "/tika-poppler")); + + return args.toArray(new String[0]); + } + + // ---- config getters/setters ------------------------------------------- + + public String getPdftoppmPath() { + return pdftoppmPath; + } + + /** + * Set the path to the {@code pdftoppm} executable. Defaults to + * {@code "pdftoppm"} (assumes it is on the system path). + */ + public void setPdftoppmPath(String pdftoppmPath) { + this.pdftoppmPath = pdftoppmPath; + } + + public int getDpi() { + return dpi; + } + + /** + * Set the rendering resolution in DPI. Defaults to 300. + */ + public void setDpi(int dpi) { + this.dpi = dpi; + } + + public boolean isGray() { + return gray; + } + + /** + * If true (the default), render in grayscale. Set to false for + * full-color rendering. + */ + public void setGray(boolean gray) { + this.gray = gray; + } + + public int getTimeoutMs() { + return timeoutMs; + } + + /** + * Set the timeout in milliseconds for the pdftoppm process. + * Defaults to 120000 (2 minutes). + */ + public void setTimeoutMs(int timeoutMs) { + this.timeoutMs = timeoutMs; + } + + public int getMaxScaleTo() { + return maxScaleTo; + } + + /** + * Set the maximum pixel dimension (in pixels) for the longest edge + * of rendered page images. Maps to pdftoppm's {@code -scale-to} flag. + * Pages that would render smaller than this are not enlarged. + * <p> + * Default is 4096 pixels. Set to {@code -1} to disable (not recommended). + */ + public void setMaxScaleTo(int maxScaleTo) { + if (maxScaleTo < 1 && maxScaleTo != -1) { + throw new IllegalArgumentException( + "maxScaleTo must be -1 (disabled) or at least 1, got: " + + maxScaleTo); + } + this.maxScaleTo = maxScaleTo; + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java new file mode 100644 index 0000000000..69316a74a8 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.renderer.pdf.poppler; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +import java.io.InputStream; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.TikaPagedText; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.renderer.PageBasedRenderResults; +import org.apache.tika.renderer.PageRangeRequest; +import org.apache.tika.renderer.RenderResult; + +public class PopplerRendererTest { + + private static boolean hasPoppler; + + @BeforeAll + static void checkPoppler() { + hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"}); + } + + @Test + void testRenderAllPages() throws Exception { + assumeTrue(hasPoppler, "pdftoppm not available"); + + PopplerRenderer renderer = new PopplerRenderer(); + + try (InputStream is = getClass().getResourceAsStream( + "/test-documents/testPDF_bookmarks.pdf")) { + assertNotNull(is, "test PDF not found"); + + try (TikaInputStream tis = TikaInputStream.get(is)) { + PageBasedRenderResults results = + (PageBasedRenderResults) renderer.render( + tis, new Metadata(), new ParseContext(), + PageRangeRequest.RENDER_ALL); + + List<RenderResult> allResults = results.getResults(); + assertEquals(2, allResults.size(), + "testPDF_bookmarks.pdf has 2 pages"); + + Set<Integer> pageNumbers = new HashSet<>(); + for (RenderResult rr : allResults) { + assertEquals(RenderResult.STATUS.SUCCESS, rr.getStatus()); + + Metadata rm = rr.getMetadata(); + Integer page = rm.getInt(TikaPagedText.PAGE_NUMBER); + assertNotNull(page, "page number should be set"); + pageNumbers.add(page); + + assertEquals( + TikaCoreProperties.EmbeddedResourceType.RENDERING + .name(), + rm.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + + // Verify we can actually read the rendered image bytes + try (TikaInputStream imageTis = rr.getInputStream()) { + byte[] imageBytes = imageTis.readAllBytes(); + assertTrue(imageBytes.length > 100, + "rendered page should be a non-trivial PNG"); + // PNG magic bytes + assertEquals((byte) 0x89, imageBytes[0]); + assertEquals((byte) 'P', imageBytes[1]); + assertEquals((byte) 'N', imageBytes[2]); + assertEquals((byte) 'G', imageBytes[3]); + } + } + + assertEquals(Set.of(1, 2), pageNumbers, + "should have pages 1 and 2"); + + results.close(); + } + } + } + + @Test + void testRenderPageRange() throws Exception { + assumeTrue(hasPoppler, "pdftoppm not available"); + + PopplerRenderer renderer = new PopplerRenderer(); + + try (InputStream is = getClass().getResourceAsStream( + "/test-documents/testPDF_bookmarks.pdf")) { + assertNotNull(is, "test PDF not found"); + + try (TikaInputStream tis = TikaInputStream.get(is)) { + // Render only page 2 + PageBasedRenderResults results = + (PageBasedRenderResults) renderer.render( + tis, new Metadata(), new ParseContext(), + new PageRangeRequest(2, 2)); + + List<RenderResult> allResults = results.getResults(); + assertEquals(1, allResults.size(), + "should render exactly 1 page"); + + assertEquals(2, + allResults.get(0).getMetadata() + .getInt(TikaPagedText.PAGE_NUMBER)); + + results.close(); + } + } + } + + @Test + void testCustomDpi() throws Exception { + assumeTrue(hasPoppler, "pdftoppm not available"); + + PopplerRenderer renderer = new PopplerRenderer(); + renderer.setDpi(72); + renderer.setGray(false); + + try (InputStream is = getClass().getResourceAsStream( + "/test-documents/testPDF_bookmarks.pdf")) { + assertNotNull(is, "test PDF not found"); + + try (TikaInputStream tis = TikaInputStream.get(is)) { + PageBasedRenderResults results = + (PageBasedRenderResults) renderer.render( + tis, new Metadata(), new ParseContext(), + PageRangeRequest.RENDER_ALL); + + assertEquals(2, results.getResults().size()); + + // 72 DPI should produce smaller images than 300 DPI + try (TikaInputStream imageTis = + results.getResults().get(0).getInputStream()) { + byte[] imageBytes = imageTis.readAllBytes(); + assertTrue(imageBytes.length > 0); + } + + results.close(); + } + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 8dfaac15cc..d5ce0bde8a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -82,7 +82,7 @@ public class PDFParserTest extends TikaTest { public static Level PDFBOX_LOG_LEVEL = Level.INFO; private static Boolean hasTesseract = null; - private static Boolean hasMuPDF = null; + private static Boolean hasPoppler = null; public static boolean canRunOCR() throws TikaConfigException { if (hasTesseract != null) { @@ -92,12 +92,12 @@ public class PDFParserTest extends TikaTest { return hasTesseract; } - public static boolean hasMuPDF() throws TikaConfigException { - if (hasMuPDF != null) { - return hasMuPDF; + public static boolean hasPoppler() throws TikaConfigException { + if (hasPoppler != null) { + return hasPoppler; } - hasMuPDF = ExternalParser.check(new String[]{"mutool", "-v"}); - return hasMuPDF; + hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"}); + return hasPoppler; } @BeforeAll @@ -459,13 +459,10 @@ public class PDFParserTest extends TikaTest { } @Test - @Disabled("there's a subtle problem in setting the bytes in the TikaInputStream that needs to be fixed") - public void testMuPDFInOCR() throws Exception { - //TODO -- need to add "rendered by" to confirm that mutool was actually called - //and that there wasn't some backoff to PDFBox the PDFParser + public void testPopplerInOCR() throws Exception { assumeTrue(canRunOCR(), "can't run OCR"); - assumeTrue(hasMuPDF(), "does not have mupdf"); - Parser p = TikaLoaderHelper.getLoader("tika-rendering-mupdf-config.json").loadAutoDetectParser(); + assumeTrue(hasPoppler(), "does not have poppler (pdftoppm)"); + Parser p = TikaLoaderHelper.getLoader("tika-rendering-poppler-config.json").loadAutoDetectParser(); String text = getText(getResourceAsStream("/test-documents/testOCR.pdf"), p); assertContains("Happy", text.trim()); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-poppler-config.json similarity index 85% rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.json rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-poppler-config.json index 3084474439..a4810ae111 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-poppler-config.json @@ -11,7 +11,7 @@ ], "renderers": [ { - "mupdf-renderer": {} + "poppler-renderer": {} } ] }
