This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8028a006f85cf8d72cf132829a212daf80052bce Author: tallison <[email protected]> AuthorDate: Fri Mar 12 16:41:50 2021 -0500 improve robustness of image processing in PDFs --- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 25 ++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index 53b7ee4..3007bfe 100644 --- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -103,6 +103,7 @@ import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.ExceptionUtils; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -448,12 +449,24 @@ class AbstractPDF2XHTML extends PDFTextStripper { try (TemporaryResources tmp = new TemporaryResources()) { int dpi = config.getOcrDPI(); - BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType()); - Path tmpFile = tmp.createTempFile(); - try (OutputStream os = Files.newOutputStream(tmpFile)) { - //TODO: get output format from TesseractConfig - ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), - os, dpi, config.getOcrImageQuality()); + Path tmpFile = null; + try { + BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType()); + tmpFile = tmp.createTempFile(); + try (OutputStream os = Files.newOutputStream(tmpFile)) { + //TODO: get output format from TesseractConfig + ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), + os, dpi, config.getOcrImageQuality()); + } + } catch (SecurityException e) { + //throw SecurityExceptions immediately + throw e; + } catch (IOException|RuntimeException e) { + //image rendering can throw a variety of runtime exceptions, not just IOExceptions... + //need to have a wide catch + metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM, + ExceptionUtils.getStackTrace(e)); + return; } try (InputStream is = TikaInputStream.get(tmpFile)) { metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, ocrImageMediaType.toString());
