This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit e61ead66bac80722c18ba70f9a2e1b2d229080f4 Author: tallison <[email protected]> AuthorDate: Mon Jun 22 12:17:33 2020 -0400 TIKA-3122 -- extract some image metadata without rendering images --- .../tika/exception/ZeroByteFileException.java | 15 ++++++++++ .../apache/tika/parser/RecursiveParserWrapper.java | 6 +++- .../src/test/java/org/apache/tika/TikaTest.java | 5 ++++ .../tika/parser/pdf/ImageGraphicsEngine.java | 34 ++++++++++++++++++++++ .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 3 +- .../java/org/apache/tika/parser/pdf/PDFParser.java | 5 ++++ .../apache/tika/parser/pdf/PDFParserConfig.java | 31 ++++++++++++++++++++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 19 ++++++++++-- 8 files changed, 114 insertions(+), 4 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java index 65e57e8..9232461 100644 --- a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java +++ b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java @@ -22,6 +22,21 @@ package org.apache.tika.exception; */ public class ZeroByteFileException extends TikaException { + + public static class IgnoreZeroByteFileException {} + + //If this is in the parse context, the AutoDetectParser and the + //RecursiveParserWrapper should ignore zero byte files + //and not throw a ZeroByteFileException + /** + * If this is in the {@link org.apache.tika.parser.ParseContext}, the + * {@link org.apache.tika.parser.AutoDetectParser} and the + * {@link org.apache.tika.parser.RecursiveParserWrapper} will + * ignore embedded files with zero-byte length inputstreams + */ + public static IgnoreZeroByteFileException IGNORE_ZERO_BYTE_FILE_EXCEPTION + = new IgnoreZeroByteFileException(); + public ZeroByteFileException(String msg) { super(msg); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java 
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 3f38e32..e9de9ba 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -19,6 +19,7 @@ package org.apache.tika.parser; import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -399,7 +400,10 @@ public class RecursiveParserWrapper extends ParserDecorator { } catch(CorruptedFileException e) { throw e; } catch (TikaException e) { - if (catchEmbeddedExceptions) { + if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null + && e instanceof ZeroByteFileException) { + //do nothing + } else if (catchEmbeddedExceptions) { ParserUtils.recordParserFailure(this, e, metadata); } else { throw e; diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 5c50ea3..e21f752 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -246,6 +246,11 @@ public abstract class TikaTest { } } + protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception { + try (TikaInputStream tis = TikaInputStream.get(path)) { + return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new Metadata(), suppressException); + } + } protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception { try (TikaInputStream tis = TikaInputStream.get(path)) { return getRecursiveMetadata(tis, parser, new ParseContext(), new Metadata(), suppressException); diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java index 95af12d..2e942f0 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java @@ -41,14 +41,19 @@ import org.apache.pdfbox.util.Matrix; import org.apache.pdfbox.util.Vector; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaMemoryLimitException; +import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.IOExceptionWithCause; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.SAXException; @@ -97,6 +102,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine { private final Metadata parentMetadata; private final XHTMLContentHandler xhtml; private final ParseContext parseContext; + private final boolean extractInlineImageMetadataOnly; //TODO: this is an embarrassment of an initializer...fix protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor, @@ -111,6 +117,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine { this.xhtml = xhtml; this.parentMetadata = parentMetadata; this.parseContext = parseContext; + this.extractInlineImageMetadataOnly = 
pdfParserConfig.getExtractInlineImageMetadataOnly(); } void run() throws IOException { @@ -289,6 +296,11 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine { metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + if (extractInlineImageMetadataOnly) { + extractInlineImageMetadataOnly(pdImage, metadata); + return; + } + if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); if (pdImage instanceof PDImageXObject) { @@ -315,6 +327,28 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine { } + private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata) throws IOException, SAXException { + if (pdImage instanceof PDImageXObject) { + PDMetadataExtractor.extract(((PDImageXObject) pdImage).getMetadata(), + metadata, parseContext); + } + metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth()); + metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight()); + //TODO: what else can we extract from the PDImage without rendering? 
+ ZeroByteFileException.IgnoreZeroByteFileException before = + parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class); + try { + parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, + ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION); + embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new byte[0]), + new EmbeddedContentHandler(xhtml), metadata, false); + } finally { + //replace whatever was there before + parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, + before); + } + } + private String getSuffix(PDImage pdImage, Metadata metadata) throws IOException { String suffix = pdImage.getSuffix(); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index 8c2f3f2..572087d 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -147,7 +147,8 @@ class PDF2XHTML extends AbstractPDF2XHTML { } void extractImages(PDPage page) throws SAXException, IOException { - if (config.getExtractInlineImages() == false) { + if (config.getExtractInlineImages() == false + && config.getExtractInlineImageMetadataOnly() == false) { return; } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 6d8b5b1..3b36c99 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -524,6 +524,11 @@ public class PDFParser extends AbstractParser implements Initializable { } @Field + void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) { + defaultConfig.setExtractInlineImageMetadataOnly(extractInlineImageMetadataOnly); + } + + @Field void setAverageCharTolerance(float averageCharTolerance) { 
defaultConfig.setAverageCharTolerance(averageCharTolerance); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index da8b309..9613781 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -105,6 +105,10 @@ public class PDFParserConfig implements Serializable { //True if inline PDXImage objects should be extracted private boolean extractInlineImages = false; + //True if inline images should only have their metadata + //extracted. + private boolean extractInlineImageMetadataOnly = false; + //True if inline images (as identified by their object id within //a pdf file) should only be extracted once. private boolean extractUniqueInlineImagesOnly = true; @@ -215,6 +219,10 @@ public class PDFParserConfig implements Serializable { setExtractUniqueInlineImagesOnly( getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), getExtractUniqueInlineImagesOnly())); + setExtractInlineImageMetadataOnly( + getBooleanProp(props.getProperty("extractInlineImageMetadataOnly"), + getExtractInlineImageMetadataOnly()) + ); setExtractFontNames( getBooleanProp(props.getProperty("extractFontNames"), getExtractFontNames())); @@ -264,6 +272,29 @@ public class PDFParserConfig implements Serializable { } /** + * Use this when you want to know how many images of what formats are in a PDF + * but you don't need to render the images (e.g. for OCR). This is far + * faster than {@link #extractInlineImages} because it doesn't have to render the + * images, which can be very slow. This does not extract metadata from + * within each image, rather it extracts the XMP that may be stored + * external to an image in PDImageXObjects. 
+ * + * @param extractInlineImageMetadataOnly + * @since 1.25 + */ + void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) { + this.extractInlineImageMetadataOnly = extractInlineImageMetadataOnly; + } + + /** + * + * @return whether or not to extract only inline image metadata and not render the images + */ + boolean getExtractInlineImageMetadataOnly() { + return extractInlineImageMetadataOnly; + } + + /** * If the PDF contains marked content, try to extract text and its marked structure. * If the PDF does not contain marked content, backoff to the regular PDF2XHTML for * text extraction. As of 1.24, this is an "alpha" version. diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index f9cbffd..9e267dd 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -26,7 +26,6 @@ import static org.junit.Assert.fail; import static org.junit.Assume.assumeTrue; import java.io.InputStream; -import java.nio.file.Paths; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -47,6 +46,7 @@ import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.extractor.ParserContainerExtractor; @@ -1596,6 +1596,22 @@ public class PDFParserTest extends TikaTest { assertEquals("Hewlett-Packard MFP", m.get(XMP.CREATOR_TOOL)); assertEquals("1998-08-29T13:53:15Z", m.get(XMP.CREATE_DATE)); } + + @Test + public void testExtractInlineImageMetadata() throws Exception { + ParseContext context = new 
ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImageMetadataOnly(true); + context.set(PDFParserConfig.class, config); + List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf", context); + assertNull(context.get(ZeroByteFileException.IgnoreZeroByteFileException.class)); + assertEquals(2, metadataList.size()); + assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE)); + assertEquals("/image0.png", metadataList.get(1).get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)); + assertEquals(261, (int)metadataList.get(1).getInt(Metadata.IMAGE_LENGTH)); + assertEquals(934, (int)metadataList.get(1).getInt(Metadata.IMAGE_WIDTH)); + assertEquals("image0.png", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + } /** * Simple class to count end of document events. If functionality is useful, * move to org.apache.tika in src/test @@ -1625,5 +1641,4 @@ public class PDFParserTest extends TikaTest { } } - }
