This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 40a06ae TIKA-3122 -- extract some image metadata without rendering
images
40a06ae is described below
commit 40a06ae1277f0b211326a1121c2be615f8aefb3e
Author: tallison <[email protected]>
AuthorDate: Mon Jun 22 12:17:33 2020 -0400
TIKA-3122 -- extract some image metadata without rendering images
---
.../tika/exception/ZeroByteFileException.java | 15 ++++++++++
.../apache/tika/parser/RecursiveParserWrapper.java | 6 +++-
.../src/test/java/org/apache/tika/TikaTest.java | 5 ++++
.../tika/parser/pdf/ImageGraphicsEngine.java | 34 ++++++++++++++++++++++
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 3 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 5 ++++
.../apache/tika/parser/pdf/PDFParserConfig.java | 31 ++++++++++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 21 +++++++++++--
8 files changed, 115 insertions(+), 5 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
index 65e57e8..9232461 100644
---
a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
+++
b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
@@ -22,6 +22,21 @@ package org.apache.tika.exception;
*/
public class ZeroByteFileException extends TikaException {
+
+ public static class IgnoreZeroByteFileException {}
+
+ //If this is in the parse context, the AutoDetectParser and the
+ //RecursiveParserWrapper should ignore zero byte files
+ //and not throw a ZeroByteFileException
+ /**
+ * If this is in the {@link org.apache.tika.parser.ParseContext}, the
+ * {@link org.apache.tika.parser.AutoDetectParser} and the
+ * {@link org.apache.tika.parser.RecursiveParserWrapper} will
+ * ignore embedded files with zero-byte length inputstreams
+ */
+ public static IgnoreZeroByteFileException IGNORE_ZERO_BYTE_FILE_EXCEPTION
+ = new IgnoreZeroByteFileException();
+
public ZeroByteFileException(String msg) {
super(msg);
}
diff --git
a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 2c94ae0..7f7d3dc 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser;
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -398,7 +399,10 @@ public class RecursiveParserWrapper extends
ParserDecorator {
} catch(CorruptedFileException e) {
throw e;
} catch (TikaException e) {
- if (catchEmbeddedExceptions) {
+ if
(context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null
+ && e instanceof ZeroByteFileException) {
+ //do nothing
+ } else if (catchEmbeddedExceptions) {
ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 50d4590..5e60ce0 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -247,6 +247,11 @@ public abstract class TikaTest {
}
}
+ protected List<Metadata> getRecursiveMetadata(Path path, ParseContext
context, boolean suppressException) throws Exception {
+ try (TikaInputStream tis = TikaInputStream.get(path)) {
+ return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new
Metadata(), suppressException);
+ }
+ }
protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
boolean suppressException) throws Exception {
try (TikaInputStream tis = TikaInputStream.get(path)) {
return getRecursiveMetadata(tis, parser, new ParseContext(), new
Metadata(), suppressException);
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
index 6301abf..2d83bee 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -41,14 +41,19 @@ import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -97,6 +102,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
private final Metadata parentMetadata;
private final XHTMLContentHandler xhtml;
private final ParseContext parseContext;
+ private final boolean extractInlineImageMetadataOnly;
//TODO: this is an embarrassment of an initializer...fix
protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor
embeddedDocumentExtractor,
@@ -111,6 +117,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
this.xhtml = xhtml;
this.parentMetadata = parentMetadata;
this.parseContext = parseContext;
+ this.extractInlineImageMetadataOnly =
pdfParserConfig.getExtractInlineImageMetadataOnly();
}
void run() throws IOException {
@@ -289,6 +296,11 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+ if (extractInlineImageMetadataOnly) {
+ extractInlineImageMetadataOnly(pdImage, metadata);
+ return;
+ }
+
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
if (pdImage instanceof PDImageXObject) {
@@ -315,6 +327,28 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
+ private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata
metadata) throws IOException, SAXException {
+ if (pdImage instanceof PDImageXObject) {
+ PDMetadataExtractor.extract(((PDImageXObject)
pdImage).getMetadata(),
+ metadata, parseContext);
+ }
+ metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth());
+ metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight());
+ //TODO: what else can we extract from the PDImage without rendering?
+ ZeroByteFileException.IgnoreZeroByteFileException before =
+
parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class);
+ try {
+
parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
+ ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
+ embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new
byte[0]),
+ new EmbeddedContentHandler(xhtml), metadata, false);
+ } finally {
+ //replace whatever was there before
+
parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
+ before);
+ }
+ }
+
private String getSuffix(PDImage pdImage, Metadata metadata) throws
IOException {
String suffix = pdImage.getSuffix();
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 8c2f3f2..572087d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -147,7 +147,8 @@ class PDF2XHTML extends AbstractPDF2XHTML {
}
void extractImages(PDPage page) throws SAXException, IOException {
- if (config.getExtractInlineImages() == false) {
+ if (config.getExtractInlineImages() == false
+ && config.getExtractInlineImageMetadataOnly() == false) {
return;
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index ec80dc4..c09ca81 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -504,6 +504,11 @@ public class PDFParser extends AbstractParser implements
Initializable {
}
@Field
+ void setExtractInlineImageMetadataOnly(boolean
extractInlineImageMetadataOnly) {
+
defaultConfig.setExtractInlineImageMetadataOnly(extractInlineImageMetadataOnly);
+ }
+
+ @Field
void setAverageCharTolerance(float averageCharTolerance) {
defaultConfig.setAverageCharTolerance(averageCharTolerance);
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index fc8bea6..81d7e0f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -105,6 +105,10 @@ public class PDFParserConfig implements Serializable {
//True if inline PDXImage objects should be extracted
private boolean extractInlineImages = false;
+ //True if inline images should only have their metadata
+ //extracted.
+ private boolean extractInlineImageMetadataOnly = false;
+
//True if inline images (as identified by their object id within
//a pdf file) should only be extracted once.
private boolean extractUniqueInlineImagesOnly = true;
@@ -211,6 +215,10 @@ public class PDFParserConfig implements Serializable {
setExtractUniqueInlineImagesOnly(
getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
getExtractUniqueInlineImagesOnly()));
+ setExtractInlineImageMetadataOnly(
+
getBooleanProp(props.getProperty("extractInlineImageMetadataOnly"),
+ getExtractInlineImageMetadataOnly())
+ );
setExtractFontNames(
getBooleanProp(props.getProperty("extractFontNames"),
getExtractFontNames()));
@@ -258,6 +266,29 @@ public class PDFParserConfig implements Serializable {
}
/**
+ * Use this when you want to know how many images of what formats are in a
PDF
+ * but you don't need to render the images (e.g. for OCR). This is far
+ * faster than {@link #extractInlineImages} because it doesn't have to
render the
+ * images, which can be very slow. This does not extract metadata from
+ * within each image, rather it extracts the XMP that may be stored
+ * external to an image in PDImageXObjects.
+ *
+ * @param extractInlineImageMetadataOnly whether to extract only inline image metadata and not render the images
+ * @since 1.25
+ */
+ void setExtractInlineImageMetadataOnly(boolean
extractInlineImageMetadataOnly) {
+ this.extractInlineImageMetadataOnly = extractInlineImageMetadataOnly;
+ }
+
+ /**
+ *
+ * @return whether or not to extract only inline image metadata and not
render the images
+ */
+ boolean getExtractInlineImageMetadataOnly() {
+ return extractInlineImageMetadataOnly;
+ }
+
+ /**
* If the PDF contains marked content, try to extract text and its marked
structure.
* If the PDF does not contain marked content, backoff to the regular
PDF2XHTML for
* text extraction. As of 1.24, this is an "alpha" version.
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 0dea151..ef1bd3c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -26,7 +26,6 @@ import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
-import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -47,6 +46,7 @@ import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.DocumentSelector;
import org.apache.tika.extractor.ParserContainerExtractor;
@@ -491,7 +491,7 @@ public class PDFParserTest extends TikaTest {
// Column text is now interleaved:
assertContains("Left column line 1 Right column line 1 Left colu mn
line 2 Right column line 2", content);
- //now try setting autodetect via parsecontext
+ //now try setting autodetect via parsecontext
ParseContext context = new ParseContext();
PDFParserConfig config = new PDFParserConfig();
context.set(PDFParserConfig.class, config);
@@ -1564,6 +1564,22 @@ public class PDFParserTest extends TikaTest {
assertEquals("Hewlett-Packard MFP", m.get(XMP.CREATOR_TOOL));
assertEquals("1998-08-29T13:53:15Z", m.get(XMP.CREATE_DATE));
}
+
+ @Test
+ public void testExtractInlineImageMetadata() throws Exception {
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractInlineImageMetadataOnly(true);
+ context.set(PDFParserConfig.class, config);
+ List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf",
context);
+
assertNull(context.get(ZeroByteFileException.IgnoreZeroByteFileException.class));
+ assertEquals(2, metadataList.size());
+ assertEquals("image/png",
metadataList.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals("/image0.png",
metadataList.get(1).get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
+ assertEquals(261,
(int)metadataList.get(1).getInt(Metadata.IMAGE_LENGTH));
+ assertEquals(934,
(int)metadataList.get(1).getInt(Metadata.IMAGE_WIDTH));
+ assertEquals("image0.png",
metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ }
/**
* Simple class to count end of document events. If functionality is
useful,
* move to org.apache.tika in src/test
@@ -1593,5 +1609,4 @@ public class PDFParserTest extends TikaTest {
}
}
-
}