Author: tallison Date: Tue May 27 19:33:07 2014 New Revision: 1597856 URL: http://svn.apache.org/r1597856 Log: TIKA-1294 add ability to turn off image extraction from PDFs
Modified: tika/trunk/CHANGES.txt tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Tue May 27 19:33:07 2014 @@ -1,5 +1,8 @@ Release 1.6 - ??/??/2014 + * Added ability to turn off image extraction from PDFs (TIKA-1294). + Users must now turn on this capability via the PDFParserConfig. 
+ * Upgrade to PDFBox 1.8.5 (TIKA-1290, TIKA-1231, TIKA-1233) * Zip Container Detection for DWFX and XPS formats, which are OPC Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java Tue May 27 19:33:07 2014 @@ -36,6 +36,22 @@ package org.apache.tika.metadata; */ @SuppressWarnings("deprecation") public interface TikaCoreProperties { + + /** + * A file might contain different types of embedded documents. + * The most common is the attachment. + * An "inline" embedded resource should be used for embedded image + * files that are used to render the page image (as in PDXObjImages in PDF files). 
+ * <p> + * Not all parsers have yet implemented this + * + */ + public enum EmbeddedResourceType { + inline, + attachment + }; + + /** * @see DublinCore#FORMAT */ @@ -246,5 +262,13 @@ public interface TikaCoreProperties { @Deprecated public static final Property TRANSITION_SUBJECT_TO_OO_SUBJECT = Property.composite(OfficeOpenXMLCore.SUBJECT, new Property[] { Property.internalText(Metadata.SUBJECT) }); + + /** + * See {@link #EMBEDDED_RESOURCE_TYPE} + */ + public static final Property EMBEDDED_RESOURCE_TYPE = + Property.internalClosedChoise(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE, + new String[]{EmbeddedResourceType.attachment.toString(), EmbeddedResourceType.inline.toString()}); + } Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java Tue May 27 19:33:07 2014 @@ -26,4 +26,7 @@ public interface TikaMetadataKeys { String PROTECTED = "protected"; String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId"; + + String EMBEDDED_RESOURCE_TYPE = "embeddedResourceType"; + } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue May 27 19:33:07 2014 @@ -22,10 +22,12 @@ import 
java.io.IOException; import java.io.Writer; import java.text.SimpleDateFormat; import java.util.Calendar; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import org.apache.pdfbox.pdmodel.PDDocument; @@ -63,6 +65,7 @@ import org.apache.tika.extractor.Parsing import org.apache.tika.io.IOExceptionWithCause; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; @@ -93,6 +96,14 @@ class PDF2XHTML extends PDFTextStripper private boolean inParagraph = false; /** + * This keeps track of the pdf object ids for inline + * images that have been processed. If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()} + * is true, this will be checked before extracting an embedded image. + */ + private Set<String> processedInlineImages = new HashSet<String>(); + + + /** * Converts the given PDF document (and related metadata) to a stream * of XHTML SAX events sent to the given content handler. * @@ -279,14 +290,27 @@ class PDF2XHTML extends PDFTextStripper } private void extractImages(PDResources resources) throws SAXException { - if (resources == null) { + if (resources == null || config.getExtractInlineImages() == false) { return; } - for (PDXObject object : resources.getXObjects().values()) { + for (Map.Entry<String, PDXObject> entry : resources.getXObjects().entrySet()) { + + PDXObject object = entry.getValue(); if (object instanceof PDXObjectForm) { extractImages(((PDXObjectForm) object).getResources()); } else if (object instanceof PDXObjectImage) { + + //Do we only want to process unique COSObject ids? + //If so, have we already processed this one? 
+ if (config.getExtractUniqueInlineImagesOnly() == true) { + String cosObjectId = entry.getKey(); + if (processedInlineImages.contains(cosObjectId)){ + continue; + } + processedInlineImages.add(cosObjectId); + } + PDXObjectImage image = (PDXObjectImage) object; Metadata metadata = new Metadata(); @@ -297,6 +321,8 @@ class PDF2XHTML extends PDFTextStripper } else if (image instanceof PDPixelMap) { metadata.set(Metadata.CONTENT_TYPE, "image/png"); } + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.inline.toString()); EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); @@ -454,6 +480,8 @@ class PDF2XHTML extends PDFTextStripper metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey()); metadata.set(Metadata.CONTENT_TYPE, file.getSubtype()); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize())); + metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, + TikaCoreProperties.EmbeddedResourceType.attachment.toString()); if (extractor.shouldParseEmbedded(metadata)) { TikaInputStream stream = TikaInputStream.get(file.createInputStream()); Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue May 27 19:33:07 2014 @@ -63,6 +63,13 @@ import org.xml.sax.SAXException; * the PDF contains any embedded documents (for example as part of a PDF * package) then this parser will use the {@link EmbeddedDocumentExtractor} * to handle them. 
+ * <p> + * As of Tika 1.6, it is possible to extract inline images with + * the {@link EmbeddedDocumentExtractor} as if they were regular + * attachments. By default, this feature is turned off because of + * the potentially enormous number and size of inline images. To + * turn this feature on, see + * {@link PDFParserConfig#setExtractInlineImages(boolean)}. */ public class PDFParser extends AbstractParser { Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Tue May 27 19:33:07 2014 @@ -1,11 +1,5 @@ package org.apache.tika.parser.pdf; -import java.io.IOException; -import java.io.InputStream; -import java.io.Serializable; -import java.util.Properties; - -import org.apache.pdfbox.util.PDFTextStripper; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -23,6 +17,13 @@ import org.apache.pdfbox.util.PDFTextStr * limitations under the License. */ +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Properties; + +import org.apache.pdfbox.util.PDFTextStripper; + /** * Config for PDFParser. 
* @@ -64,6 +65,13 @@ public class PDFParserConfig implements //True if acroform content should be extracted private boolean extractAcroFormContent = true; + + //True if inline PDXImage objects should be extracted + private boolean extractInlineImages = false; + + //True if inline images (as identified by their object id within + //a pdf file) should only be extracted once. + private boolean extractUniqueInlineImagesOnly = true; //The character width-based tolerance value used to estimate where spaces in text should be added private Float averageCharTolerance; @@ -122,6 +130,12 @@ public class PDFParserConfig implements setExtractAcroFormContent( getProp(props.getProperty("extractAcroFormContent"), getExtractAcroFormContent())); + setExtractInlineImages( + getProp(props.getProperty("extractInlineImages"), + getExtractInlineImages())); + setExtractUniqueInlineImagesOnly( + getProp(props.getProperty("extractUniqueInlineImagesOnly"), + getExtractUniqueInlineImagesOnly())); } /** @@ -163,6 +177,56 @@ public class PDFParserConfig implements return extractAcroFormContent; } + /** + * If true, extract inline embedded PDXObjectImages. + * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain + * thousands of embedded images totaling > 2.5 GB. Also, at least as of PDFBox 1.8.5, + * there can be surprisingly large memory consumption and/or out of memory errors. + * Set to <code>true</code> with caution. + * <p> + * The default is <code>false</code>. + * <p> + * See also: {@link #setExtractUniqueInlineImagesOnly(boolean)}. + * + * @param extractInlineImages + */ + public void setExtractInlineImages(boolean extractInlineImages) { + this.extractInlineImages = extractInlineImages; + } + + /** @see #setExtractInlineImages(boolean) */ + public boolean getExtractInlineImages() { + return extractInlineImages; + } + + /** + * Multiple pages within a PDF file might refer to the same underlying image. 
+ * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the + * parser will call the EmbeddedExtractor each time the image appears on a page. + * This might be desired for some use cases. However, to avoid duplication of + * extracted images, set this to <code>true</code>. The default is <code>true</code>. + * <p> + * Note that uniqueness is determined only by the underlying PDF COSObject id, not by + * file hash or similar equality metric. + * If the PDF actually contains multiple copies of the same image + * -- all with different object ids -- then all images will be extracted. + * <p> + * For this parameter to have any effect, {@link #extractInlineImages} must be + * set to <code>true</code>. + * + * @param extractUniqueInlineImagesOnly + */ + public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { + this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly; + + } + + /** @see #setExtractUniqueInlineImagesOnly(boolean) */ + public boolean getExtractUniqueInlineImagesOnly() { + return extractUniqueInlineImagesOnly; + } + + /** @see #setEnableAutoSpace. */ public boolean getEnableAutoSpace() { return enableAutoSpace; @@ -286,10 +350,19 @@ public class PDFParserConfig implements public int hashCode() { final int prime = 31; int result = 1; + result = prime + * result + + ((averageCharTolerance == null) ? 0 : averageCharTolerance + .hashCode()); result = prime * result + (enableAutoSpace ? 1231 : 1237); result = prime * result + (extractAcroFormContent ? 1231 : 1237); result = prime * result + (extractAnnotationText ? 1231 : 1237); + result = prime * result + (extractInlineImages ? 1231 : 1237); + result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 1237); result = prime * result + (sortByPosition ? 1231 : 1237); + result = prime + * result + + ((spacingTolerance == null) ? 0 : spacingTolerance.hashCode()); result = prime * result + (suppressDuplicateOverlappingText ? 
1231 : 1237); result = prime * result + (useNonSequentialParser ? 1231 : 1237); @@ -305,14 +378,28 @@ public class PDFParserConfig implements if (getClass() != obj.getClass()) return false; PDFParserConfig other = (PDFParserConfig) obj; + if (averageCharTolerance == null) { + if (other.averageCharTolerance != null) + return false; + } else if (!averageCharTolerance.equals(other.averageCharTolerance)) + return false; if (enableAutoSpace != other.enableAutoSpace) return false; if (extractAcroFormContent != other.extractAcroFormContent) return false; if (extractAnnotationText != other.extractAnnotationText) return false; + if (extractInlineImages != other.extractInlineImages) + return false; + if (extractUniqueInlineImagesOnly != other.extractUniqueInlineImagesOnly) + return false; if (sortByPosition != other.sortByPosition) return false; + if (spacingTolerance == null) { + if (other.spacingTolerance != null) + return false; + } else if (!spacingTolerance.equals(other.spacingTolerance)) + return false; if (suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText) return false; if (useNonSequentialParser != other.useNonSequentialParser) @@ -327,6 +414,11 @@ public class PDFParserConfig implements + suppressDuplicateOverlappingText + ", extractAnnotationText=" + extractAnnotationText + ", sortByPosition=" + sortByPosition + ", useNonSequentialParser=" + useNonSequentialParser - + ", extractAcroFormContent=" + extractAcroFormContent + "]"; + + ", extractAcroFormContent=" + extractAcroFormContent + + ", extractInlineImages=" + extractInlineImages + + ", extractUniqueInlineImagesOnly=" + + extractUniqueInlineImagesOnly + ", averageCharTolerance=" + + averageCharTolerance + ", spacingTolerance=" + + spacingTolerance + "]"; } } Modified: tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties (original) +++ tika/trunk/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties Tue May 27 19:33:07 2014 @@ -19,3 +19,5 @@ sortByPosition false suppressDuplicateOverlappingText false useNonSequentialParser false extractAcroFormContent true +extractInlineImages false +extractUniqueInlineImagesOnly true Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Tue May 27 19:33:07 2014 @@ -232,7 +232,11 @@ public abstract class TikaTest { public List<Metadata> getAllMetadata() { return metadatas; - } + } + + public void clear() { + metadatas.clear(); + } } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1597856&r1=1597855&r2=1597856&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue May 27 19:33:07 2014 @@ -34,6 +34,7 @@ import java.util.Set; import org.apache.tika.TikaTest; import org.apache.tika.extractor.ContainerExtractor; 
+import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.extractor.ParserContainerExtractor; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -640,28 +641,35 @@ public class PDFParserTest extends TikaT //"regressiveness" exists only in Unit10.doc not in the container pdf document assertTrue(xml.contains("regressiveness")); - TrackingHandler tracker = new TrackingHandler(); + RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), false); TikaInputStream tis = null; - ContainerExtractor ex = new ParserContainerExtractor(); + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImages(true); + config.setExtractUniqueInlineImagesOnly(false); + context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); + context.set(org.apache.tika.parser.Parser.class, p); + try { tis= TikaInputStream.get( - getResourceAsStream("/test-documents/testPDF_childAttachments.pdf")); - ex.extract(tis, ex, tracker); + getResourceAsStream("/test-documents/testPDF_childAttachments.pdf")); + p.parse(tis, new BodyContentHandler(-1), new Metadata(), context); } finally { if (tis != null) { tis.close(); } } - assertEquals(4, tracker.filenames.size()); - assertEquals(4, tracker.mediaTypes.size()); - assertNull(tracker.filenames.get(0)); - assertNull(tracker.filenames.get(1)); - assertEquals("Press Quality(1).joboptions", tracker.filenames.get(2)); - assertEquals("Unit10.doc", tracker.filenames.get(3)); - assertEquals(MediaType.image("jpeg"), tracker.mediaTypes.get(0)); - assertEquals(MediaType.image("tiff"), tracker.mediaTypes.get(1)); - assertEquals(TYPE_TEXT, tracker.mediaTypes.get(2)); - assertEquals(TYPE_DOC, tracker.mediaTypes.get(3)); + + List<Metadata> metadatas = p.getAllMetadata(); + assertEquals(5, metadatas.size()); + assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY)); + 
assertNull(metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals("Press Quality(1).joboptions", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals("Unit10.doc", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY)); + assertEquals(MediaType.image("jpeg").toString(), metadatas.get(0).get(Metadata.CONTENT_TYPE)); + assertEquals(MediaType.image("tiff").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=ISO-8859-1", metadatas.get(2).get(Metadata.CONTENT_TYPE)); + assertEquals(TYPE_DOC.toString(), metadatas.get(3).get(Metadata.CONTENT_TYPE)); } public void testVersions() throws Exception { @@ -839,4 +847,144 @@ public class PDFParserTest extends TikaT } } } + + @Test + public void testInlineSelector() throws Exception { + + PDFParserConfig config = new PDFParserConfig(); + config.setExtractInlineImages(true); + config.setExtractUniqueInlineImagesOnly(false); + + Parser defaultParser = new AutoDetectParser(); + + RecursiveMetadataParser p = new RecursiveMetadataParser(defaultParser, false); + ParseContext context = new ParseContext(); + context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); + context.set(org.apache.tika.parser.Parser.class, p); + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + String path = "/test-documents/testPDF_childAttachments.pdf"; + InputStream stream = TikaInputStream.get(this.getClass().getResource(path)); + + p.parse(stream, handler, metadata, context); + + List<Metadata> metadatas = p.getAllMetadata(); + int inline = 0; + int attach = 0; + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.inline.toString())){ + inline++; + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.attachment.toString())){ + attach++; + } + } + } + assertEquals(2, inline); + assertEquals(2, attach); + + stream.close(); 
+ p.clear(); + + //now try turning off inline + stream = TikaInputStream.get(this.getClass().getResource(path)); + + context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector()); + inline = 0; + attach = 0; + handler = new BodyContentHandler(-1); + metadata = new Metadata(); + p.parse(stream, handler, metadata, context); + + metadatas = p.getAllMetadata(); + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.inline.toString())){ + inline++; + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.attachment.toString())){ + attach++; + } + } + } + assertEquals(0, inline); + assertEquals(2, attach); + + } + + + @Test + public void testInlineConfig() throws Exception { + + Parser defaultParser = new AutoDetectParser(); + RecursiveMetadataParser p = new RecursiveMetadataParser(defaultParser, false); + ParseContext context = new ParseContext(); + context.set(org.apache.tika.parser.Parser.class, p); + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(-1); + String path = "/test-documents/testPDF_childAttachments.pdf"; + InputStream stream = TikaInputStream.get(this.getClass().getResource(path)); + + p.parse(stream, handler, metadata, context); + + List<Metadata> metadatas = p.getAllMetadata(); + int inline = 0; + int attach = 0; + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.inline.toString())){ + inline++; + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.attachment.toString())){ + attach++; + } + } + } + assertEquals(0, inline); + assertEquals(2, attach); + + stream.close(); + p.clear(); + + //now try turning on inline + stream = TikaInputStream.get(this.getClass().getResource(path)); + PDFParserConfig config = new PDFParserConfig(); + 
config.setExtractInlineImages(true); + config.setExtractUniqueInlineImagesOnly(false); + + context.set(org.apache.tika.parser.pdf.PDFParserConfig.class, config); + inline = 0; + attach = 0; + handler = new BodyContentHandler(-1); + metadata = new Metadata(); + p.parse(stream, handler, metadata, context); + + metadatas = p.getAllMetadata(); + for (Metadata m : metadatas) { + String v = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null) { + if (v.equals(TikaCoreProperties.EmbeddedResourceType.inline.toString())){ + inline++; + } else if (v.equals(TikaCoreProperties.EmbeddedResourceType.attachment.toString())){ + attach++; + } + } + } + assertEquals(2, inline); + assertEquals(2, attach); + } + + + private class AvoidInlineSelector implements DocumentSelector { + + @Override + public boolean select(Metadata metadata) { + String v = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE); + if (v != null && v.equals(TikaCoreProperties.EmbeddedResourceType.inline.toString())){ + return false; + } + return true; + } + } }