[4/5] tika git commit: TIKA-2059 - Merge multimedia and pdf parser modules and bundles

bob Sun, 28 Aug 2016 09:30:21 -0700

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
new file mode 100644
index 0000000..f735f25
--- /dev/null
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -0,0 +1,626 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.stream.XMLStreamException;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Calendar;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchema;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.common.PDMetadata;
+import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
+import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.Document;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * PDF parser.
+ * <p/>
+ * This parser can also process encrypted PDF documents if the required
+ * password is given as a part of the input metadata associated with a
+ * document. If no password is given, then this parser will try decrypting
+ * the document using the empty password that's often used with PDFs. If
+ * the PDF contains any embedded documents (for example as part of a PDF
+ * package) then this parser will use the {@link EmbeddedDocumentExtractor}
+ * to handle them.
+ * <p/>
+ * As of Tika 1.6, it is possible to extract inline images with
+ * the {@link EmbeddedDocumentExtractor} as if they were regular
+ * attachments.  By default, this feature is turned off because of
+ * the potentially enormous number and size of inline images.  To
+ * turn this feature on, see
+ * {@link PDFParserConfig#setExtractInlineImages(boolean)}.
+ */
+public class PDFParser extends AbstractParser {
+
+
+    /**
+     * Metadata key for giving the document password to the parser.
+     *
+     * @since Apache Tika 0.5
+     * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
+     */
+    @Deprecated
+    public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
+    /**
+     * The single media type handled by this parser.
+     */
+    private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+    /**
+     * Serial version UID
+     */
+    private static final long serialVersionUID = -752276948656079347L;
+    /**
+     * Value returned by {@link #getSupportedTypes(ParseContext)}.
+     */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MEDIA_TYPE);
+    /**
+     * Configuration used when the {@link ParseContext} does not carry a
+     * {@link PDFParserConfig} of its own.
+     */
+    private PDFParserConfig defaultConfig = new PDFParserConfig();
+
+
+
+    /**
+     * Reports the media types this parser handles.
+     *
+     * @param context parse context; not consulted by this implementation
+     * @return the constant singleton set built from {@code MEDIA_TYPE}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses a PDF document.
+     * <p/>
+     * The document is loaded with the password returned by
+     * {@code getPassword}, its metadata is extracted, the configured
+     * {@link AccessChecker} is consulted, and then (if a handler was given)
+     * the content is emitted through one of three paths: XFA only, OCR only,
+     * or regular text extraction (optionally combined with OCR).
+     *
+     * @param stream   the PDF; if it is a {@link TikaInputStream} backed by a
+     *                 file, that file is handed to PDFBox directly, otherwise
+     *                 the stream is read through a {@link CloseShieldInputStream}
+     * @param handler  receives the XHTML output; if {@code null} only metadata
+     *                 is extracted
+     * @param metadata populated with the document metadata
+     * @param context  may carry a {@link PDFParserConfig} and a
+     *                 {@link PasswordProvider}
+     * @throws EncryptedDocumentException if PDFBox rejects the password; in
+     *                                    that case {@code pdf:encrypted} is set
+     *                                    to {@code "true"} first
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        PDDocument pdfDocument = null;
+        //config from context, or default if not set via context
+        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
+        try {
+            // PDFBox can process entirely in memory, or can use a temp file
+            //  for unpacked / processed resources
+            // Decide which to do based on if we're reading from a file or not already
+            //TODO: make this configurable via MemoryUsageSetting
+            TikaInputStream tstream = TikaInputStream.cast(stream);
+            String password = getPassword(metadata, context);
+            if (tstream != null && tstream.hasFile()) {
+                // File based -- send file directly to PDFBox
+                pdfDocument = PDDocument.load(tstream.getPath().toFile(), password);
+            } else {
+                // shield the caller's stream so that it is not closed here
+                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password);
+            }
+            metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));
+
+            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
+            extractMetadata(pdfDocument, metadata, context);
+
+            // the access check runs after metadata extraction because it
+            // is handed the populated metadata object
+            AccessChecker checker = localConfig.getAccessChecker();
+            checker.check(metadata);
+            if (handler != null) {
+                if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
+                    handleXFAOnly(pdfDocument, handler, metadata, context);
+                } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
+                    metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
+                    OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+                } else {
+                    if (localConfig.getOCRStrategy().equals(
+                            PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+                        metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
+                    }
+                    PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
+                }
+
+            }
+        } catch (InvalidPasswordException e) {
+            metadata.set("pdf:encrypted", "true");
+            throw new EncryptedDocumentException(e);
+        } finally {
+            if (pdfDocument != null) {
+                pdfDocument.close();
+            }
+        }
+    }
+
+    /**
+     * Resolves the password to open the document with.
+     * <p/>
+     * Order of precedence: a {@link PasswordProvider} found on the context,
+     * then the legacy {@link #PASSWORD} metadata entry, then the empty string.
+     *
+     * @param metadata document metadata, passed to the provider and searched
+     *                 for the legacy key
+     * @param context  parse context that may hold a {@link PasswordProvider}
+     * @return the password, never {@code null}
+     */
+    private String getPassword(Metadata metadata, ParseContext context) {
+        // New style: ask the provider, if one was supplied
+        PasswordProvider provider = context.get(PasswordProvider.class);
+        String fromProvider = (provider == null) ? null : provider.getPassword(metadata);
+        if (fromProvider != null) {
+            return fromProvider;
+        }
+
+        // Old style: password placed directly in the metadata
+        String legacy = metadata.get(PASSWORD);
+
+        // Nothing given at all: default to the empty password
+        return (legacy != null) ? legacy : "";
+    }
+
+
+    private void extractMetadata(PDDocument document, Metadata metadata, 
ParseContext context)
+            throws TikaException {
+
+        //first extract AccessPermissions
+        AccessPermission ap = document.getCurrentAccessPermission();
+        metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
+                Boolean.toString(ap.canExtractForAccessibility()));
+        metadata.set(AccessPermissions.EXTRACT_CONTENT,
+                Boolean.toString(ap.canExtractContent()));
+        metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
+                Boolean.toString(ap.canAssembleDocument()));
+        metadata.set(AccessPermissions.FILL_IN_FORM,
+                Boolean.toString(ap.canFillInForm()));
+        metadata.set(AccessPermissions.CAN_MODIFY,
+                Boolean.toString(ap.canModify()));
+        metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
+                Boolean.toString(ap.canModifyAnnotations()));
+        metadata.set(AccessPermissions.CAN_PRINT,
+                Boolean.toString(ap.canPrint()));
+        metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
+                Boolean.toString(ap.canPrintDegraded()));
+
+
+        //now go for the XMP
+        Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), 
context);
+
+        XMPMetadata xmp = null;
+        if (dom != null) {
+            xmp = new XMPMetadata(dom);
+        }
+        XMPSchemaDublinCore dcSchema = null;
+        try {
+            if (document.getDocumentCatalog().getMetadata() != null) {
+                InputStream xmpIs = 
document.getDocumentCatalog().getMetadata().exportXMPMetadata();
+                xmp = XMPMetadata.load(xmpIs);
+            }
+        } catch (IOException e) {}
+
+        if (xmp != null) {
+            try {
+                dcSchema = xmp.getDublinCoreSchema();
+            } catch (IOException e) {}
+
+            JempboxExtractor.extractXMPMM(xmp, metadata);
+        }
+
+        PDDocumentInformation info = document.getDocumentInformation();
+        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
+        extractMultilingualItems(metadata, TikaCoreProperties.TITLE, 
info.getTitle(), dcSchema);
+        extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, 
info.getAuthor(), dcSchema);
+        extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, 
null, dcSchema);
+        addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, 
info.getCreator());
+        addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
+        addMetadata(metadata, "producer", info.getProducer());
+        extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, 
null, dcSchema);
+
+        // TODO: Move to description in Tika 2.0
+        addMetadata(metadata, 
TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
+        addMetadata(metadata, "trapped", info.getTrapped());
+            // TODO Remove these in Tika 2.0
+        addMetadata(metadata, "created", info.getCreationDate());
+        addMetadata(metadata, TikaCoreProperties.CREATED, 
info.getCreationDate());
+        Calendar modified = info.getModificationDate();
+        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
+        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
+
+        // All remaining metadata is custom
+        // Copy this over as-is
+        List<String> handledMetadata = Arrays.asList("Author", "Creator", 
"CreationDate", "ModDate",
+                "Keywords", "Producer", "Subject", "Title", "Trapped");
+        for (COSName key : info.getCOSObject().keySet()) {
+            String name = key.getName();
+            if (!handledMetadata.contains(name)) {
+                addMetadata(metadata, name, 
info.getCOSObject().getDictionaryObject(key));
+            }
+        }
+
+        //try to get the various versions
+        //Caveats:
+        //    there is currently a fair amount of redundancy
+        //    TikaCoreProperties.FORMAT can be multivalued
+        //    There are also three potential pdf specific version keys: 
pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion        
+        metadata.set("pdf:PDFVersion", 
Float.toString(document.getDocument().getVersion()));
+        metadata.add(TikaCoreProperties.FORMAT.getName(),
+                MEDIA_TYPE.toString() + "; version=" +
+                        Float.toString(document.getDocument().getVersion()));
+
+        try {
+            if (xmp != null) {
+                xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, 
XMPSchemaPDFAId.class);
+                XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) 
xmp.getSchemaByClass(XMPSchemaPDFAId.class);
+                if (pdfaxmp != null) {
+                    if (pdfaxmp.getPart() != null) {
+                        metadata.set("pdfaid:part", 
Integer.toString(pdfaxmp.getPart()));
+                    }
+                    if (pdfaxmp.getConformance() != null) {
+                        metadata.set("pdfaid:conformance", 
pdfaxmp.getConformance());
+                        String version = "A-" + pdfaxmp.getPart() + 
pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
+                        metadata.set("pdfa:PDFVersion", version);
+                        metadata.add(TikaCoreProperties.FORMAT.getName(),
+                                MEDIA_TYPE.toString() + "; version=\"" + 
version + "\"");
+                    }
+                }
+                // TODO WARN if this XMP version is inconsistent with document 
header version?          
+            }
+        } catch (IOException e) {
+            metadata.set(TikaCoreProperties.TIKA_META_PREFIX + 
"pdf:metadata-xmp-parse-failed", "" + e);
+        }
+        //TODO: Let's try to move this into PDFBox.
+        //Attempt to determine Adobe extension level, if present:
+        COSDictionary root = document.getDocumentCatalog().getCOSObject();
+        COSDictionary extensions = (COSDictionary) 
root.getDictionaryObject(COSName.getPDFName("Extensions"));
+        if (extensions != null) {
+            for (COSName extName : extensions.keySet()) {
+                // If it's an Adobe one, interpret it to determine the 
extension level:
+                if (extName.equals(COSName.getPDFName("ADBE"))) {
+                    COSDictionary adobeExt = (COSDictionary) 
extensions.getDictionaryObject(extName);
+                    if (adobeExt != null) {
+                        String baseVersion = 
adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
+                        int el = 
adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
+                        //-1 is sentinel value that something went wrong in 
getInt
+                        if (el != -1) {
+                            metadata.set("pdf:PDFExtensionVersion", 
baseVersion + " Adobe Extension Level " + el);
+                            metadata.add(TikaCoreProperties.FORMAT.getName(),
+                                    MEDIA_TYPE.toString() + "; version=\"" + 
baseVersion + " Adobe Extension Level " + el + "\"");
+                        }
+                    }
+                } else {
+                    // WARN that there is an Extension, but it's not Adobe's, 
and so is a 'new' format'.
+                    metadata.set("pdf:foundNonAdobeExtensionName", 
extName.getName());
+                }
+            }
+        }
+    }
+
+    /**
+     * Try to extract all multilingual items from the XMPSchema
+     * <p/>
+     * This relies on the property having a valid xmp getName()
+     * <p/>
+     * For now, this only extracts the first language if the property does not 
allow multiple values (see TIKA-1295)
+     *
+     * @param metadata
+     * @param property
+     * @param pdfBoxBaseline
+     * @param schema
+     */
+    private void extractMultilingualItems(Metadata metadata, Property property,
+                                          String pdfBoxBaseline, XMPSchema 
schema) {
+        //if schema is null, just go with pdfBoxBaseline
+        if (schema == null) {
+            if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+                addMetadata(metadata, property, pdfBoxBaseline);
+            }
+            return;
+        }
+
+        for (String lang : 
schema.getLanguagePropertyLanguages(property.getName())) {
+            String value = schema.getLanguageProperty(property.getName(), 
lang);
+
+            if (value != null && value.length() > 0) {
+                //if you're going to add it below in the baseline addition, 
don't add it now
+                if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) {
+                    continue;
+                }
+                addMetadata(metadata, property, value);
+                if (!property.isMultiValuePermitted()) {
+                    return;
+                }
+            }
+        }
+
+        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+            //if we've already added something above and multivalue is not 
permitted
+            //return.
+            if (!property.isMultiValuePermitted()) {
+                if (metadata.get(property) != null) {
+                    return;
+                }
+            }
+            addMetadata(metadata, property, pdfBoxBaseline);
+        }
+    }
+
+
+    /**
+     * Reads a bag or sequence list from a particular property in
+     * XMPSchemaDublinCore and adds every entry to the metadata, followed by
+     * the pdfBoxBaseline.
+     * <p/>
+     * The pdfBoxBaseline should be the value that pdfbox returns from its
+     * PDDocumentInformation object (e.g. getAuthor()).  This method is
+     * designed to include the pdfBoxBaseline exactly once: list entries equal
+     * to it are skipped and the baseline itself is added last.  If there is
+     * no schema or no list, only the baseline is added.  If the baseline is
+     * null, all list entries are added.
+     * <p/>
+     * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
+     * on dates!
+     * <p/>
+     * This relies on the property having a DublinCore compliant getName()
+     *
+     * @param metadata       metadata object to add to
+     * @param property       property whose name is looked up in the schema
+     * @param pdfBoxBaseline value from the document information dictionary; may be null
+     * @param dc             Dublin Core schema; may be null
+     */
+    private void extractDublinCoreListItems(Metadata metadata, Property property,
+                                            String pdfBoxBaseline, XMPSchemaDublinCore dc) {
+        List<String> items = (dc == null) ? null : getXMPBagOrSeqList(dc, property.getName());
+        if (items != null) {
+            for (String item : items) {
+                //equals(null) is false, so with no baseline every entry is kept;
+                //with a baseline only its duplicates are left out
+                if (item != null && !item.equals(pdfBoxBaseline)) {
+                    addMetadata(metadata, property, item);
+                }
+            }
+        }
+        //finally, add the baseline
+        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
+            addMetadata(metadata, property, pdfBoxBaseline);
+        }
+    }
+
+    /**
+     * Looks a property up first as a bag and, if that yields nothing, as a
+     * sequence list.
+     * <p/>
+     * As of this writing, XMPSchema can contain bags or sequence lists
+     * for some attributes...despite standards documentation.
+     * JempBox expects one or the other for specific attributes.
+     * Until more flexibility is added to JempBox, Tika will have to handle both.
+     *
+     * @param schema schema to query
+     * @param name   property name
+     * @return list of values or null
+     */
+    private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
+        List<String> bag = schema.getBagList(name);
+        return (bag != null) ? bag : schema.getSequenceList(name);
+    }
+
+    /**
+     * Adds a decoded string value under a {@link Property}.  Null values are
+     * ignored, and a property that does not permit multiple values is left
+     * alone if it already holds one.
+     */
+    private void addMetadata(Metadata metadata, Property property, String value) {
+        if (value == null) {
+            return;
+        }
+        String decoded = decode(value);
+        boolean alreadySet = metadata.get(property) != null;
+        //silently skip adding property that already exists if multiple
+        //values are not permitted
+        if (property.isMultiValuePermitted() || !alreadySet) {
+            metadata.add(property, decoded);
+        }
+    }
+
+    /**
+     * Adds a decoded string value under a plain metadata name; null values
+     * are ignored.
+     */
+    private void addMetadata(Metadata metadata, String name, String value) {
+        if (value == null) {
+            return;
+        }
+        metadata.add(name, decode(value));
+    }
+
+    /**
+     * Runs the value through {@link PDFEncodedStringDecoder} when that class
+     * reports that it should be decoded; otherwise returns it unchanged.
+     */
+    private String decode(String value) {
+        if (!PDFEncodedStringDecoder.shouldDecode(value)) {
+            return value;
+        }
+        return new PDFEncodedStringDecoder().decode(value);
+    }
+
+    /**
+     * Sets a date under a plain metadata name using the {@code toString()}
+     * form of the calendar's time; null values are ignored.
+     */
+    private void addMetadata(Metadata metadata, String name, Calendar value) {
+        if (value == null) {
+            return;
+        }
+        String rendered = value.getTime().toString();
+        metadata.set(name, rendered);
+    }
+
+    /**
+     * Sets a date under a {@link Property}; null values are ignored.
+     */
+    private void addMetadata(Metadata metadata, Property property, Calendar value) {
+        if (value == null) {
+            return;
+        }
+        metadata.set(property, value.getTime());
+    }
+
+    /**
+     * Used when processing custom metadata entries, as PDFBox won't do
+     * the conversion for us in the way it does for the standard ones.
+     * Arrays are flattened element by element, strings are added as their
+     * string value, dictionaries are skipped and anything else is added via
+     * {@code toString()}.
+     */
+    private void addMetadata(Metadata metadata, String name, COSBase value) {
+        if (value == null) {
+            return;
+        }
+        if (value instanceof COSArray) {
+            for (Object element : ((COSArray) value).toList()) {
+                addMetadata(metadata, name, ((COSBase) element));
+            }
+            return;
+        }
+        if (value instanceof COSString) {
+            addMetadata(metadata, name, ((COSString) value).getString());
+            return;
+        }
+        // Avoid calling COSDictionary#toString, since it can lead to infinite
+        // recursion. See TIKA-1038 and PDFBOX-1835.
+        if (!(value instanceof COSDictionary)) {
+            addMetadata(metadata, name, value.toString());
+        }
+    }
+
+
+    /**
+     * Tells whether only the XFA should be extracted: the config must ask for
+     * it and the document must have a catalog, an AcroForm and an XFA.
+     */
+    private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
+        return config.getIfXFAExtractOnlyXFA()
+                && pdDocument.getDocumentCatalog() != null
+                && pdDocument.getDocumentCatalog().getAcroForm() != null
+                && pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null;
+    }
+
+    /**
+     * Writes the document's XFA, and nothing else, to the handler as XHTML.
+     *
+     * @throws TikaException wrapping any {@link XMLStreamException} raised
+     *                       while the XFA is read
+     */
+    private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler,
+                               Metadata metadata, ParseContext context)
+        throws SAXException, IOException, TikaException {
+        XFAExtractor extractor = new XFAExtractor();
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        byte[] xfaBytes = pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
+        try (InputStream xfaStream = new ByteArrayInputStream(xfaBytes)) {
+            extractor.extract(xfaStream, xhtml, metadata, context);
+        } catch (XMLStreamException e) {
+            throw new TikaException("XML error in XFA", e);
+        }
+        xhtml.endDocument();
+    }
+
+    /**
+     * @return the configuration this parser uses when the ParseContext does
+     * not supply one; the object itself is returned, not a copy
+     */
+    public PDFParserConfig getPDFParserConfig() {
+        return this.defaultConfig;
+    }
+
+    /**
+     * Replaces the configuration this parser uses when the ParseContext does
+     * not supply one.
+     *
+     * @param config the new default configuration
+     */
+    public void setPDFParserConfig(PDFParserConfig config) {
+        defaultConfig = config;
+    }
+
+    /**
+     * Returns the auto-space setting of the default configuration.
+     *
+     * @see #setEnableAutoSpace(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    @Deprecated
+    public boolean getEnableAutoSpace() {
+        return defaultConfig.getEnableAutoSpace();
+    }
+
+    /**
+     * If true (the default), the parser should estimate
+     * where spaces should be inserted between words.  For
+     * many PDFs this is necessary as they do not include
+     * explicit whitespace characters.
+     *
+     * @param v new value, applied to the default configuration
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    @Deprecated
+    public void setEnableAutoSpace(boolean v) {
+        defaultConfig.setEnableAutoSpace(v);
+    }
+
+    /**
+     * If true, text in annotations will be extracted.
+     *
+     * @see #setExtractAnnotationText(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    @Deprecated
+    public boolean getExtractAnnotationText() {
+        return defaultConfig.getExtractAnnotationText();
+    }
+
+    /**
+     * If true (the default), text in annotations will be
+     * extracted.
+     *
+     * @param v new value, applied to the default configuration
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    @Deprecated
+    public void setExtractAnnotationText(boolean v) {
+        defaultConfig.setExtractAnnotationText(v);
+    }
+
+    /**
+     * Returns the duplicate-overlapping-text setting of the default
+     * configuration.
+     *
+     * @see #setSuppressDuplicateOverlappingText(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    @Deprecated
+    public boolean getSuppressDuplicateOverlappingText() {
+        return defaultConfig.getSuppressDuplicateOverlappingText();
+    }
+
+    /**
+     * If true, the parser should try to remove duplicated
+     * text over the same region.  This is needed for some
+     * PDFs that achieve bolding by re-writing the same
+     * text in the same area.  Note that this can
+     * slow down extraction substantially (PDFBOX-956) and
+     * sometimes remove characters that were not in fact
+     * duplicated (PDFBOX-1155).  By default this is disabled.
+     *
+     * @param v new value, applied to the default configuration
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    @Deprecated
+    public void setSuppressDuplicateOverlappingText(boolean v) {
+        defaultConfig.setSuppressDuplicateOverlappingText(v);
+    }
+
+    /**
+     * Returns the sort-by-position setting of the default configuration.
+     *
+     * @see #setSortByPosition(boolean)
+     * @deprecated use {@link #getPDFParserConfig()}
+     */
+    @Deprecated
+    public boolean getSortByPosition() {
+        return defaultConfig.getSortByPosition();
+    }
+
+    /**
+     * If true, sort text tokens by their x/y position
+     * before extracting text.  This may be necessary for
+     * some PDFs (if the text tokens are not rendered "in
+     * order"), while for other PDFs it can produce the
+     * wrong result (for example if there are 2 columns,
+     * the text will be interleaved).  Default is false.
+     *
+     * @param v new value, applied to the default configuration
+     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
+     */
+    @Deprecated
+    public void setSortByPosition(boolean v) {
+        defaultConfig.setSortByPosition(v);
+    }
+
+
+    //can return null!
+    private Document loadDOM(PDMetadata pdMetadata, ParseContext context) {
+        if (pdMetadata == null) {
+            return null;
+        }
+        try (InputStream is = pdMetadata.exportXMPMetadata()) {
+            DocumentBuilder documentBuilder = context.getDocumentBuilder();
+            documentBuilder.setErrorHandler((ErrorHandler)null);
+            return documentBuilder.parse(is);
+        } catch (IOException|SAXException|TikaException e) {
+            //swallow
+        }
+        return null;
+
+    }
+
+}


http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
new file mode 100644
index 0000000..296b191
--- /dev/null
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -0,0 +1,614 @@
+package org.apache.tika.parser.pdf;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Properties;
+
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+/**
+ * Config for PDFParser.
+ * <p/>
+ * This allows parameters to be set programmatically:
+ * <ol>
+ * <li>Calls to PDFParser, i.e. 
parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
+ * <li>Constructor of PDFParser</li>
+ * <li>Passing to PDFParser through a ParseContext: 
context.set(PDFParserConfig.class, config);</li>
+ * </ol>
+ * <p/>
+ * Parameters can also be set by modifying the PDFParserConfig.properties file,
+ * which lives in the expected places, in trunk:
+ * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
+ * <p/>
+ * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
+ * org/apache/tika/parser/pdf
+ */
+public class PDFParserConfig implements Serializable {
+
+    public enum OCR_STRATEGY {
+        NO_OCR,
+        OCR_ONLY,
+        OCR_AND_TEXT_EXTRACTION;
+
+        private static OCR_STRATEGY parse(String s) {
+            if (s == null) {
+                return NO_OCR;
+            } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) {
+                return NO_OCR;
+            } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) {
+                return OCR_ONLY;
+            } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) {
+                return OCR_AND_TEXT_EXTRACTION;
+            }
+            //default -- no ocr
+            return NO_OCR;
+        }
+    }
+
+    private static final long serialVersionUID = 6492570218190936986L;
+
+    // True if we let PDFBox "guess" where spaces should go:
+    private boolean enableAutoSpace = true;
+
+    // True if we let PDFBox remove duplicate overlapping text:
+    private boolean suppressDuplicateOverlappingText;
+
+    // True if we extract annotation text ourselves
+    // (workaround for PDFBOX-1143):
+    private boolean extractAnnotationText = true;
+
+    // True if we should sort text tokens by position
+    // (necessary for some PDFs, but messes up other PDFs):
+    private boolean sortByPosition = false;
+
+    //True if acroform content should be extracted
+    private boolean extractAcroFormContent = true;
+
+    //True if inline PDXImage objects should be extracted
+    private boolean extractInlineImages = false;
+
+    //True if inline images (as identified by their object id within
+    //a pdf file) should only be extracted once.
+    private boolean extractUniqueInlineImagesOnly = true;
+
+    //The character width-based tolerance value used to estimate where spaces 
in text should be added
+    private Float averageCharTolerance;
+
+    //The space width-based tolerance value used to estimate where spaces in 
text should be added
+    private Float spacingTolerance;
+
+    //If the PDF has an XFA element, process only that and skip extracting
+    //content from elsewhere in the document.
+    private boolean ifXFAExtractOnlyXFA = false;
+
+    private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;
+
+    private int ocrDPI = 200;
+    private ImageType ocrImageType = ImageType.GRAY;
+    private String ocrImageFormatName = "png";
+
+    private AccessChecker accessChecker;
+
+    //The PDFParser can throw IOExceptions if there is a problem
+    //with a streams.  If this is set to true, Tika's
+    //parser catches these exceptions, reports them in the metadata
+    //and then throws the first stored exception after the parse has completed.
+    private boolean isCatchIntermediateIOExceptions = true;
+
+    public PDFParserConfig() {
+        init(this.getClass().getResourceAsStream("PDFParser.properties"));
+    }
+
+    /**
+     * Loads properties from InputStream and then tries to close InputStream.
+     * If there is an IOException, this silently swallows the exception
+     * and goes back to the default.
+     *
+     * @param is
+     */
+    public PDFParserConfig(InputStream is) {
+        init(is);
+    }
+
+    //initializes object and then tries to close inputstream
+    private void init(InputStream is) {
+
+        if (is == null) {
+            return;
+        }
+        Properties props = new Properties();
+        try {
+            props.load(is);
+        } catch (IOException e) {
+        } finally {
+            if (is != null) {
+                try {
+                    is.close();
+                } catch (IOException e) {
+                    //swallow
+                }
+            }
+        }
+        setEnableAutoSpace(
+                getBooleanProp(props.getProperty("enableAutoSpace"), 
getEnableAutoSpace()));
+        setSuppressDuplicateOverlappingText(
+                
getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
+                        getSuppressDuplicateOverlappingText()));
+        setExtractAnnotationText(
+                getBooleanProp(props.getProperty("extractAnnotationText"),
+                        getExtractAnnotationText()));
+        setSortByPosition(
+                getBooleanProp(props.getProperty("sortByPosition"),
+                        getSortByPosition()));
+        setExtractAcroFormContent(
+                getBooleanProp(props.getProperty("extractAcroFormContent"),
+                        getExtractAcroFormContent()));
+        setExtractInlineImages(
+                getBooleanProp(props.getProperty("extractInlineImages"),
+                        getExtractInlineImages()));
+        setExtractUniqueInlineImagesOnly(
+                
getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
+                        getExtractUniqueInlineImagesOnly()));
+
+        setIfXFAExtractOnlyXFA(
+            getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
+                getIfXFAExtractOnlyXFA()));
+
+        setCatchIntermediateIOExceptions(
+                
getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
+                isCatchIntermediateIOExceptions()));
+
+        setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
+
+        setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
+
+        setOCRImageFormatName(props.getProperty("ocrImageFormatName"));
+
+        setOCRImageType(parseImageType(props.getProperty("ocrImageType")));
+
+
+        boolean checkExtractAccessPermission = 
getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
+        boolean allowExtractionForAccessibility = 
getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
+
+        if (checkExtractAccessPermission == false) {
+            //silently ignore the crazy configuration of 
checkExtractAccessPermission = false,
+            //but allowExtractionForAccessibility=false
+            accessChecker = new AccessChecker();
+        } else {
+            accessChecker = new AccessChecker(allowExtractionForAccessibility);
+        }
+    }
+
+    /**
+     * Configures the given pdf2XHTML.
+     *
+     * @param pdf2XHTML
+     */
+    public void configure(PDF2XHTML pdf2XHTML) {
+        pdf2XHTML.setSortByPosition(getSortByPosition());
+        if (getEnableAutoSpace()) {
+            pdf2XHTML.setWordSeparator(" ");
+        } else {
+            pdf2XHTML.setWordSeparator("");
+        }
+        if (getAverageCharTolerance() != null) {
+            pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance());
+        }
+        if (getSpacingTolerance() != null) {
+            pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
+        }
+        
pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
+    }
+
+    /**
+     * @see #setExtractAcroFormContent(boolean)
+     */
+    public boolean getExtractAcroFormContent() {
+        return extractAcroFormContent;
+    }
+
+    /**
+     * If true (the default), extract content from AcroForms
+     * at the end of the document.  If an XFA is found,
+     * try to process that, otherwise, process the AcroForm.
+     *
+     * @param extractAcroFormContent
+     */
+    public void setExtractAcroFormContent(boolean extractAcroFormContent) {
+        this.extractAcroFormContent = extractAcroFormContent;
+
+    }
+
+    /**
+     * @see #setIfXFAExtractOnlyXFA(boolean)
+     * @return how to handle XFA data if it exists
+     */
+    public boolean getIfXFAExtractOnlyXFA() {
+        return ifXFAExtractOnlyXFA;
+    }
+
+    /**
+     * If false (the default), extract content from the full PDF
+     * as well as the XFA form.  This will likely lead to some duplicative
+     * content.
+     *
+     * @param ifXFAExtractOnlyXFA
+     */
+    public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
+        this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
+    }
+
+
+    /**
+     * @see #setExtractInlineImages(boolean)
+     */
+    public boolean getExtractInlineImages() {
+        return extractInlineImages;
+    }
+
+    /**
+     * If true, extract inline embedded OBXImages.
+     * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
+     * thousands of embedded images totaling > 2.5 GB.  Also, at least as of 
PDFBox 1.8.5,
+     * there can be surprisingly large memory consumption and/or out of memory 
errors.
+     * Set to <code>true</code> with caution.
+     * <p/>
+     * The default is <code>false</code>.
+     * <p/>
+     * See also: {@see #setExtractUniqueInlineImagesOnly(boolean)};
+     *
+     * @param extractInlineImages
+     */
+    public void setExtractInlineImages(boolean extractInlineImages) {
+        this.extractInlineImages = extractInlineImages;
+    }
+
+    /**
+     * @see #setExtractUniqueInlineImagesOnly(boolean)
+     */
+    public boolean getExtractUniqueInlineImagesOnly() {
+        return extractUniqueInlineImagesOnly;
+    }
+
+    /**
+     * Multiple pages within a PDF file might refer to the same underlying 
image.
+     * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, 
the
+     * parser will call the EmbeddedExtractor each time the image appears on a 
page.
+     * This might be desired for some use cases.  However, to avoid 
duplication of
+     * extracted images, set this to <code>true</code>.  The default is 
<code>true</code>.
+     * <p/>
+     * Note that uniqueness is determined only by the underlying PDF COSObject 
id, not by
+     * file hash or similar equality metric.
+     * If the PDF actually contains multiple copies of the same image
+     * -- all with different object ids -- then all images will be extracted.
+     * <p/>
+     * For this parameter to have any effect, {@link #extractInlineImages} 
must be
+     * set to <code>true</code>.
+     * <p>
+     * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the 
setting
+     * of this parameter, the extractor will only pull out one copy of each 
image per
+     * page.  This parameter tries to capture uniqueness across the entire 
document.
+     *
+     * @param extractUniqueInlineImagesOnly
+     */
+    public void setExtractUniqueInlineImagesOnly(boolean 
extractUniqueInlineImagesOnly) {
+        this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
+
+    }
+
+    /**
+     * @see #setEnableAutoSpace(boolean)
+     */
+    public boolean getEnableAutoSpace() {
+        return enableAutoSpace;
+    }
+
+    /**
+     * If true (the default), the parser should estimate
+     * where spaces should be inserted between words.  For
+     * many PDFs this is necessary as they do not include
+     * explicit whitespace characters.
+     */
+    public void setEnableAutoSpace(boolean enableAutoSpace) {
+        this.enableAutoSpace = enableAutoSpace;
+    }
+
+    /**
+     * @see #setSuppressDuplicateOverlappingText(boolean)
+     */
+    public boolean getSuppressDuplicateOverlappingText() {
+        return suppressDuplicateOverlappingText;
+    }
+
+    /**
+     * If true, the parser should try to remove duplicated
+     * text over the same region.  This is needed for some
+     * PDFs that achieve bolding by re-writing the same
+     * text in the same area.  Note that this can
+     * slow down extraction substantially (PDFBOX-956) and
+     * sometimes remove characters that were not in fact
+     * duplicated (PDFBOX-1155).  By default this is disabled.
+     */
+    public void setSuppressDuplicateOverlappingText(
+            boolean suppressDuplicateOverlappingText) {
+        this.suppressDuplicateOverlappingText = 
suppressDuplicateOverlappingText;
+    }
+
+    /**
+     * @see #setExtractAnnotationText(boolean)
+     */
+    public boolean getExtractAnnotationText() {
+        return extractAnnotationText;
+    }
+
+    /**
+     * If true (the default), text in annotations will be
+     * extracted.
+     */
+    public void setExtractAnnotationText(boolean extractAnnotationText) {
+        this.extractAnnotationText = extractAnnotationText;
+    }
+
+    /**
+     * @see #setSortByPosition(boolean)
+     */
+    public boolean getSortByPosition() {
+        return sortByPosition;
+    }
+
+    /**
+     * If true, sort text tokens by their x/y position
+     * before extracting text.  This may be necessary for
+     * some PDFs (if the text tokens are not rendered "in
+     * order"), while for other PDFs it can produce the
+     * wrong result (for example if there are 2 columns,
+     * the text will be interleaved).  Default is false.
+     */
+    public void setSortByPosition(boolean sortByPosition) {
+        this.sortByPosition = sortByPosition;
+    }
+
+    /**
+     * @see #setAverageCharTolerance(Float)
+     */
+    public Float getAverageCharTolerance() {
+        return averageCharTolerance;
+    }
+
+    /**
+     * See {@link PDFTextStripper#setAverageCharTolerance(float)}
+     */
+    public void setAverageCharTolerance(Float averageCharTolerance) {
+        this.averageCharTolerance = averageCharTolerance;
+    }
+
+    /**
+     * @see #setSpacingTolerance(Float)
+     */
+    public Float getSpacingTolerance() {
+        return spacingTolerance;
+    }
+
+    /**
+     * See {@link PDFTextStripper#setSpacingTolerance(float)}
+     */
+    public void setSpacingTolerance(Float spacingTolerance) {
+        this.spacingTolerance = spacingTolerance;
+    }
+
+    public AccessChecker getAccessChecker() {
+        return accessChecker;
+    }
+
+    public void setAccessChecker(AccessChecker accessChecker) {
+        this.accessChecker = accessChecker;
+    }
+
+    /**
+     * See {@link #setCatchIntermediateIOExceptions(boolean)}
+     * @return whether or not to catch IOExceptions
+     */
+    public boolean isCatchIntermediateIOExceptions() {
+        return isCatchIntermediateIOExceptions;
+    }
+
+    /**
+     * The PDFBox parser will throw an IOException if there is
+     * a problem with a stream.  If this is set to <code>true</code>,
+     * Tika's PDFParser will catch these exceptions and try to parse
+     * the rest of the document.  After the parse is completed,
+     * Tika's PDFParser will throw the first caught exception.
+     * @param catchIntermediateIOExceptions
+     */
+    public void setCatchIntermediateIOExceptions(boolean 
catchIntermediateIOExceptions) {
+        isCatchIntermediateIOExceptions = catchIntermediateIOExceptions;
+    }
+
+    /**
+     * Which strategy to use for OCR
+     * @param ocrStrategy
+     */
+    public void setOCRStrategy(OCR_STRATEGY ocrStrategy) {
+        this.ocrStrategy = ocrStrategy;
+    }
+
+    /**
+     *
+     * @return strategy to use for OCR
+     */
+    public OCR_STRATEGY getOCRStrategy() {
+        return ocrStrategy;
+    }
+
+    private boolean getBooleanProp(String p, boolean defaultMissing) {
+        if (p == null) {
+            return defaultMissing;
+        }
+        if (p.toLowerCase(Locale.ROOT).equals("true")) {
+            return true;
+        } else if (p.toLowerCase(Locale.ROOT).equals("false")) {
+            return false;
+        } else {
+            return defaultMissing;
+        }
+    }
+    //throws NumberFormatException if there's a non-null unparseable
+    //string passed in
+    private int getIntProp(String p, int defaultMissing) {
+        if (p == null) {
+            return defaultMissing;
+        }
+
+        return Integer.parseInt(p);
+    }
+
+    /**
+     * String representation of the image format used to render
+     * the page image for OCR (examples: png, tiff, jpeg)
+     * @return
+     */
+    public String getOCRImageFormatName() {
+        return ocrImageFormatName;
+    }
+
+    /**
+     * @see #getOCRImageFormatName()
+     *
+     * @param ocrImageFormatName name of image format used to render
+     *                           page image
+     */
+    public void setOCRImageFormatName(String ocrImageFormatName) {
+        this.ocrImageFormatName = ocrImageFormatName;
+    }
+
+    /**
+     * Image type used to render the page image for OCR.
+     * @see #setOCRImageType(ImageType)
+     * @return image type
+     */
+    public ImageType getOCRImageType() {
+        return ocrImageType;
+    }
+
+    /**
+     * Image type used to render the page image for OCR.
+     * @param ocrImageType
+     */
+    public void setOCRImageType(ImageType ocrImageType) {
+        this.ocrImageType = ocrImageType;
+    }
+
+    /**
+     * Dots per inch used to render the page image for OCR
+     * @return dots per inch
+     */
+    public int getOCRDPI() {
+        return ocrDPI;
+    }
+
+    /**
+     * Dots per inche used to render the page image for OCR
+     * @param ocrDPI
+     */
+    public void setOCRDPI(int ocrDPI) {
+        this.ocrDPI = ocrDPI;
+    }
+
+    private ImageType parseImageType(String ocrImageType) {
+        for (ImageType t : ImageType.values()) {
+            if (ocrImageType.equalsIgnoreCase(t.toString())) {
+                return t;
+            }
+        }
+        return null;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (!(o instanceof PDFParserConfig)) return false;
+
+        PDFParserConfig config = (PDFParserConfig) o;
+
+        if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false;
+        if (getSuppressDuplicateOverlappingText() != 
config.getSuppressDuplicateOverlappingText()) return false;
+        if (getExtractAnnotationText() != config.getExtractAnnotationText()) 
return false;
+        if (getSortByPosition() != config.getSortByPosition()) return false;
+        if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) 
return false;
+        if (getExtractInlineImages() != config.getExtractInlineImages()) 
return false;
+        if (getExtractUniqueInlineImagesOnly() != 
config.getExtractUniqueInlineImagesOnly()) return false;
+        if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) 
return false;
+        if (getOCRDPI() != config.getOCRDPI()) return false;
+        if (isCatchIntermediateIOExceptions() != 
config.isCatchIntermediateIOExceptions()) return false;
+        if 
(!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return 
false;
+        if (!getSpacingTolerance().equals(config.getSpacingTolerance())) 
return false;
+        if (!getOCRStrategy().equals(config.getOCRStrategy())) return false;
+        if (getOCRImageType() != config.getOCRImageType()) return false;
+        if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) 
return false;
+        return getAccessChecker().equals(config.getAccessChecker());
+
+    }
+
+    @Override
+    public int hashCode() {
+        int result = (getEnableAutoSpace() ? 1 : 0);
+        result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
+        result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
+        result = 31 * result + (getSortByPosition() ? 1 : 0);
+        result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
+        result = 31 * result + (getExtractInlineImages() ? 1 : 0);
+        result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
+        result = 31 * result + getAverageCharTolerance().hashCode();
+        result = 31 * result + getSpacingTolerance().hashCode();
+        result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
+        result = 31 * result + ocrStrategy.hashCode();
+        result = 31 * result + getOCRDPI();
+        result = 31 * result + getOCRImageType().hashCode();
+        result = 31 * result + getOCRImageFormatName().hashCode();
+        result = 31 * result + getAccessChecker().hashCode();
+        result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
+        return result;
+    }
+
+    @Override
+    public String toString() {
+        return "PDFParserConfig{" +
+                "enableAutoSpace=" + enableAutoSpace +
+                ", suppressDuplicateOverlappingText=" + 
suppressDuplicateOverlappingText +
+                ", extractAnnotationText=" + extractAnnotationText +
+                ", sortByPosition=" + sortByPosition +
+                ", extractAcroFormContent=" + extractAcroFormContent +
+                ", extractInlineImages=" + extractInlineImages +
+                ", extractUniqueInlineImagesOnly=" + 
extractUniqueInlineImagesOnly +
+                ", averageCharTolerance=" + averageCharTolerance +
+                ", spacingTolerance=" + spacingTolerance +
+                ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
+                ", ocrStrategy=" + ocrStrategy +
+                ", ocrDPI=" + ocrDPI +
+                ", ocrImageType=" + ocrImageType +
+                ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
+                ", accessChecker=" + accessChecker +
+                ", isCatchIntermediateIOExceptions=" + 
isCatchIntermediateIOExceptions +
+                '}';
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
new file mode 100644
index 0000000..d3c34dd
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/XFAExtractor.java
@@ -0,0 +1,304 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import javax.xml.namespace.QName;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+/**
+ * This class offers an initial capability to
+ * scrape text-containing elements out of XFA, and
+ * it tries to link fields with values.
+ * <p>
+ * Some areas for improvement:
+ * <ol>
+ *     <li>convert this to 2 lines of XPath</li>
+ *     <li>handle metadata stored in &lt;desc&gt; section (govdocs1: 754282.pdf, 982106.pdf)</li>
+ *     <li>handle pdf metadata (access permissions, etc.) in &lt;pdf&gt; element</li>
+ *     <li>extract different types of uris as metadata</li>
+ *     <li>add extraction of &lt;image&gt; data (govdocs1: 754282.pdf)</li>
+ *     <li>add computation of traversal order for fields</li>
+ *     <li>figure out when text extracted from xfa fields is duplicative of that
+ *     extracted from the rest of the pdf...and do this efficiently and quickly</li>
+ *     <li>avoid duplication with &lt;speak&gt; and &lt;tooltip&gt; elements</li>
+ * </ol>
+ */
+class XFAExtractor {
+
+    //matches any namespace URI that starts with the xfa-template schema prefix,
+    //regardless of the version suffix
+    private static final Pattern XFA_TEMPLATE_ANY_VERSION =
+            Pattern.compile("^http://www.xfa.org/schema/xfa-template");
+    //local names of the elements whose text content is scraped
+    private static final Pattern TEXT_PATTERN =
+            Pattern.compile("^(speak|text|contents-richtext|toolTip|exData)$");
+
+    private static final String XFA_DATA_NS = "http://www.xfa.org/schema/xfa-data/1.0/";
+
+    private static final String FIELD_LN = "field";
+    private static final QName XFA_DATA = new QName(XFA_DATA_NS, "data");
+
+    //per-instance matchers that are reset and reused for every element
+    private final Matcher xfaTemplateMatcher;//namespace any version
+    private final Matcher textMatcher;
+
+    /**
+     * Creates the two reusable matchers on an empty input; callers reset
+     * them with the actual text before each use.
+     */
+    XFAExtractor() {
+        xfaTemplateMatcher = XFA_TEMPLATE_ANY_VERSION.matcher("");
+        textMatcher = TEXT_PATTERN.matcher("");
+    }
+
+    /**
+     * Streams through the XFA xml and writes what it finds to the handler,
+     * all wrapped in a &lt;div class="xfa_content"&gt;.  Text-bearing elements are
+     * written as they are encountered; field definitions and data values are
+     * cached along the way and, if any field was found, written at the end as an
+     * ordered list inside a &lt;div class="xfa_form"&gt;, one item per field in the
+     * order the fields were encountered.
+     *
+     * @param xfaIs   stream containing the XFA xml
+     * @param xhtml   handler that receives the extracted content
+     * @param m       metadata; not used by this method
+     * @param context supplies the XMLInputFactory used to read the stream
+     * @throws XMLStreamException if the xml cannot be read
+     * @throws SAXException if the handler fails
+     */
+    void extract(InputStream xfaIs, XHTMLContentHandler xhtml, Metadata m, ParseContext context)
+            throws XMLStreamException, SAXException {
+        xhtml.startElement("div", "class", "xfa_content");
+
+        Map<String, String> pdfObjRToValues = new HashMap<>();
+
+        //for now, store and dump the fields in insertion order
+        Map<String, XFAField> namedFields = new LinkedHashMap<>();
+
+        //The strategy is to cache the fields in namedFields
+        //and cache the values in pdfObjRToValues while
+        //handling the text etc along the way.
+        //
+        //As a final step, dump the merged fields and the values.
+
+        XMLStreamReader reader = context.getXMLInputFactory().createXMLStreamReader(xfaIs);
+        while (reader.hasNext()) {
+            if (reader.next() != XMLStreamConstants.START_ELEMENT) {
+                continue;
+            }
+            QName name = reader.getName();
+            String localName = name.getLocalPart();
+            if (xfaTemplateMatcher.reset(name.getNamespaceURI()).find() &&
+                    FIELD_LN.equals(localName)) {
+                handleField(reader, namedFields);
+            } else if (XFA_DATA.equals(name)) {//full qname match is important!
+                loadData(reader, pdfObjRToValues);
+            } else if (textMatcher.reset(localName).find()) {
+                scrapeTextUntil(reader, xhtml, name);
+            }
+        }
+
+        if (namedFields.isEmpty()) {
+            //close the xfa_content div; the element name is "div", not its class
+            xhtml.endElement("div");
+            return;
+        }
+        //now dump fields and values
+        xhtml.startElement("div", "class", "xfa_form");
+        xhtml.startElement("ol");
+        StringBuilder sb = new StringBuilder();
+        for (Map.Entry<String, XFAField> e : namedFields.entrySet()) {
+            String fieldName = e.getKey();
+            XFAField field = e.getValue();
+            String fieldValue = pdfObjRToValues.get(fieldName);
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "fieldName", "fieldName", "CDATA", fieldName);
+
+            //prefer the tooltip as the display name when there is a non-blank one
+            String displayFieldName = (field.toolTip == null ||
+                    field.toolTip.trim().length() == 0) ? fieldName : field.toolTip;
+
+            sb.append(displayFieldName).append(": ");
+            if (fieldValue != null) {
+                sb.append(fieldValue);
+            }
+
+            xhtml.startElement("li", attrs);
+            xhtml.characters(sb.toString());
+            xhtml.endElement("li");
+            sb.setLength(0);
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("div");//xfa_form
+        xhtml.endElement("div");//xfa_content
+    }
+
+    /**
+     * Collects character and CDATA content until the end tag matching
+     * <code>endElement</code> is reached (or the reader runs out of events).
+     * Whenever some other element with local name "p" closes, the text gathered
+     * so far is written as a &lt;p&gt; and the buffer is cleared; whatever
+     * non-blank text is left at the end is written as a final &lt;p&gt;.
+     */
+    private void scrapeTextUntil(XMLStreamReader reader, XHTMLContentHandler xhtml,
+                                 QName endElement) throws XMLStreamException, SAXException {
+        StringBuilder text = new StringBuilder();
+        boolean reachedEnd = false;
+        while (reader.hasNext() && !reachedEnd) {
+            int event = reader.next();
+            if (event == XMLStreamConstants.CHARACTERS || event == XMLStreamConstants.CDATA) {
+                int offset = reader.getTextStart();
+                int count = reader.getTextLength();
+                text.append(reader.getTextCharacters(), offset, count);
+            } else if (event == XMLStreamConstants.END_ELEMENT) {
+                if (reader.getName().equals(endElement)) {
+                    reachedEnd = true;
+                } else if ("p".equals(reader.getName().getLocalPart())) {
+                    xhtml.element("p", text.toString());
+                    text.setLength(0);
+                }
+            }
+            //start tags and every other event type are ignored
+        }
+        String leftover = text.toString();
+        if (leftover.trim().length() > 0) {
+            xhtml.element("p", leftover);
+        }
+    }
+
+
+    /**
+     * Collects character and CDATA content until the end tag matching
+     * <code>endElement</code> is reached (or the reader runs out of events).
+     * A newline is appended whenever some other element with local name "p" closes.
+     *
+     * @return the collected text; never null
+     */
+    private String scrapeTextUntil(XMLStreamReader reader, QName endElement) throws XMLStreamException {
+        StringBuilder text = new StringBuilder();
+        boolean reachedEnd = false;
+        while (reader.hasNext() && !reachedEnd) {
+            int event = reader.next();
+            if (event == XMLStreamConstants.CHARACTERS || event == XMLStreamConstants.CDATA) {
+                int offset = reader.getTextStart();
+                int count = reader.getTextLength();
+                text.append(reader.getTextCharacters(), offset, count);
+            } else if (event == XMLStreamConstants.END_ELEMENT) {
+                if (reader.getName().equals(endElement)) {
+                    reachedEnd = true;
+                } else if ("p".equals(reader.getName().getLocalPart())) {
+                    text.append("\n");
+                }
+            }
+            //start tags and every other event type are ignored
+        }
+        return text.toString();
+    }
+
+    /**
+     * Reads events until an END_ELEMENT equal to XFA_DATA is seen (or the
+     * reader runs out). For every START_ELEMENT other than "topmostSubform",
+     * the text up to that element's closing tag is scraped and stored in
+     * pdfObjRToValues under the element's local name.
+     *
+     * NOTE(review): despite the parameter name, the keys written here are
+     * element local names -- confirm against the caller.
+     *
+     * @param reader stream reader to consume
+     * @param pdfObjRToValues map that receives local-name to text entries
+     * @throws XMLStreamException declared because reading from the stream may throw it
+     */ private void loadData(XMLStreamReader reader, Map<String, String> 
pdfObjRToValues)
+            throws XMLStreamException {
+        //reader is at the "xfa:data" element
+        while (reader.hasNext()) {
+            switch (reader.next()) {
+                case (XMLStreamConstants.START_ELEMENT) :
+                    if ("topmostSubform".equals(reader.getLocalName())) {
+                        continue; // continues the while loop: the wrapper is skipped, its children are still read
+                    }
+                    String value = scrapeTextUntil(reader, reader.getName());
+                    pdfObjRToValues.put(reader.getLocalName(), value); // local name is read after the scrape has advanced the reader
+                    break;
+                case (XMLStreamConstants.END_ELEMENT) :
+                    if (XFA_DATA.equals(reader.getName())) {
+                        return;
+                    }
+                    break;
+
+            }
+        }
+    }
+
+    /**
+     * Reads one field element and records it in fields. The field name comes
+     * from the element's "name" attribute; the tool tip is the scraped text of
+     * a "toolTip" child; the PDF object reference is the data of a "PDF_OBJR"
+     * processing instruction. Reading stops at the END_ELEMENT whose namespace
+     * URI is matched by xfaTemplateMatcher and whose local part equals
+     * FIELD_LN, at which point an XFAField is put into fields under fieldName.
+     *
+     * NOTE(review): findFirstAttributeValue returns "" rather than null when
+     * the attribute is missing, so the fieldName != null check always passes.
+     *
+     * @param reader stream reader to consume
+     * @param fields map that receives the parsed field, keyed by field name
+     * @throws XMLStreamException declared because reading from the stream may throw it
+     */ private void handleField(XMLStreamReader reader, Map<String, XFAField> 
fields) throws XMLStreamException {
+        //reader is set to the field element
+        String fieldName = findFirstAttributeValue(reader, "name");
+        String pdfObjRef = "";
+        String toolTip = "";
+        while (reader.hasNext()) {
+            switch (reader.next()) {
+                case XMLStreamConstants.START_ELEMENT :
+                    if ("toolTip".equals(reader.getName().getLocalPart())) {
+                        toolTip = scrapeTextUntil(reader, reader.getName());
+                    }
+                    // TODO: add checkbutton, etc., e.g. if (reader.getName().equals(...))
+                    break;
+                case XMLStreamConstants.END_ELEMENT :
+                    if 
(xfaTemplateMatcher.reset(reader.getName().getNamespaceURI()).find() &&
+                            FIELD_LN.equals(reader.getName().getLocalPart())) {
+                        if (fieldName != null) {
+                            fields.put(fieldName, new XFAField(fieldName, 
toolTip, pdfObjRef));
+                        }
+                        return;
+                    }
+                    break;
+                case XMLStreamConstants.PROCESSING_INSTRUCTION:
+                    if ("PDF_OBJR".equals(reader.getPITarget())) {
+                        pdfObjRef = reader.getPIData(); // a later PDF_OBJR instruction overwrites an earlier one
+                    }
+                    break;
+
+            }
+        }
+    }
+
+    /**
+     * Returns the value of the first attribute on the reader's current event
+     * whose local name equals name. Attribute namespaces are not compared.
+     *
+     * @param reader stream reader whose current attributes are scanned
+     * @param name attribute local name to look for
+     * @return the attribute value, or "" when no attribute matches
+     */ private String findFirstAttributeValue(XMLStreamReader reader, String 
name) {
+        for (int i = 0; i < reader.getAttributeCount(); i++) {
+            String n = reader.getAttributeLocalName(i);
+            if (name.equals(n)) {
+                return reader.getAttributeValue(i);
+            }
+        }
+        return ""; // empty string, not null, signals "not found"
+    }
+
+    /**
+     * Plain holder for one XFA form field: its name, tool tip, PDF object
+     * reference and value. Declared as a non-static, package-private inner
+     * class with package-private mutable fields.
+     */ class XFAField {
+        String fieldName;
+        String toolTip;
+        String pdfObjRef;
+        String value; // not set by the constructor; stays null until assigned elsewhere
+
+        public XFAField(String fieldName, String toolTip, String pdfObjRef) {
+            this.fieldName = fieldName;
+            this.toolTip = toolTip;
+            this.pdfObjRef = pdfObjRef;
+        }
+
+        @Override
+        public String toString() { // debugging aid: prints all four fields, each wrapped in single quotes
+            return "XFAField{" +
+                    "fieldName='" + fieldName + '\'' +
+                    ", toolTip='" + toolTip + '\'' +
+                    ", pdfObjRef='" + pdfObjRef + '\'' +
+                    ", value='" + value + '\'' +
+                    '}';
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 3216233..08693b5 100644
--- 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -13,7 +13,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
-
+org.apache.tika.parser.pdf.PDFParser
 org.apache.tika.parser.font.AdobeFontMetricParser
 org.apache.tika.parser.font.TrueTypeParser
 org.apache.tika.parser.image.BPGParser
@@ -29,3 +29,4 @@ org.apache.tika.parser.mp3.Mp3Parser
 org.apache.tika.parser.mp4.MP4Parser
 org.apache.tika.parser.pot.PooledTimeSeriesParser
 org.apache.tika.parser.video.FLVParser
+

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
new file mode 100644
index 0000000..319e693
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties
@@ -0,0 +1,34 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+enableAutoSpace true
+extractAnnotationText true
+sortByPosition false
+suppressDuplicateOverlappingText       false
+extractAcroFormContent true
+extractInlineImages false
+extractUniqueInlineImagesOnly true
+checkExtractAccessPermission false
+allowExtractionForAccessibility true
+ifXFAExtractOnlyXFA false
+catchIntermediateIOExceptions true
+#options: no_ocr, ocr_only, ocr_and_text_extraction
+ocrStrategy no_ocr
+#dots per inch for the ocr rendering of the page image
+ocrDPI 200
+#if you request tif, make sure you have imageio jars on your classpath!
+ocrImageFormatName png
+#options: argb, binary, gray, rgb
+ocrImageType gray

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
 
b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
new file mode 100644
index 0000000..ef646ac
--- /dev/null
+++ 
b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.tika.exception.AccessPermissionException;
+import org.apache.tika.metadata.AccessPermissions;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PropertyTypeException;
+import org.junit.Test;
+
+/**
+ * Tests AccessChecker.check(Metadata) against metadata carrying the
+ * AccessPermissions.EXTRACT_CONTENT and EXTRACT_FOR_ACCESSIBILITY flags, for
+ * the no-arg checker and for checkers built with true and false. Also checks
+ * that Metadata rejects a second value for either permission property.
+ */ public class AccessCheckerTest {
+
+    @Test
+    public void testLegacy() throws AccessPermissionException { // the no-arg checker must not throw for any flag combination tried here
+
+        Metadata m = getMetadata(false, false);
+        //legacy behavior; don't bother checking
+        AccessChecker checker = new AccessChecker();
+        checker.check(m);
+        assertTrue("no exception", true); // always passes; a thrown exception is what would fail the test
+
+        m = getMetadata(false, true);
+        assertTrue("no exception", true); // NOTE(review): placed before check(m), unlike the first case; a no-op either way
+        checker.check(m);
+
+        m = getMetadata(true, true);
+        assertTrue("no exception", true);
+        checker.check(m);
+    }
+
+    @Test
+    public void testNoExtraction() { // a checker built with false must throw when EXTRACT_CONTENT is false
+
+        Metadata m = null;
+        //allow nothing
+        AccessChecker checker = new AccessChecker(false);
+        boolean ex = false;
+        try {
+            m = getMetadata(false, false);
+            checker.check(m);
+        } catch (AccessPermissionException e) {
+            ex = true;
+        }
+        assertTrue("correct exception with no extraction, no extract for 
accessibility", ex);
+        ex = false;
+        try {
+            //document allows extraction for accessibility
+            m = getMetadata(false, true);
+            checker.check(m);
+        } catch (AccessPermissionException e) {
+            //but application is not an accessibility application
+            ex = true;
+        }
+        assertTrue("correct exception with no extraction, no extract for 
accessibility", ex); // NOTE(review): message repeats the first case although this metadata allows extraction for accessibility
+    }
+
+    @Test
+    public void testExtractOnlyForAccessibility() throws 
AccessPermissionException {
+        Metadata m = getMetadata(false, true);
+        //allow accessibility
+        AccessChecker checker = new AccessChecker(true);
+        checker.check(m);
+        assertTrue("no exception", true);
+        boolean ex = false;
+        try {
+            m = getMetadata(false, false); // neither flag set: the check is expected to throw
+            checker.check(m);
+        } catch (AccessPermissionException e) {
+            ex = true;
+        }
+        assertTrue("correct exception", ex);
+    }
+
+    @Test
+    public void testCrazyExtractNotForAccessibility() throws 
AccessPermissionException {
+        Metadata m = getMetadata(true, false); // extraction allowed, accessibility extraction not allowed
+        //allow accessibility
+        AccessChecker checker = new AccessChecker(true);
+        checker.check(m);
+        assertTrue("no exception", true);
+
+        //don't extract for accessibility
+        checker = new AccessChecker(false);
+        //if extract content is allowed, the checker shouldn't
+        //check the value of extract for accessibility
+        checker.check(m);
+        assertTrue("no exception", true);
+
+    }
+
+    @Test
+    public void testCantAddMultiplesToMetadata() { // a second add() on either permission property must throw PropertyTypeException
+        Metadata m = new Metadata();
+        boolean ex = false;
+        m.add(AccessPermissions.EXTRACT_CONTENT, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_CONTENT, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+
+        m = new Metadata();
+        ex = false;
+        m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true");
+        try {
+            m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false");
+        } catch (PropertyTypeException e) {
+            ex = true;
+        }
+        assertTrue("can't add multiple values", ex);
+    }
+
+    private Metadata getMetadata(boolean allowExtraction, boolean 
allowExtractionForAccessibility) {
+        Metadata m = new Metadata(); // fresh metadata with both permission flags stored as "true"/"false" strings
+        m.set(AccessPermissions.EXTRACT_CONTENT, 
Boolean.toString(allowExtraction));
+        m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, 
Boolean.toString(allowExtractionForAccessibility));
+        return m;
+    }
+}

[4/5] tika git commit: TIKA-2059 - Merge multimedia and pdf parser modules and bundles

Reply via email to