[2/5] tika git commit: TIKA-2059 - Merge multimedia and pdf parser modules and bundles

bob Sun, 28 Aug 2016 09:30:33 -0700

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
deleted file mode 100644
index 775e590..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.pdf;
-
-import java.io.Serializable;
-
-import org.apache.tika.exception.AccessPermissionException;
-import org.apache.tika.metadata.AccessPermissions;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Checks whether or not a document allows extraction generally
- * or extraction for accessibility only.
- */
-public class AccessChecker implements Serializable {
-
-    private static final long serialVersionUID = 6492570218190936986L;
-
-    private final boolean needToCheck;
-    private final boolean allowAccessibility;
-
-    /**
-     * This constructs an {@link AccessChecker} that
-     * will not perform any checking and will always return without
-     * throwing an exception.
-     * <p/>
-     * This constructor is available to allow for Tika's legacy ( <= v1.7) behavior.
-     */
-    public AccessChecker() {
-        needToCheck = false;
-        allowAccessibility = true;
-    }
-
-    /**
-     * This constructs an {@link AccessChecker} that will check
-     * for whether or not content should be extracted from a document.
-     *
-     * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed
-     */
-    public AccessChecker(boolean allowExtractionForAccessibility) {
-        needToCheck = true;
-        this.allowAccessibility = allowExtractionForAccessibility;
-    }
-
-    /**
-     * Checks to see if a document's content should be extracted based
-     * on metadata values and the value of {@link #allowAccessibility} in the constructor.
-     *
-     * @param metadata
-     * @throws AccessPermissionException if access is not permitted
-     */
-    public void check(Metadata metadata) throws AccessPermissionException {
-        if (!needToCheck) {
-            return;
-        }
-        if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) {
-            if (allowAccessibility) {
-                if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) {
-                    return;
-                }
-                throw new AccessPermissionException("Content extraction for accessibility is not allowed.");
-            }
-            throw new AccessPermissionException("Content extraction is not allowed.");
-        }
-    }
-}


http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
deleted file mode 100644
index 3ad551d..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import java.io.IOException;
-import java.io.Writer;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.text.TextPosition;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-
-/**
- * Utility class that overrides the {@link PDFTextStripper} functionality
- * to integrate text extraction via OCR only.
- *
- */
-class OCR2XHTML extends AbstractPDF2XHTML {
-
-    private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
-                      PDFParserConfig config)
-            throws IOException {
-        super(document, handler, context, metadata, config);
-    }
-
-    /**
-     * Converts the given PDF document (and related metadata) to a stream
-     * of XHTML SAX events sent to the given content handler.
-     *
-     * @param document PDF document
-     * @param handler  SAX content handler
-     * @param metadata PDF metadata
-     * @throws SAXException  if the content handler fails to process SAX events
-     * @throws TikaException if there was an exception outside of per page processing
-     */
-    public static void process(
-            PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
-            PDFParserConfig config)
-            throws SAXException, TikaException {
-        OCR2XHTML ocr2XHTML = null;
-        try {
-            ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
-            ocr2XHTML.writeText(document, new Writer() {
-                @Override
-                public void write(char[] cbuf, int off, int len) {
-                }
-
-                @Override
-                public void flush() {
-                }
-
-                @Override
-                public void close() {
-                }
-            });
-        } catch (IOException e) {
-            if (e.getCause() instanceof SAXException) {
-                throw (SAXException) e.getCause();
-            } else {
-                throw new TikaException("Unable to extract PDF content", e);
-            }
-        }
-        if (ocr2XHTML.exceptions.size() > 0) {
-            //throw the first
-            throw new TikaException("Unable to extract all PDF content",
-                    ocr2XHTML.exceptions.get(0));
-        }
-    }
-
-    @Override
-    public void processPage(PDPage pdPage) throws IOException {
-        try {
-            startPage(pdPage);
-            doOCROnCurrentPage();
-            endPage(pdPage);
-        } catch (TikaException |SAXException e) {
-            throw new IOExceptionWithCause(e);
-        } catch (IOException e) {
-            handleCatchableIOE(e);
-        }
-    }
-
-    @Override
-    protected void writeString(String text) throws IOException {
-        //no-op
-    }
-
-    @Override
-    protected void writeCharacters(TextPosition text) throws IOException {
-        //no-op
-    }
-
-    @Override
-    protected void writeWordSeparator() throws IOException {
-        //no-op
-    }
-
-    @Override
-    protected void writeLineSeparator() throws IOException {
-        //no-op
-    }
-
-}
-

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
deleted file mode 100644
index ac9823e..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSStream;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
-import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
-import org.apache.pdfbox.text.PDFTextStripper;
-import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.tools.imageio.ImageIOUtil;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-/**
- * Utility class that overrides the {@link PDFTextStripper} functionality
- * to produce a semi-structured XHTML SAX events instead of a plain text
- * stream.
- */
-class PDF2XHTML extends AbstractPDF2XHTML {
-
-
-    private static final List<String> JPEG = Arrays.asList(
-            COSName.DCT_DECODE.getName(),
-            COSName.DCT_DECODE_ABBREVIATION.getName());
-
-    /**
-     * This keeps track of the pdf object ids for inline
-     * images that have been processed.
-     * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()
-     * is true, this will be checked before extracting an embedded image.
-     * The integer keeps track of the inlineImageCounter for that image.
-     * This integer is used to identify images in the markup.
-     *
-     * This is used across the document.  To avoid infinite recursion
-     * TIKA-1742, we're limiting the export to one image per page.
-     */
-    private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
-    private int inlineImageCounter = 0;
-    private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
-                      PDFParserConfig config)
-            throws IOException {
-        super(document, handler, context, metadata, config);
-    }
-
-    /**
-     * Converts the given PDF document (and related metadata) to a stream
-     * of XHTML SAX events sent to the given content handler.
-     *
-     * @param document PDF document
-     * @param handler  SAX content handler
-     * @param metadata PDF metadata
-     * @throws SAXException  if the content handler fails to process SAX events
-     * @throws TikaException if there was an exception outside of per page processing
-     */
-    public static void process(
-            PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
-            PDFParserConfig config)
-            throws SAXException, TikaException {
-        PDF2XHTML pdf2XHTML = null;
-        try {
-            // Extract text using a dummy Writer as we override the
-            // key methods to output to the given content
-            // handler.
-            pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
-
-            config.configure(pdf2XHTML);
-
-            pdf2XHTML.writeText(document, new Writer() {
-                @Override
-                public void write(char[] cbuf, int off, int len) {
-                }
-
-                @Override
-                public void flush() {
-                }
-
-                @Override
-                public void close() {
-                }
-            });
-
-        } catch (IOException e) {
-            if (e.getCause() instanceof SAXException) {
-                throw (SAXException) e.getCause();
-            } else {
-                throw new TikaException("Unable to extract PDF content", e);
-            }
-        }
-        if (pdf2XHTML.exceptions.size() > 0) {
-            //throw the first
-            throw new TikaException("Unable to extract all PDF content",
-                    pdf2XHTML.exceptions.get(0));
-        }
-    }
-
-
-    @Override
-    public void processPage(PDPage page) throws IOException {
-        try {
-            super.processPage(page);
-        } catch (IOException e) {
-            handleCatchableIOE(e);
-        }
-    }
-
-    @Override
-    protected void endPage(PDPage page) throws IOException {
-        try {
-            writeParagraphEnd();
-            try {
-                extractImages(page.getResources(), new HashSet<COSBase>());
-            } catch (IOException e) {
-                handleCatchableIOE(e);
-            }
-            super.endPage(page);
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to end a page", e);
-        } catch (IOException e) {
-            exceptions.add(e);
-        }
-    }
-
-    private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
-        if (resources == null || config.getExtractInlineImages() == false) {
-            return;
-        }
-
-        for (COSName name : resources.getXObjectNames()) {
-
-            PDXObject object = resources.getXObject(name);
-            if (object == null) {
-                continue;
-            }
-            COSStream cosStream = object.getCOSObject();
-            if (seenThisPage.contains(cosStream)) {
-                //avoid infinite recursion TIKA-1742
-                continue;
-            }
-            seenThisPage.add(cosStream);
-
-            if (object instanceof PDFormXObject) {
-                extractImages(((PDFormXObject) object).getResources(), seenThisPage);
-            } else if (object instanceof PDImageXObject) {
-
-                PDImageXObject image = (PDImageXObject) object;
-
-                Metadata metadata = new Metadata();
-                String extension = image.getSuffix();
-                if (extension == null) {
-                    metadata.set(Metadata.CONTENT_TYPE, "image/png");
-                    extension = "png";
-                } else if (extension.equals("jpg")) {
-                    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-                } else if (extension.equals("tiff")) {
-                    metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
-                    extension = "tif";
-                } else {
-                    //TODO: determine if we need to add more image types
-                    //throw new RuntimeException("EXTEN:" + extension);
-                }
-
-                Integer imageNumber = processedInlineImages.get(cosStream);
-                if (imageNumber == null) {
-                    imageNumber = inlineImageCounter++;
-                }
-                String fileName = "image" + imageNumber + "."+extension;
-                metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-
-                // Output the img tag
-                AttributesImpl attr = new AttributesImpl();
-                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
-                attr.addAttribute("", "alt", "alt", "CDATA", fileName);
-                xhtml.startElement("img", attr);
-                xhtml.endElement("img");
-
-                //Do we only want to process unique COSObject ids?
-                //If so, have we already processed this one?
-                if (config.getExtractUniqueInlineImagesOnly() == true) {
-                    if (processedInlineImages.containsKey(cosStream)) {
-                        continue;
-                    }
-                    processedInlineImages.put(cosStream, imageNumber);
-                }
-
-                metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                        TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
-
-                EmbeddedDocumentExtractor extractor =
-                        getEmbeddedDocumentExtractor();
-                if (extractor.shouldParseEmbedded(metadata)) {
-                    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
-                    try {
-                        //TODO: handle image.getMetadata()?
-                        writeToBuffer(image, extension, buffer);
-                        extractor.parseEmbedded(
-                                new ByteArrayInputStream(buffer.toByteArray()),
-                                new EmbeddedContentHandler(xhtml),
-                                metadata, false);
-                    } catch (IOException e) {
-                        handleCatchableIOE(e);
-                    }
-                }
-            }
-        }
-    }
-
-    //nearly directly copied from PDFBox ExtractImages
-    private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out)
-            throws IOException {
-
-        BufferedImage image = pdImage.getImage();
-        if (image != null) {
-            if ("jpg".equals(suffix)) {
-                String colorSpaceName = pdImage.getColorSpace().getName();
-                //TODO: figure out if we want directJPEG as a configuration
-                //previously: if (directJPeg || PDDeviceGray....
-                if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
-                        PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) {
-                    // RGB or Gray colorspace: get and write the unmodifiedJPEG stream
-                    InputStream data = pdImage.getStream().createInputStream(JPEG);
-                    org.apache.pdfbox.io.IOUtils.copy(data, out);
-                    org.apache.pdfbox.io.IOUtils.closeQuietly(data);
-                } else {
-                    // for CMYK and other "unusual" colorspaces, the JPEG will be converted
-                    ImageIOUtil.writeImage(image, suffix, out);
-                }
-            } else {
-                ImageIOUtil.writeImage(image, suffix, out);
-            }
-        }
-        out.flush();
-    }
-
-    @Override
-    protected void writeParagraphStart() throws IOException {
-        super.writeParagraphStart();
-        try {
-            xhtml.startElement("p");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to start a paragraph", e);
-        }
-    }
-
-    @Override
-    protected void writeParagraphEnd() throws IOException {
-        super.writeParagraphEnd();
-        try {
-            xhtml.endElement("p");
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause("Unable to end a paragraph", e);
-        }
-    }
-
-    @Override
-    protected void writeString(String text) throws IOException {
-        try {
-            xhtml.characters(text);
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause(
-                    "Unable to write a string: " + text, e);
-        }
-    }
-
-    @Override
-    protected void writeCharacters(TextPosition text) throws IOException {
-        try {
-            xhtml.characters(text.getUnicode());
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause(
-                    "Unable to write a character: " + text.getUnicode(), e);
-        }
-    }
-
-    @Override
-    protected void writeWordSeparator() throws IOException {
-        try {
-            xhtml.characters(getWordSeparator());
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause(
-                    "Unable to write a space character", e);
-        }
-    }
-
-    @Override
-    protected void writeLineSeparator() throws IOException {
-        try {
-            xhtml.newline();
-        } catch (SAXException e) {
-            throw new IOExceptionWithCause(
-                    "Unable to write a newline character", e);
-        }
-    }
-
-}
-

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
deleted file mode 100644
index 057f833..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.parser.pdf;
-
-import static java.nio.charset.StandardCharsets.ISO_8859_1;
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.pdfbox.cos.COSString;
-import org.apache.pdfbox.io.RandomAccessBuffer;
-import org.apache.pdfbox.io.RandomAccessRead;
-import org.apache.pdfbox.pdfparser.COSParser;
-
-/**
- * In fairly rare cases, a PDF's XMP will contain a string that
- * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and
- * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
- * <p>
- * This class can be used to decode those strings.
- * <p>
- * See TIKA-1678.  Many thanks to Andrew Jackson for raising this issue
- * and Tilman Hausherr for the solution.
- * <p>
- * As of this writing, we are only handling strings that start with
- * an encoded BOM.  Andrew Jackson found a handful of other examples (e.g.
- * this ISO-8859-7 string:
- * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
- * \\364\\347\\362 PRAKSIS \\363\\364\\357")
- * that we aren't currently handling.
- */
-class PDFEncodedStringDecoder {
-
-    private static final String[] PDF_ENCODING_BOMS = {
-            "\\376\\377", //UTF-16BE
-            "\\377\\376", //UTF-16LE
-            "\\357\\273\\277"//UTF-8
-    };
-
-    /**
-     * Does this string contain an octal-encoded UTF BOM?
-     * Call this statically to determine if you should bother creating a new parser to parse it.
-     * @param s
-     * @return
-     */
-    static boolean shouldDecode(String s) {
-        if (s == null || s.length() < 8) {
-            return false;
-        }
-        for (String BOM : PDF_ENCODING_BOMS) {
-            if (s.startsWith(BOM)) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * This assumes that {@link #shouldDecode(String)} has been called
-     * and has returned true.  If you run this on a non-octal encoded string,
-     * disaster will happen!
-     *
-     * @param value
-     * @return
-     */
-    String decode(String value) {
-        try {
-            byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1);
-            InputStream is = new ByteArrayInputStream(bytes);
-            COSStringParser p = new COSStringParser(new RandomAccessBuffer(is));
-            String parsed = p.myParseCOSString();
-            if (parsed != null) {
-                return parsed;
-            }
-        } catch (IOException e) {
-            //oh well, we tried.
-        }
-        //just return value if something went wrong
-        return value;
-    }
-
-    class COSStringParser extends COSParser {
-
-        COSStringParser(RandomAccessRead buffer) throws IOException {
-            super(buffer);
-        }
-
-        /**
-         *
-         * @return parsed string or null if something went wrong.
-         */
-        String myParseCOSString() {
-            try {
-                COSString cosString = parseCOSString();
-                if (cosString != null) {
-                    return cosString.getString();
-                }
-            } catch (IOException e) {
-            }
-            return null;
-        }
-    }
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
deleted file mode 100644
index f735f25..0000000
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ /dev/null
@@ -1,626 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.pdf;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.stream.XMLStreamException;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Calendar;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.jempbox.xmp.XMPMetadata;
-import org.apache.jempbox.xmp.XMPSchema;
-import org.apache.jempbox.xmp.XMPSchemaDublinCore;
-import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
-import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSString;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentInformation;
-import org.apache.pdfbox.pdmodel.common.PDMetadata;
-import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
-import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.AccessPermissions;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.ocr.TesseractOCRParser;
-import org.apache.tika.parser.xmp.JempboxExtractor;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.w3c.dom.Document;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.ErrorHandler;
-import org.xml.sax.SAXException;
-
-/**
- * PDF parser.
- * <p/>
- * This parser can also process encrypted PDF documents if the required
- * password is given as a part of the input metadata associated with a
- * document. If no password is given, then this parser will try decrypting
- * the document using the empty password that's often used with PDFs. If
- * the PDF contains any embedded documents (for example as part of a PDF
- * package) then this parser will use the {@link EmbeddedDocumentExtractor}
- * to handle them.
- * <p/>
- * As of Tika 1.6, it is possible to extract inline images with
- * the {@link EmbeddedDocumentExtractor} as if they were regular
- * attachments.  By default, this feature is turned off because of
- * the potentially enormous number and size of inline images.  To
- * turn this feature on, see
- * {@link PDFParserConfig#setExtractInlineImages(boolean)}.
- */
-public class PDFParser extends AbstractParser {
-
-
-    /**
-     * Metadata key for giving the document password to the parser.
-     *
-     * @since Apache Tika 0.5
-     * @deprecated Supply a {@link PasswordProvider} on the {@link 
ParseContext} instead
-     */
-    public static final String PASSWORD = 
"org.apache.tika.parser.pdf.password";
-    private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
-    /**
-     * Serial version UID
-     */
-    private static final long serialVersionUID = -752276948656079347L;
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            Collections.singleton(MEDIA_TYPE);
-    private PDFParserConfig defaultConfig = new PDFParserConfig();
-
-
-
-    /**
-     * Returns the media types handled by this parser.
-     *
-     * @param context ignored
-     * @return a singleton set containing {@code application/pdf}
-     */
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return SUPPORTED_TYPES;
-    }
-
-    /**
-     * Loads the PDF with PDFBox, records its metadata and, when a handler is
-     * supplied, writes the content as XHTML using XFA extraction, OCR or plain
-     * text extraction, depending on the effective {@link PDFParserConfig}.
-     *
-     * @param stream   the PDF; a {@link TikaInputStream} that already has a file
-     *                 is loaded from that file, any other stream is read through
-     *                 a {@link CloseShieldInputStream}
-     * @param handler  receives the XHTML events; if {@code null} only metadata
-     *                 is extracted
-     * @param metadata receives the extracted metadata and is consulted for the
-     *                 password
-     * @param context  may carry a {@link PDFParserConfig} and a
-     *                 {@link PasswordProvider}
-     * @throws EncryptedDocumentException if PDFBox reports an invalid password
-     */
-    public void parse(
-            InputStream stream, ContentHandler handler,
-            Metadata metadata, ParseContext context)
-            throws IOException, SAXException, TikaException {
-
-        //config from context, or default if not set via context
-        PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
-        PDDocument pdfDocument = null;
-        try {
-            // PDFBox can process entirely in memory, or can use a temp file
-            //  for unpacked / processed resources
-            // Decide which to do based on if we're reading from a file or not already
-            //TODO: make this configurable via MemoryUsageSetting
-            TikaInputStream tstream = TikaInputStream.cast(stream);
-            String password = getPassword(metadata, context);
-            if (tstream != null && tstream.hasFile()) {
-                // File based -- send file directly to PDFBox
-                pdfDocument = PDDocument.load(tstream.getPath().toFile(), password);
-            } else {
-                pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password);
-            }
-            metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted()));
-
-            metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
-            extractMetadata(pdfDocument, metadata, context);
-
-            // the checker works on the access permissions recorded just above
-            AccessChecker checker = localConfig.getAccessChecker();
-            checker.check(metadata);
-
-            if (handler == null) {
-                return;
-            }
-            if (shouldHandleXFAOnly(pdfDocument, localConfig)) {
-                handleXFAOnly(pdfDocument, handler, metadata, context);
-                return;
-            }
-            PDFParserConfig.OCR_STRATEGY strategy = localConfig.getOCRStrategy();
-            if (strategy == PDFParserConfig.OCR_STRATEGY.OCR_ONLY) {
-                metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
-                OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
-            } else {
-                if (strategy == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
-                    metadata.add("X-Parsed-By", TesseractOCRParser.class.toString());
-                }
-                PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig);
-            }
-        } catch (InvalidPasswordException e) {
-            // the document could not be opened, so isEncrypted() was never
-            // consulted; record the encryption flag here instead
-            metadata.set("pdf:encrypted", "true");
-            throw new EncryptedDocumentException(e);
-        } finally {
-            if (pdfDocument != null) {
-                pdfDocument.close();
-            }
-        }
-    }
-
-    /**
-     * Determines the password used to open the document.  A
-     * {@link PasswordProvider} on the context is asked first, then the
-     * {@link #PASSWORD} metadata entry; the empty string is the last resort.
-     *
-     * @return the password, never {@code null}
-     */
-    private String getPassword(Metadata metadata, ParseContext context) {
-        // New style: a PasswordProvider registered on the ParseContext
-        PasswordProvider provider = context.get(PasswordProvider.class);
-        String fromProvider = (provider == null) ? null : provider.getPassword(metadata);
-        if (fromProvider != null) {
-            return fromProvider;
-        }
-
-        // Old style: the password stored directly in the metadata
-        String fromMetadata = metadata.get(PASSWORD);
-
-        // Nothing supplied: use an empty string as the default
-        return (fromMetadata != null) ? fromMetadata : "";
-    }
-
-
-    /**
-     * Copies document-level information into {@code metadata}: access
-     * permissions, XMP and document-information entries, custom
-     * document-information keys, and the PDF, PDF/A and Adobe extension
-     * versions.
-     *
-     * @param document the loaded PDF
-     * @param metadata receives the extracted values
-     * @param context  supplies the XML parser used for the XMP packet
-     */
-    private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
-            throws TikaException {
-
-        //first extract AccessPermissions
-        extractAccessPermissions(document, metadata);
-
-        //now go for the XMP
-        // The packet is parsed once, through loadDOM(), which uses the
-        // DocumentBuilder supplied by the ParseContext.  It is deliberately not
-        // parsed a second time with XMPMetadata.load().
-        Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), context);
-        XMPMetadata xmp = (dom != null) ? new XMPMetadata(dom) : null;
-
-        XMPSchemaDublinCore dcSchema = null;
-        if (xmp != null) {
-            try {
-                dcSchema = xmp.getDublinCoreSchema();
-            } catch (IOException ignored) {
-                // best effort: carry on without the Dublin Core schema
-            }
-            JempboxExtractor.extractXMPMM(xmp, metadata);
-        }
-
-        PDDocumentInformation info = document.getDocumentInformation();
-        metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
-        extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
-        extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema);
-        extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema);
-        addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator());
-        addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords());
-        addMetadata(metadata, "producer", info.getProducer());
-        extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
-
-        // TODO: Move to description in Tika 2.0
-        addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject());
-        addMetadata(metadata, "trapped", info.getTrapped());
-        // TODO Remove these in Tika 2.0
-        addMetadata(metadata, "created", info.getCreationDate());
-        addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate());
-        Calendar modified = info.getModificationDate();
-        addMetadata(metadata, Metadata.LAST_MODIFIED, modified);
-        addMetadata(metadata, TikaCoreProperties.MODIFIED, modified);
-
-        // All remaining metadata is custom
-        // Copy this over as-is
-        List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate",
-                "Keywords", "Producer", "Subject", "Title", "Trapped");
-        for (COSName key : info.getCOSObject().keySet()) {
-            String name = key.getName();
-            if (!handledMetadata.contains(name)) {
-                addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key));
-            }
-        }
-
-        //try to get the various versions
-        //Caveats:
-        //    there is currently a fair amount of redundancy
-        //    TikaCoreProperties.FORMAT can be multivalued
-        //    There are also three potential pdf specific version keys:
-        //    pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion
-        metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion()));
-        metadata.add(TikaCoreProperties.FORMAT.getName(),
-                MEDIA_TYPE.toString() + "; version=" +
-                        Float.toString(document.getDocument().getVersion()));
-
-        extractPDFAVersion(xmp, metadata);
-        extractAdobeExtensionLevel(document, metadata);
-    }
-
-    /**
-     * Records the document's current access permissions as metadata.
-     */
-    private void extractAccessPermissions(PDDocument document, Metadata metadata) {
-        AccessPermission ap = document.getCurrentAccessPermission();
-        metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY,
-                Boolean.toString(ap.canExtractForAccessibility()));
-        metadata.set(AccessPermissions.EXTRACT_CONTENT,
-                Boolean.toString(ap.canExtractContent()));
-        metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT,
-                Boolean.toString(ap.canAssembleDocument()));
-        metadata.set(AccessPermissions.FILL_IN_FORM,
-                Boolean.toString(ap.canFillInForm()));
-        metadata.set(AccessPermissions.CAN_MODIFY,
-                Boolean.toString(ap.canModify()));
-        metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS,
-                Boolean.toString(ap.canModifyAnnotations()));
-        metadata.set(AccessPermissions.CAN_PRINT,
-                Boolean.toString(ap.canPrint()));
-        metadata.set(AccessPermissions.CAN_PRINT_DEGRADED,
-                Boolean.toString(ap.canPrintDegraded()));
-    }
-
-    /**
-     * Records the PDF/A part and conformance found in the XMP, if any.  A
-     * failure to read the schema is recorded as metadata rather than thrown.
-     */
-    private void extractPDFAVersion(XMPMetadata xmp, Metadata metadata) {
-        if (xmp == null) {
-            return;
-        }
-        try {
-            xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
-            XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class);
-            if (pdfaxmp != null) {
-                if (pdfaxmp.getPart() != null) {
-                    metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart()));
-                }
-                if (pdfaxmp.getConformance() != null) {
-                    metadata.set("pdfaid:conformance", pdfaxmp.getConformance());
-                    String version = "A-" + pdfaxmp.getPart() +
-                            pdfaxmp.getConformance().toLowerCase(Locale.ROOT);
-                    metadata.set("pdfa:PDFVersion", version);
-                    metadata.add(TikaCoreProperties.FORMAT.getName(),
-                            MEDIA_TYPE.toString() + "; version=\"" + version + "\"");
-                }
-            }
-            // TODO WARN if this XMP version is inconsistent with document header version?
-        } catch (IOException e) {
-            metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e);
-        }
-    }
-
-    /**
-     * Records the Adobe extension level, and the name of any non-Adobe
-     * extension, declared in the catalog's Extensions dictionary.  Entries
-     * that are not dictionaries are skipped instead of causing a
-     * ClassCastException.
-     */
-    private void extractAdobeExtensionLevel(PDDocument document, Metadata metadata) {
-        //TODO: Let's try to move this into PDFBox.
-        //Attempt to determine Adobe extension level, if present:
-        COSDictionary root = document.getDocumentCatalog().getCOSObject();
-        COSBase extensionsBase = root.getDictionaryObject(COSName.getPDFName("Extensions"));
-        if (!(extensionsBase instanceof COSDictionary)) {
-            return;
-        }
-        COSDictionary extensions = (COSDictionary) extensionsBase;
-        for (COSName extName : extensions.keySet()) {
-            // If it's an Adobe one, interpret it to determine the extension level:
-            if (extName.equals(COSName.getPDFName("ADBE"))) {
-                COSBase adobeBase = extensions.getDictionaryObject(extName);
-                if (adobeBase instanceof COSDictionary) {
-                    COSDictionary adobeExt = (COSDictionary) adobeBase;
-                    String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion"));
-                    int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel"));
-                    //-1 is sentinel value that something went wrong in getInt
-                    if (el != -1) {
-                        metadata.set("pdf:PDFExtensionVersion",
-                                baseVersion + " Adobe Extension Level " + el);
-                        metadata.add(TikaCoreProperties.FORMAT.getName(),
-                                MEDIA_TYPE.toString() + "; version=\"" +
-                                        baseVersion + " Adobe Extension Level " + el + "\"");
-                    }
-                }
-            } else {
-                // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
-                metadata.set("pdf:foundNonAdobeExtensionName", extName.getName());
-            }
-        }
-    }
-
-    /**
-     * Adds the language alternatives of a property found in the XMP schema,
-     * followed by the PDFBox baseline value.
-     * <p/>
-     * The property's getName() must be a valid XMP name.
-     * <p/>
-     * If the property does not allow multiple values, only the first value
-     * found is kept (see TIKA-1295).
-     *
-     * @param metadata       receives the values
-     * @param property       property to fill
-     * @param pdfBoxBaseline value reported by PDFBox, may be null
-     * @param schema         XMP schema to read from, may be null
-     */
-    private void extractMultilingualItems(Metadata metadata, Property property,
-                                          String pdfBoxBaseline, XMPSchema schema) {
-        boolean multiValued = property.isMultiValuePermitted();
-
-        if (schema != null) {
-            for (String lang : schema.getLanguagePropertyLanguages(property.getName())) {
-                String candidate = schema.getLanguageProperty(property.getName(), lang);
-                if (candidate == null || candidate.length() == 0) {
-                    continue;
-                }
-                //the baseline is added at the end, so don't add it here as well
-                if (candidate.equals(pdfBoxBaseline)) {
-                    continue;
-                }
-                addMetadata(metadata, property, candidate);
-                if (!multiValued) {
-                    return;
-                }
-            }
-        }
-
-        if (pdfBoxBaseline == null || pdfBoxBaseline.length() == 0) {
-            return;
-        }
-        //a single-valued property that already holds a value is left alone
-        if (!multiValued && metadata.get(property) != null) {
-            return;
-        }
-        addMetadata(metadata, property, pdfBoxBaseline);
-    }
-
-
-    /**
-     * This tries to read a list from a particular property in
-     * XMPSchemaDublinCore and adds every item to the metadata.
-     * The pdfboxBaseline should be the value that pdfbox returns from its
-     * PDDocumentInformation object (e.g. getAuthor()).  This method is
-     * designed to include the pdfboxBaseline exactly once: XMP items equal
-     * to it are skipped and the baseline itself is added last.  When there
-     * is no baseline, all XMP items are added.
-     * <p/>
-     * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this
-     * on dates!
-     * <p/>
-     * This relies on the property having a DublinCore compliant getName()
-     *
-     * @param metadata       receives the values
-     * @param property       property to fill
-     * @param pdfBoxBaseline value reported by PDFBox, may be null
-     * @param dc             Dublin Core schema to read from, may be null
-     */
-    private void extractDublinCoreListItems(Metadata metadata, Property property,
-                                            String pdfBoxBaseline, XMPSchemaDublinCore dc) {
-        List<String> items = (dc == null) ? null : getXMPBagOrSeqList(dc, property.getName());
-        if (items != null) {
-            for (String item : items) {
-                // A null baseline must not suppress the XMP items; only an
-                // item that duplicates the baseline is skipped here.
-                if (item != null && !item.equals(pdfBoxBaseline)) {
-                    addMetadata(metadata, property, item);
-                }
-            }
-        }
-        //finally, add the baseline
-        if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) {
-            addMetadata(metadata, property, pdfBoxBaseline);
-        }
-    }
-
-    /**
-     * Reads a list-valued attribute that may be stored either as a bag or
-     * as a sequence.  XMPSchema can contain either form for some
-     * attributes, despite the standards documentation, while JempBox expects
-     * one specific form per attribute, so both are tried here.
-     *
-     * @param schema schema to read from
-     * @param name   attribute name
-     * @return list of values or null
-     */
-    private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) {
-        List<String> bag = schema.getBagList(name);
-        return (bag != null) ? bag : schema.getSequenceList(name);
-    }
-
-    /**
-     * Adds a decoded string value for a property.  Null values are ignored,
-     * and a property that does not permit multiple values keeps the value it
-     * already has.
-     */
-    private void addMetadata(Metadata metadata, Property property, String value) {
-        if (value == null) {
-            return;
-        }
-        String decoded = decode(value);
-        boolean occupied = !property.isMultiValuePermitted() && metadata.get(property) != null;
-        if (occupied) {
-            //silently skip adding property that already exists if multiple values are not permitted
-            return;
-        }
-        metadata.add(property, decoded);
-    }
-
-    /**
-     * Adds a decoded string value under a plain metadata name; null values
-     * are ignored.
-     */
-    private void addMetadata(Metadata metadata, String name, String value) {
-        if (value == null) {
-            return;
-        }
-        metadata.add(name, decode(value));
-    }
-
-    /**
-     * Runs the value through {@link PDFEncodedStringDecoder} when that class
-     * reports that it should be decoded; otherwise returns it unchanged.
-     */
-    private String decode(String value) {
-        if (!PDFEncodedStringDecoder.shouldDecode(value)) {
-            return value;
-        }
-        return new PDFEncodedStringDecoder().decode(value);
-    }
-
-    /**
-     * Sets a date under a plain metadata name, using the string form of its
-     * {@link java.util.Date}; null values are ignored.
-     */
-    private void addMetadata(Metadata metadata, String name, Calendar value) {
-        if (value == null) {
-            return;
-        }
-        metadata.set(name, value.getTime().toString());
-    }
-
-    /**
-     * Sets a date-valued property; null values are ignored.
-     */
-    private void addMetadata(Metadata metadata, Property property, Calendar value) {
-        if (value == null) {
-            return;
-        }
-        metadata.set(property, value.getTime());
-    }
-
-    /**
-     * Adds a custom metadata entry from a raw COS value, since PDFBox does
-     * not convert these the way it converts the standard entries.  Arrays are
-     * added element by element, strings by their text, and other values by
-     * toString().
-     */
-    private void addMetadata(Metadata metadata, String name, COSBase value) {
-        // Avoid calling COSDictionary#toString, since it can lead to infinite
-        // recursion. See TIKA-1038 and PDFBOX-1835.
-        if (value == null || value instanceof COSDictionary) {
-            return;
-        }
-        if (value instanceof COSArray) {
-            for (Object element : ((COSArray) value).toList()) {
-                addMetadata(metadata, name, ((COSBase) element));
-            }
-            return;
-        }
-        if (value instanceof COSString) {
-            addMetadata(metadata, name, ((COSString) value).getString());
-            return;
-        }
-        addMetadata(metadata, name, value.toString());
-    }
-
-
-    /**
-     * Tells whether only the XFA should be extracted: the config must ask for
-     * it and the document must have a catalog, an AcroForm and an XFA.
-     */
-    private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) {
-        return config.getIfXFAExtractOnlyXFA()
-                && pdDocument.getDocumentCatalog() != null
-                && pdDocument.getDocumentCatalog().getAcroForm() != null
-                && pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null;
-    }
-
-    /**
-     * Writes the content of the document's XFA to the handler as a complete
-     * XHTML document, using {@link XFAExtractor}.
-     *
-     * @throws TikaException if the XFA cannot be read as XML
-     */
-    private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler,
-                               Metadata metadata, ParseContext context)
-        throws SAXException, IOException, TikaException {
-        XFAExtractor extractor = new XFAExtractor();
-        XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
-        out.startDocument();
-        byte[] xfaBytes = pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes();
-        try (InputStream xfaStream = new ByteArrayInputStream(xfaBytes)) {
-            extractor.extract(xfaStream, out, metadata, context);
-        } catch (XMLStreamException e) {
-            throw new TikaException("XML error in XFA", e);
-        }
-        out.endDocument();
-    }
-
-    /**
-     * Returns the config used when the ParseContext does not supply one.
-     */
-    public PDFParserConfig getPDFParserConfig() {
-        return this.defaultConfig;
-    }
-
-    /**
-     * Replaces the config used when the ParseContext does not supply one.
-     */
-    public void setPDFParserConfig(PDFParserConfig config) {
-        defaultConfig = config;
-    }
-
-    /**
-     * Returns the auto-space setting of the default config.
-     *
-     * @see #setEnableAutoSpace(boolean)
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getEnableAutoSpace() {
-        return defaultConfig.getEnableAutoSpace();
-    }
-
-    /**
-     * If true (the default), the parser should estimate
-     * where spaces should be inserted between words.  For
-     * many PDFs this is necessary as they do not include
-     * explicit whitespace characters.  The value is stored
-     * on the default config.
-     *
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setEnableAutoSpace(boolean v) {
-        defaultConfig.setEnableAutoSpace(v);
-    }
-
-    /**
-     * If true, text in annotations will be extracted.
-     * The value is read from the default config.
-     *
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getExtractAnnotationText() {
-        return defaultConfig.getExtractAnnotationText();
-    }
-
-    /**
-     * If true (the default), text in annotations will be
-     * extracted.  The value is stored on the default config.
-     *
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setExtractAnnotationText(boolean v) {
-        defaultConfig.setExtractAnnotationText(v);
-    }
-
-    /**
-     * Returns the duplicate-overlapping-text setting of the default config.
-     *
-     * @see #setSuppressDuplicateOverlappingText(boolean)
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getSuppressDuplicateOverlappingText() {
-        return defaultConfig.getSuppressDuplicateOverlappingText();
-    }
-
-    /**
-     * If true, the parser should try to remove duplicated
-     * text over the same region.  This is needed for some
-     * PDFs that achieve bolding by re-writing the same
-     * text in the same area.  Note that this can
-     * slow down extraction substantially (PDFBOX-956) and
-     * sometimes remove characters that were not in fact
-     * duplicated (PDFBOX-1155).  By default this is disabled.
-     * The value is stored on the default config.
-     *
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setSuppressDuplicateOverlappingText(boolean v) {
-        defaultConfig.setSuppressDuplicateOverlappingText(v);
-    }
-
-    /**
-     * Returns the sort-by-position setting of the default config.
-     *
-     * @see #setSortByPosition(boolean)
-     * @deprecated use {@link #getPDFParserConfig()}
-     */
-    @Deprecated
-    public boolean getSortByPosition() {
-        return defaultConfig.getSortByPosition();
-    }
-
-    /**
-     * If true, sort text tokens by their x/y position
-     * before extracting text.  This may be necessary for
-     * some PDFs (if the text tokens are not rendered "in
-     * order"), while for other PDFs it can produce the
-     * wrong result (for example if there are 2 columns,
-     * the text will be interleaved).  Default is false.
-     * The value is stored on the default config.
-     *
-     * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)}
-     */
-    @Deprecated
-    public void setSortByPosition(boolean v) {
-        defaultConfig.setSortByPosition(v);
-    }
-
-
-    /**
-     * Parses the XMP packet of the document into a DOM, using the
-     * DocumentBuilder supplied by the ParseContext with its error handler
-     * cleared.
-     *
-     * @return the parsed document; null if there is no XMP metadata or it
-     * could not be parsed
-     */
-    private Document loadDOM(PDMetadata pdMetadata, ParseContext context) {
-        if (pdMetadata != null) {
-            try (InputStream xmpStream = pdMetadata.exportXMPMetadata()) {
-                DocumentBuilder builder = context.getDocumentBuilder();
-                builder.setErrorHandler((ErrorHandler)null);
-                return builder.parse(xmpStream);
-            } catch (IOException|SAXException|TikaException e) {
-                //swallow
-            }
-        }
-        return null;
-    }
-
-}

http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
 
b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
deleted file mode 100644
index 296b191..0000000
--- 
a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ /dev/null
@@ -1,614 +0,0 @@
-package org.apache.tika.parser.pdf;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Serializable;
-import java.util.Locale;
-import java.util.Objects;
-import java.util.Properties;
-
-import org.apache.pdfbox.rendering.ImageType;
-import org.apache.pdfbox.text.PDFTextStripper;
-
-/**
- * Config for PDFParser.
- * <p/>
- * This allows parameters to be set programmatically:
- * <ol>
- * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li>
- * <li>Constructor of PDFParser</li>
- * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li>
- * </ol>
- * <p/>
- * Parameters can also be set by modifying the PDFParser.properties file,
- * which lives in the expected places, in trunk:
- * tika-parsers/src/main/resources/org/apache/tika/parser/pdf
- * <p/>
- * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar:
- * org/apache/tika/parser/pdf
- */
-public class PDFParserConfig implements Serializable {
-
-    public enum OCR_STRATEGY {
-        NO_OCR,
-        OCR_ONLY,
-        OCR_AND_TEXT_EXTRACTION;
-
-        /**
-         * Maps a configured strategy name onto a strategy, ignoring case.
-         *
-         * @param s configured name; may be null
-         * @return OCR_ONLY for "ocr_only", OCR_AND_TEXT_EXTRACTION for any
-         *         value containing "ocr_and_text", otherwise NO_OCR
-         */
-        private static OCR_STRATEGY parse(String s) {
-            if (s != null) {
-                String normalized = s.toLowerCase(Locale.ROOT);
-                if ("ocr_only".equals(normalized)) {
-                    return OCR_ONLY;
-                }
-                if (normalized.contains("ocr_and_text")) {
-                    return OCR_AND_TEXT_EXTRACTION;
-                }
-            }
-            //null, "no_ocr" and anything unrecognized all mean no ocr
-            return NO_OCR;
-        }
-    }
-
-    private static final long serialVersionUID = 6492570218190936986L;
-
-    // True if we let PDFBox "guess" where spaces should go:
-    private boolean enableAutoSpace = true;
-
-    // True if we let PDFBox remove duplicate overlapping text:
-    private boolean suppressDuplicateOverlappingText;
-
-    // True if we extract annotation text ourselves
-    // (workaround for PDFBOX-1143):
-    private boolean extractAnnotationText = true;
-
-    // True if we should sort text tokens by position
-    // (necessary for some PDFs, but messes up other PDFs):
-    private boolean sortByPosition = false;
-
-    //True if acroform content should be extracted
-    private boolean extractAcroFormContent = true;
-
-    //True if inline PDXImage objects should be extracted
-    private boolean extractInlineImages = false;
-
-    //True if inline images (as identified by their object id within
-    //a pdf file) should only be extracted once.
-    private boolean extractUniqueInlineImagesOnly = true;
-
-    //The character width-based tolerance value used to estimate where spaces 
in text should be added
-    private Float averageCharTolerance;
-
-    //The space width-based tolerance value used to estimate where spaces in 
text should be added
-    private Float spacingTolerance;
-
-    //If the PDF has an XFA element, process only that and skip extracting
-    //content from elsewhere in the document.
-    private boolean ifXFAExtractOnlyXFA = false;
-
-    private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR;
-
-    private int ocrDPI = 200;
-    private ImageType ocrImageType = ImageType.GRAY;
-    private String ocrImageFormatName = "png";
-
-    private AccessChecker accessChecker;
-
-    //The PDFParser can throw IOExceptions if there is a problem
-    //with a stream.  If this is set to true, Tika's
-    //parser catches these exceptions, reports them in the metadata
-    //and then throws the first stored exception after the parse has completed.
-    private boolean isCatchIntermediateIOExceptions = true;
-
-    public PDFParserConfig() {
-        init(this.getClass().getResourceAsStream("PDFParser.properties"));
-    }
-
-    /**
-     * Loads properties from InputStream and then tries to close InputStream.
-     * If there is an IOException, this silently swallows the exception
-     * and goes back to the default.
-     *
-     * @param is
-     */
-    public PDFParserConfig(InputStream is) {
-        init(is);
-    }
-
-    /**
-     * Overlays the settings found in the given properties stream onto the
-     * current field values and then closes the stream.
-     * <p/>
-     * A null stream is treated like an empty properties file.  A missing
-     * key keeps the value the field already has, so the config -- including
-     * its {@link AccessChecker} -- is fully populated after this call.
-     * An IOException while reading or closing is swallowed; whatever was
-     * read before the failure is still applied.
-     *
-     * @param is properties to load; may be null
-     * @throws NumberFormatException if ocrDPI is present but is not an integer
-     */
-    private void init(InputStream is) {
-        Properties props = new Properties();
-        if (is != null) {
-            //try-with-resources closes the stream whether or not load() fails
-            try (InputStream in = is) {
-                props.load(in);
-            } catch (IOException e) {
-                //deliberately best effort: carry on with what was read
-            }
-        }
-        setEnableAutoSpace(
-                getBooleanProp(props.getProperty("enableAutoSpace"),
-                        getEnableAutoSpace()));
-        setSuppressDuplicateOverlappingText(
-                getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"),
-                        getSuppressDuplicateOverlappingText()));
-        setExtractAnnotationText(
-                getBooleanProp(props.getProperty("extractAnnotationText"),
-                        getExtractAnnotationText()));
-        setSortByPosition(
-                getBooleanProp(props.getProperty("sortByPosition"),
-                        getSortByPosition()));
-        setExtractAcroFormContent(
-                getBooleanProp(props.getProperty("extractAcroFormContent"),
-                        getExtractAcroFormContent()));
-        setExtractInlineImages(
-                getBooleanProp(props.getProperty("extractInlineImages"),
-                        getExtractInlineImages()));
-        setExtractUniqueInlineImagesOnly(
-                getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
-                        getExtractUniqueInlineImagesOnly()));
-
-        setIfXFAExtractOnlyXFA(
-                getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"),
-                        getIfXFAExtractOnlyXFA()));
-
-        setCatchIntermediateIOExceptions(
-                getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
-                        isCatchIntermediateIOExceptions()));
-
-        setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
-
-        setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
-
-        //a missing key must not replace the default format name with null
-        setOCRImageFormatName(
-                props.getProperty("ocrImageFormatName", getOCRImageFormatName()));
-
-        //only override the image type when the key is present and names a
-        //known type; otherwise keep the current value
-        String imageTypeName = props.getProperty("ocrImageType");
-        if (imageTypeName != null) {
-            ImageType parsedType = parseImageType(imageTypeName);
-            if (parsedType != null) {
-                setOCRImageType(parsedType);
-            }
-        }
-
-        boolean checkExtractAccessPermission =
-                getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
-        boolean allowExtractionForAccessibility =
-                getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
-
-        if (checkExtractAccessPermission) {
-            accessChecker = new AccessChecker(allowExtractionForAccessibility);
-        } else {
-            //silently ignore the crazy configuration of
-            //checkExtractAccessPermission = false,
-            //but allowExtractionForAccessibility=false
-            accessChecker = new AccessChecker();
-        }
-    }
-
-    /**
-     * Pushes the text-extraction settings of this config onto the given
-     * pdf2XHTML: position sorting, word separator, the two optional
-     * tolerances and duplicate-text suppression.
-     *
-     * @param pdf2XHTML
-     */
-    public void configure(PDF2XHTML pdf2XHTML) {
-        pdf2XHTML.setSortByPosition(getSortByPosition());
-
-        //auto-spacing is switched on and off through the word separator
-        pdf2XHTML.setWordSeparator(getEnableAutoSpace() ? " " : "");
-
-        //tolerances are only passed on when they have been set
-        Float charTolerance = getAverageCharTolerance();
-        if (charTolerance != null) {
-            pdf2XHTML.setAverageCharTolerance(charTolerance);
-        }
-        Float spaceTolerance = getSpacingTolerance();
-        if (spaceTolerance != null) {
-            pdf2XHTML.setSpacingTolerance(spaceTolerance);
-        }
-
-        pdf2XHTML.setSuppressDuplicateOverlappingText(
-                getSuppressDuplicateOverlappingText());
-    }
-
-    /**
-     * @see #setExtractAcroFormContent(boolean)
-     */
-    public boolean getExtractAcroFormContent() {
-        return extractAcroFormContent;
-    }
-
-    /**
-     * If true (the default), extract content from AcroForms
-     * at the end of the document.  If an XFA is found,
-     * try to process that, otherwise, process the AcroForm.
-     *
-     * @param extractAcroFormContent
-     */
-    public void setExtractAcroFormContent(boolean extractAcroFormContent) {
-        this.extractAcroFormContent = extractAcroFormContent;
-
-    }
-
-    /**
-     * @see #setIfXFAExtractOnlyXFA(boolean)
-     * @return how to handle XFA data if it exists
-     */
-    public boolean getIfXFAExtractOnlyXFA() {
-        return ifXFAExtractOnlyXFA;
-    }
-
-    /**
-     * If false (the default), extract content from the full PDF
-     * as well as the XFA form.  This will likely lead to some duplicative
-     * content.
-     *
-     * @param ifXFAExtractOnlyXFA
-     */
-    public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) {
-        this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA;
-    }
-
-
-    /**
-     * @see #setExtractInlineImages(boolean)
-     */
-    public boolean getExtractInlineImages() {
-        return extractInlineImages;
-    }
-
-    /**
-     * If true, extract inline embedded OBXImages.
-     * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain
-     * thousands of embedded images totaling > 2.5 GB.  Also, at least as of 
PDFBox 1.8.5,
-     * there can be surprisingly large memory consumption and/or out of memory 
errors.
-     * Set to <code>true</code> with caution.
-     * <p/>
-     * The default is <code>false</code>.
-     * <p/>
-     * See also: {@link #setExtractUniqueInlineImagesOnly(boolean)}.
-     *
-     * @param extractInlineImages
-     */
-    public void setExtractInlineImages(boolean extractInlineImages) {
-        this.extractInlineImages = extractInlineImages;
-    }
-
-    /**
-     * @see #setExtractUniqueInlineImagesOnly(boolean)
-     */
-    public boolean getExtractUniqueInlineImagesOnly() {
-        return extractUniqueInlineImagesOnly;
-    }
-
-    /**
-     * Multiple pages within a PDF file might refer to the same underlying 
image.
-     * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, 
the
-     * parser will call the EmbeddedExtractor each time the image appears on a 
page.
-     * This might be desired for some use cases.  However, to avoid 
duplication of
-     * extracted images, set this to <code>true</code>.  The default is 
<code>true</code>.
-     * <p/>
-     * Note that uniqueness is determined only by the underlying PDF COSObject 
id, not by
-     * file hash or similar equality metric.
-     * If the PDF actually contains multiple copies of the same image
-     * -- all with different object ids -- then all images will be extracted.
-     * <p/>
-     * For this parameter to have any effect, {@link #extractInlineImages} 
must be
-     * set to <code>true</code>.
-     * <p>
-     * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the 
setting
-     * of this parameter, the extractor will only pull out one copy of each 
image per
-     * page.  This parameter tries to capture uniqueness across the entire 
document.
-     *
-     * @param extractUniqueInlineImagesOnly
-     */
-    public void setExtractUniqueInlineImagesOnly(boolean 
extractUniqueInlineImagesOnly) {
-        this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly;
-
-    }
-
-    /**
-     * @see #setEnableAutoSpace(boolean)
-     */
-    public boolean getEnableAutoSpace() {
-        return enableAutoSpace;
-    }
-
-    /**
-     * If true (the default), the parser should estimate
-     * where spaces should be inserted between words.  For
-     * many PDFs this is necessary as they do not include
-     * explicit whitespace characters.
-     */
-    public void setEnableAutoSpace(boolean enableAutoSpace) {
-        this.enableAutoSpace = enableAutoSpace;
-    }
-
-    /**
-     * @see #setSuppressDuplicateOverlappingText(boolean)
-     */
-    public boolean getSuppressDuplicateOverlappingText() {
-        return suppressDuplicateOverlappingText;
-    }
-
-    /**
-     * If true, the parser should try to remove duplicated
-     * text over the same region.  This is needed for some
-     * PDFs that achieve bolding by re-writing the same
-     * text in the same area.  Note that this can
-     * slow down extraction substantially (PDFBOX-956) and
-     * sometimes remove characters that were not in fact
-     * duplicated (PDFBOX-1155).  By default this is disabled.
-     */
-    public void setSuppressDuplicateOverlappingText(
-            boolean suppressDuplicateOverlappingText) {
-        this.suppressDuplicateOverlappingText = 
suppressDuplicateOverlappingText;
-    }
-
-    /**
-     * @see #setExtractAnnotationText(boolean)
-     */
-    public boolean getExtractAnnotationText() {
-        return extractAnnotationText;
-    }
-
-    /**
-     * If true (the default), text in annotations will be
-     * extracted.
-     */
-    public void setExtractAnnotationText(boolean extractAnnotationText) {
-        this.extractAnnotationText = extractAnnotationText;
-    }
-
-    /**
-     * @see #setSortByPosition(boolean)
-     */
-    public boolean getSortByPosition() {
-        return sortByPosition;
-    }
-
-    /**
-     * If true, sort text tokens by their x/y position
-     * before extracting text.  This may be necessary for
-     * some PDFs (if the text tokens are not rendered "in
-     * order"), while for other PDFs it can produce the
-     * wrong result (for example if there are 2 columns,
-     * the text will be interleaved).  Default is false.
-     */
-    public void setSortByPosition(boolean sortByPosition) {
-        this.sortByPosition = sortByPosition;
-    }
-
-    /**
-     * @see #setAverageCharTolerance(Float)
-     */
-    public Float getAverageCharTolerance() {
-        return averageCharTolerance;
-    }
-
-    /**
-     * See {@link PDFTextStripper#setAverageCharTolerance(float)}
-     */
-    public void setAverageCharTolerance(Float averageCharTolerance) {
-        this.averageCharTolerance = averageCharTolerance;
-    }
-
-    /**
-     * @see #setSpacingTolerance(Float)
-     */
-    public Float getSpacingTolerance() {
-        return spacingTolerance;
-    }
-
-    /**
-     * See {@link PDFTextStripper#setSpacingTolerance(float)}
-     */
-    public void setSpacingTolerance(Float spacingTolerance) {
-        this.spacingTolerance = spacingTolerance;
-    }
-
-    public AccessChecker getAccessChecker() {
-        return accessChecker;
-    }
-
-    public void setAccessChecker(AccessChecker accessChecker) {
-        this.accessChecker = accessChecker;
-    }
-
-    /**
-     * See {@link #setCatchIntermediateIOExceptions(boolean)}
-     * @return whether or not to catch IOExceptions
-     */
-    public boolean isCatchIntermediateIOExceptions() {
-        return isCatchIntermediateIOExceptions;
-    }
-
-    /**
-     * The PDFBox parser will throw an IOException if there is
-     * a problem with a stream.  If this is set to <code>true</code>,
-     * Tika's PDFParser will catch these exceptions and try to parse
-     * the rest of the document.  After the parse is completed,
-     * Tika's PDFParser will throw the first caught exception.
-     * @param catchIntermediateIOExceptions
-     */
-    public void setCatchIntermediateIOExceptions(boolean 
catchIntermediateIOExceptions) {
-        isCatchIntermediateIOExceptions = catchIntermediateIOExceptions;
-    }
-
-    /**
-     * Which strategy to use for OCR
-     * @param ocrStrategy
-     */
-    public void setOCRStrategy(OCR_STRATEGY ocrStrategy) {
-        this.ocrStrategy = ocrStrategy;
-    }
-
-    /**
-     *
-     * @return strategy to use for OCR
-     */
-    public OCR_STRATEGY getOCRStrategy() {
-        return ocrStrategy;
-    }
-
-    /**
-     * Reads a boolean from a property value, ignoring case.
-     *
-     * @param p              raw property value; may be null
-     * @param defaultMissing value to use when p is null or is neither
-     *                       "true" nor "false"
-     * @return the parsed value or defaultMissing
-     */
-    private boolean getBooleanProp(String p, boolean defaultMissing) {
-        if (p == null) {
-            return defaultMissing;
-        }
-        String normalized = p.toLowerCase(Locale.ROOT);
-        if ("true".equals(normalized)) {
-            return true;
-        }
-        return "false".equals(normalized) ? false : defaultMissing;
-    }
-    /**
-     * Reads an int from a property value.
-     *
-     * @param p              raw property value; may be null
-     * @param defaultMissing value to use when p is null
-     * @return the parsed value, or defaultMissing when p is null
-     * @throws NumberFormatException if p is non-null and unparseable
-     */
-    private int getIntProp(String p, int defaultMissing) {
-        return (p == null) ? defaultMissing : Integer.parseInt(p);
-    }
-
-    /**
-     * String representation of the image format used to render
-     * the page image for OCR (examples: png, tiff, jpeg)
-     * @return
-     */
-    public String getOCRImageFormatName() {
-        return ocrImageFormatName;
-    }
-
-    /**
-     * @see #getOCRImageFormatName()
-     *
-     * @param ocrImageFormatName name of image format used to render
-     *                           page image
-     */
-    public void setOCRImageFormatName(String ocrImageFormatName) {
-        this.ocrImageFormatName = ocrImageFormatName;
-    }
-
-    /**
-     * Image type used to render the page image for OCR.
-     * @see #setOCRImageType(ImageType)
-     * @return image type
-     */
-    public ImageType getOCRImageType() {
-        return ocrImageType;
-    }
-
-    /**
-     * Image type used to render the page image for OCR.
-     * @param ocrImageType
-     */
-    public void setOCRImageType(ImageType ocrImageType) {
-        this.ocrImageType = ocrImageType;
-    }
-
-    /**
-     * Dots per inch used to render the page image for OCR
-     * @return dots per inch
-     */
-    public int getOCRDPI() {
-        return ocrDPI;
-    }
-
-    /**
-     * Dots per inch used to render the page image for OCR
-     * @param ocrDPI
-     */
-    public void setOCRDPI(int ocrDPI) {
-        this.ocrDPI = ocrDPI;
-    }
-
-    /**
-     * Maps a configured image type name onto an {@link ImageType},
-     * ignoring case.
-     * <p/>
-     * As with the boolean properties, a missing or unrecognized value does
-     * not clear the setting: the currently configured image type is returned.
-     *
-     * @param ocrImageType configured name; may be null
-     * @return the matching type, or the current image type if the name is
-     *         null or matches no ImageType
-     */
-    private ImageType parseImageType(String ocrImageType) {
-        if (ocrImageType != null) {
-            for (ImageType t : ImageType.values()) {
-                if (ocrImageType.equalsIgnoreCase(t.toString())) {
-                    return t;
-                }
-            }
-        }
-        //the parameter shadows the field, hence the explicit this
-        return this.ocrImageType;
-    }
-
-    /**
-     * Two configs are equal when all of their settings match.
-     * <p/>
-     * The tolerances are null unless explicitly set, and the OCR strategy,
-     * image type, image format name and access checker can all be set to
-     * null, so the object-valued settings are compared null-safely.
-     *
-     * @param o object to compare with
-     * @return true if o is a PDFParserConfig with the same settings
-     */
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) return true;
-        if (!(o instanceof PDFParserConfig)) return false;
-
-        PDFParserConfig config = (PDFParserConfig) o;
-
-        return getEnableAutoSpace() == config.getEnableAutoSpace()
-                && getSuppressDuplicateOverlappingText()
-                        == config.getSuppressDuplicateOverlappingText()
-                && getExtractAnnotationText() == config.getExtractAnnotationText()
-                && getSortByPosition() == config.getSortByPosition()
-                && getExtractAcroFormContent() == config.getExtractAcroFormContent()
-                && getExtractInlineImages() == config.getExtractInlineImages()
-                && getExtractUniqueInlineImagesOnly()
-                        == config.getExtractUniqueInlineImagesOnly()
-                && getIfXFAExtractOnlyXFA() == config.getIfXFAExtractOnlyXFA()
-                && getOCRDPI() == config.getOCRDPI()
-                && isCatchIntermediateIOExceptions()
-                        == config.isCatchIntermediateIOExceptions()
-                && Objects.equals(getAverageCharTolerance(),
-                        config.getAverageCharTolerance())
-                && Objects.equals(getSpacingTolerance(), config.getSpacingTolerance())
-                && getOCRStrategy() == config.getOCRStrategy()
-                && getOCRImageType() == config.getOCRImageType()
-                && Objects.equals(getOCRImageFormatName(),
-                        config.getOCRImageFormatName())
-                && Objects.equals(getAccessChecker(), config.getAccessChecker());
-    }
-
-    /**
-     * Combines all settings into a hash code.
-     * <p/>
-     * Object-valued settings that may be null (the tolerances are null
-     * unless explicitly set) contribute 0 instead of throwing; non-null
-     * values contribute their own hashCode.
-     *
-     * @return hash code over all settings
-     */
-    @Override
-    public int hashCode() {
-        int result = (getEnableAutoSpace() ? 1 : 0);
-        result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0);
-        result = 31 * result + (getExtractAnnotationText() ? 1 : 0);
-        result = 31 * result + (getSortByPosition() ? 1 : 0);
-        result = 31 * result + (getExtractAcroFormContent() ? 1 : 0);
-        result = 31 * result + (getExtractInlineImages() ? 1 : 0);
-        result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
-        result = 31 * result + Objects.hashCode(getAverageCharTolerance());
-        result = 31 * result + Objects.hashCode(getSpacingTolerance());
-        result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
-        result = 31 * result + Objects.hashCode(ocrStrategy);
-        result = 31 * result + getOCRDPI();
-        result = 31 * result + Objects.hashCode(getOCRImageType());
-        result = 31 * result + Objects.hashCode(getOCRImageFormatName());
-        result = 31 * result + Objects.hashCode(getAccessChecker());
-        result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
-        return result;
-    }
-
-    @Override
-    public String toString() {
-        return "PDFParserConfig{" +
-                "enableAutoSpace=" + enableAutoSpace +
-                ", suppressDuplicateOverlappingText=" + 
suppressDuplicateOverlappingText +
-                ", extractAnnotationText=" + extractAnnotationText +
-                ", sortByPosition=" + sortByPosition +
-                ", extractAcroFormContent=" + extractAcroFormContent +
-                ", extractInlineImages=" + extractInlineImages +
-                ", extractUniqueInlineImagesOnly=" + 
extractUniqueInlineImagesOnly +
-                ", averageCharTolerance=" + averageCharTolerance +
-                ", spacingTolerance=" + spacingTolerance +
-                ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
-                ", ocrStrategy=" + ocrStrategy +
-                ", ocrDPI=" + ocrDPI +
-                ", ocrImageType=" + ocrImageType +
-                ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
-                ", accessChecker=" + accessChecker +
-                ", isCatchIntermediateIOExceptions=" + 
isCatchIntermediateIOExceptions +
-                '}';
-    }
-}

[2/5] tika git commit: TIKA-2059 - Merge multimedia and pdf parser modules and bundles

Reply via email to