http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java deleted file mode 100644 index 775e590..0000000 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AccessChecker.java +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.parser.pdf; - -import java.io.Serializable; - -import org.apache.tika.exception.AccessPermissionException; -import org.apache.tika.metadata.AccessPermissions; -import org.apache.tika.metadata.Metadata; - -/** - * Checks whether or not a document allows extraction generally - * or extraction for accessibility only. - */ -public class AccessChecker implements Serializable { - - private static final long serialVersionUID = 6492570218190936986L; - - private final boolean needToCheck; - private final boolean allowAccessibility; - - /** - * This constructs an {@link AccessChecker} that - * will not perform any checking and will always return without - * throwing an exception. - * <p/> - * This constructor is available to allow for Tika's legacy ( <= v1.7) behavior. - */ - public AccessChecker() { - needToCheck = false; - allowAccessibility = true; - } - - /** - * This constructs an {@link AccessChecker} that will check - * for whether or not content should be extracted from a document. - * - * @param allowExtractionForAccessibility if general extraction is not allowed, is extraction for accessibility allowed - */ - public AccessChecker(boolean allowExtractionForAccessibility) { - needToCheck = true; - this.allowAccessibility = allowExtractionForAccessibility; - } - - /** - * Checks to see if a document's content should be extracted based - * on metadata values and the value of {@link #allowAccessibility} in the constructor. - * - * @param metadata - * @throws AccessPermissionException if access is not permitted - */ - public void check(Metadata metadata) throws AccessPermissionException { - if (!needToCheck) { - return; - } - if ("false".equals(metadata.get(AccessPermissions.EXTRACT_CONTENT))) { - if (allowAccessibility) { - if ("true".equals(metadata.get(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY))) { - return; - } - throw new AccessPermissionException("Content extraction for accessibility is not allowed."); - } - throw new AccessPermissionException("Content extraction is not allowed."); - } - } -}
http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java deleted file mode 100644 index 3ad551d..0000000 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.pdf; - -import java.io.IOException; -import java.io.Writer; - -import org.apache.commons.io.IOExceptionWithCause; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.text.PDFTextStripper; -import org.apache.pdfbox.text.TextPosition; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.ParseContext; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - - -/** - * Utility class that overrides the {@link PDFTextStripper} functionality - * to integrate text extraction via OCR only. - * - */ -class OCR2XHTML extends AbstractPDF2XHTML { - - private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, - PDFParserConfig config) - throws IOException { - super(document, handler, context, metadata, config); - } - - /** - * Converts the given PDF document (and related metadata) to a stream - * of XHTML SAX events sent to the given content handler. - * - * @param document PDF document - * @param handler SAX content handler - * @param metadata PDF metadata - * @throws SAXException if the content handler fails to process SAX events - * @throws TikaException if there was an exception outside of per page processing - */ - public static void process( - PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, - PDFParserConfig config) - throws SAXException, TikaException { - OCR2XHTML ocr2XHTML = null; - try { - ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config); - ocr2XHTML.writeText(document, new Writer() { - @Override - public void write(char[] cbuf, int off, int len) { - } - - @Override - public void flush() { - } - - @Override - public void close() { - } - }); - } catch (IOException e) { - if (e.getCause() instanceof SAXException) { - throw (SAXException) e.getCause(); - } else { - throw new TikaException("Unable to extract PDF content", e); - } - } - if (ocr2XHTML.exceptions.size() > 0) { - //throw the first - throw new TikaException("Unable to extract all PDF content", - ocr2XHTML.exceptions.get(0)); - } - } - - @Override - public void processPage(PDPage pdPage) throws IOException { - try { - startPage(pdPage); - doOCROnCurrentPage(); - endPage(pdPage); - } catch (TikaException |SAXException e) { - throw new IOExceptionWithCause(e); - } catch (IOException e) { - handleCatchableIOE(e); - } - } - - @Override - protected void writeString(String text) throws IOException { - //no-op - } - - @Override - protected void writeCharacters(TextPosition text) throws IOException { - //no-op - } - - @Override - protected void writeWordSeparator() throws IOException { - //no-op - } - - @Override - protected void writeLineSeparator() throws IOException { - //no-op - } - -} - http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java deleted file mode 100644 index ac9823e..0000000 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.pdf; - -import java.awt.image.BufferedImage; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.io.Writer; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import org.apache.commons.io.IOExceptionWithCause; -import org.apache.pdfbox.cos.COSBase; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.cos.COSStream; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.graphics.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; -import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; -import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; -import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; -import org.apache.pdfbox.text.PDFTextStripper; -import org.apache.pdfbox.text.TextPosition; -import org.apache.pdfbox.tools.imageio.ImageIOUtil; -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.EmbeddedContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.AttributesImpl; - -/** - * Utility class that overrides the {@link PDFTextStripper} functionality - * to produce a semi-structured XHTML SAX events instead of a plain text - * stream. - */ -class PDF2XHTML extends AbstractPDF2XHTML { - - - private static final List<String> JPEG = Arrays.asList( - COSName.DCT_DECODE.getName(), - COSName.DCT_DECODE_ABBREVIATION.getName()); - - /** - * This keeps track of the pdf object ids for inline - * images that have been processed. - * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly() - * is true, this will be checked before extracting an embedded image. - * The integer keeps track of the inlineImageCounter for that image. - * This integer is used to identify images in the markup. - * - * This is used across the document. To avoid infinite recursion - * TIKA-1742, we're limiting the export to one image per page. - */ - private Map<COSStream, Integer> processedInlineImages = new HashMap<>(); - private int inlineImageCounter = 0; - private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, - PDFParserConfig config) - throws IOException { - super(document, handler, context, metadata, config); - } - - /** - * Converts the given PDF document (and related metadata) to a stream - * of XHTML SAX events sent to the given content handler. - * - * @param document PDF document - * @param handler SAX content handler - * @param metadata PDF metadata - * @throws SAXException if the content handler fails to process SAX events - * @throws TikaException if there was an exception outside of per page processing - */ - public static void process( - PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, - PDFParserConfig config) - throws SAXException, TikaException { - PDF2XHTML pdf2XHTML = null; - try { - // Extract text using a dummy Writer as we override the - // key methods to output to the given content - // handler. - pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); - - config.configure(pdf2XHTML); - - pdf2XHTML.writeText(document, new Writer() { - @Override - public void write(char[] cbuf, int off, int len) { - } - - @Override - public void flush() { - } - - @Override - public void close() { - } - }); - - } catch (IOException e) { - if (e.getCause() instanceof SAXException) { - throw (SAXException) e.getCause(); - } else { - throw new TikaException("Unable to extract PDF content", e); - } - } - if (pdf2XHTML.exceptions.size() > 0) { - //throw the first - throw new TikaException("Unable to extract all PDF content", - pdf2XHTML.exceptions.get(0)); - } - } - - - @Override - public void processPage(PDPage page) throws IOException { - try { - super.processPage(page); - } catch (IOException e) { - handleCatchableIOE(e); - } - } - - @Override - protected void endPage(PDPage page) throws IOException { - try { - writeParagraphEnd(); - try { - extractImages(page.getResources(), new HashSet<COSBase>()); - } catch (IOException e) { - handleCatchableIOE(e); - } - super.endPage(page); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to end a page", e); - } catch (IOException e) { - exceptions.add(e); - } - } - - private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { - if (resources == null || config.getExtractInlineImages() == false) { - return; - } - - for (COSName name : resources.getXObjectNames()) { - - PDXObject object = resources.getXObject(name); - if (object == null) { - continue; - } - COSStream cosStream = object.getCOSObject(); - if (seenThisPage.contains(cosStream)) { - //avoid infinite recursion TIKA-1742 - continue; - } - seenThisPage.add(cosStream); - - if (object instanceof PDFormXObject) { - extractImages(((PDFormXObject) object).getResources(), seenThisPage); - } else if (object instanceof PDImageXObject) { - - PDImageXObject image = (PDImageXObject) object; - - Metadata metadata = new Metadata(); - String extension = image.getSuffix(); - if (extension == null) { - metadata.set(Metadata.CONTENT_TYPE, "image/png"); - extension = "png"; - } else if (extension.equals("jpg")) { - metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); - } else if (extension.equals("tiff")) { - metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); - extension = "tif"; - } else { - //TODO: determine if we need to add more image types - //throw new RuntimeException("EXTEN:" + extension); - } - - Integer imageNumber = processedInlineImages.get(cosStream); - if (imageNumber == null) { - imageNumber = inlineImageCounter++; - } - String fileName = "image" + imageNumber + "."+extension; - metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); - - // Output the img tag - AttributesImpl attr = new AttributesImpl(); - attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); - attr.addAttribute("", "alt", "alt", "CDATA", fileName); - xhtml.startElement("img", attr); - xhtml.endElement("img"); - - //Do we only want to process unique COSObject ids? - //If so, have we already processed this one? - if (config.getExtractUniqueInlineImagesOnly() == true) { - if (processedInlineImages.containsKey(cosStream)) { - continue; - } - processedInlineImages.put(cosStream, imageNumber); - } - - metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, - TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); - - EmbeddedDocumentExtractor extractor = - getEmbeddedDocumentExtractor(); - if (extractor.shouldParseEmbedded(metadata)) { - ByteArrayOutputStream buffer = new ByteArrayOutputStream(); - try { - //TODO: handle image.getMetadata()? - writeToBuffer(image, extension, buffer); - extractor.parseEmbedded( - new ByteArrayInputStream(buffer.toByteArray()), - new EmbeddedContentHandler(xhtml), - metadata, false); - } catch (IOException e) { - handleCatchableIOE(e); - } - } - } - } - } - - //nearly directly copied from PDFBox ExtractImages - private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out) - throws IOException { - - BufferedImage image = pdImage.getImage(); - if (image != null) { - if ("jpg".equals(suffix)) { - String colorSpaceName = pdImage.getColorSpace().getName(); - //TODO: figure out if we want directJPEG as a configuration - //previously: if (directJPeg || PDDeviceGray.... - if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || - PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) { - // RGB or Gray colorspace: get and write the unmodifiedJPEG stream - InputStream data = pdImage.getStream().createInputStream(JPEG); - org.apache.pdfbox.io.IOUtils.copy(data, out); - org.apache.pdfbox.io.IOUtils.closeQuietly(data); - } else { - // for CMYK and other "unusual" colorspaces, the JPEG will be converted - ImageIOUtil.writeImage(image, suffix, out); - } - } else { - ImageIOUtil.writeImage(image, suffix, out); - } - } - out.flush(); - } - - @Override - protected void writeParagraphStart() throws IOException { - super.writeParagraphStart(); - try { - xhtml.startElement("p"); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to start a paragraph", e); - } - } - - @Override - protected void writeParagraphEnd() throws IOException { - super.writeParagraphEnd(); - try { - xhtml.endElement("p"); - } catch (SAXException e) { - throw new IOExceptionWithCause("Unable to end a paragraph", e); - } - } - - @Override - protected void writeString(String text) throws IOException { - try { - xhtml.characters(text); - } catch (SAXException e) { - throw new IOExceptionWithCause( - "Unable to write a string: " + text, e); - } - } - - @Override - protected void writeCharacters(TextPosition text) throws IOException { - try { - xhtml.characters(text.getUnicode()); - } catch (SAXException e) { - throw new IOExceptionWithCause( - "Unable to write a character: " + text.getUnicode(), e); - } - } - - @Override - protected void writeWordSeparator() throws IOException { - try { - xhtml.characters(getWordSeparator()); - } catch (SAXException e) { - throw new IOExceptionWithCause( - "Unable to write a space character", e); - } - } - - @Override - protected void writeLineSeparator() throws IOException { - try { - xhtml.newline(); - } catch (SAXException e) { - throw new IOExceptionWithCause( - "Unable to write a newline character", e); - } - } - -} - http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java deleted file mode 100644 index 057f833..0000000 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tika.parser.pdf; - -import static java.nio.charset.StandardCharsets.ISO_8859_1; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; - -import org.apache.pdfbox.cos.COSString; -import org.apache.pdfbox.io.RandomAccessBuffer; -import org.apache.pdfbox.io.RandomAccessRead; -import org.apache.pdfbox.pdfparser.COSParser; - -/** - * In fairly rare cases, a PDF's XMP will contain a string that - * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and - * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000" - * <p> - * This class can be used to decode those strings. - * <p> - * See TIKA-1678. Many thanks to Andrew Jackson for raising this issue - * and Tilman Hausherr for the solution. - * <p> - * As of this writing, we are only handling strings that start with - * an encoded BOM. Andrew Jackson found a handful of other examples (e.g. - * this ISO-8859-7 string: - * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336 - * \\364\\347\\362 PRAKSIS \\363\\364\\357") - * that we aren't currently handling. - */ -class PDFEncodedStringDecoder { - - private static final String[] PDF_ENCODING_BOMS = { - "\\376\\377", //UTF-16BE - "\\377\\376", //UTF-16LE - "\\357\\273\\277"//UTF-8 - }; - - /** - * Does this string contain an octal-encoded UTF BOM? - * Call this statically to determine if you should bother creating a new parser to parse it. - * @param s - * @return - */ - static boolean shouldDecode(String s) { - if (s == null || s.length() < 8) { - return false; - } - for (String BOM : PDF_ENCODING_BOMS) { - if (s.startsWith(BOM)) { - return true; - } - } - return false; - } - - /** - * This assumes that {@link #shouldDecode(String)} has been called - * and has returned true. If you run this on a non-octal encoded string, - * disaster will happen! - * - * @param value - * @return - */ - String decode(String value) { - try { - byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1); - InputStream is = new ByteArrayInputStream(bytes); - COSStringParser p = new COSStringParser(new RandomAccessBuffer(is)); - String parsed = p.myParseCOSString(); - if (parsed != null) { - return parsed; - } - } catch (IOException e) { - //oh well, we tried. - } - //just return value if something went wrong - return value; - } - - class COSStringParser extends COSParser { - - COSStringParser(RandomAccessRead buffer) throws IOException { - super(buffer); - } - - /** - * - * @return parsed string or null if something went wrong. - */ - String myParseCOSString() { - try { - COSString cosString = parseCOSString(); - if (cosString != null) { - return cosString.getString(); - } - } catch (IOException e) { - } - return null; - } - } -} http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java deleted file mode 100644 index f735f25..0000000 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ /dev/null @@ -1,626 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.pdf; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.stream.XMLStreamException; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.Arrays; -import java.util.Calendar; -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.Set; - -import org.apache.commons.io.input.CloseShieldInputStream; -import org.apache.jempbox.xmp.XMPMetadata; -import org.apache.jempbox.xmp.XMPSchema; -import org.apache.jempbox.xmp.XMPSchemaDublinCore; -import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId; -import org.apache.pdfbox.cos.COSArray; -import org.apache.pdfbox.cos.COSBase; -import org.apache.pdfbox.cos.COSDictionary; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.cos.COSString; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDDocumentInformation; -import org.apache.pdfbox.pdmodel.common.PDMetadata; -import org.apache.pdfbox.pdmodel.encryption.AccessPermission; -import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; -import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.AccessPermissions; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.PagedText; -import org.apache.tika.metadata.Property; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.PasswordProvider; -import org.apache.tika.parser.ocr.TesseractOCRParser; -import org.apache.tika.parser.xmp.JempboxExtractor; -import org.apache.tika.sax.XHTMLContentHandler; -import org.w3c.dom.Document; -import org.xml.sax.ContentHandler; -import org.xml.sax.ErrorHandler; -import org.xml.sax.SAXException; - -/** - * PDF parser. - * <p/> - * This parser can process also encrypted PDF documents if the required - * password is given as a part of the input metadata associated with a - * document. If no password is given, then this parser will try decrypting - * the document using the empty password that's often used with PDFs. If - * the PDF contains any embedded documents (for example as part of a PDF - * package) then this parser will use the {@link EmbeddedDocumentExtractor} - * to handle them. - * <p/> - * As of Tika 1.6, it is possible to extract inline images with - * the {@link EmbeddedDocumentExtractor} as if they were regular - * attachments. By default, this feature is turned off because of - * the potentially enormous number and size of inline images. To - * turn this feature on, see - * {@link PDFParserConfig#setExtractInlineImages(boolean)}. - */ -public class PDFParser extends AbstractParser { - - - /** - * Metadata key for giving the document password to the parser. - * - * @since Apache Tika 0.5 - * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead - */ - public static final String PASSWORD = "org.apache.tika.parser.pdf.password"; - private static final MediaType MEDIA_TYPE = MediaType.application("pdf"); - /** - * Serial version UID - */ - private static final long serialVersionUID = -752276948656079347L; - private static final Set<MediaType> SUPPORTED_TYPES = - Collections.singleton(MEDIA_TYPE); - private PDFParserConfig defaultConfig = new PDFParserConfig(); - - - - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } - - public void parse( - InputStream stream, ContentHandler handler, - Metadata metadata, ParseContext context) - throws IOException, SAXException, TikaException { - - PDDocument pdfDocument = null; - TemporaryResources tmp = new TemporaryResources(); - //config from context, or default if not set via context - PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); - String password = ""; - try { - // PDFBox can process entirely in memory, or can use a temp file - // for unpacked / processed resources - // Decide which to do based on if we're reading from a file or not already - //TODO: make this configurable via MemoryUsageSetting - TikaInputStream tstream = TikaInputStream.cast(stream); - password = getPassword(metadata, context); - if (tstream != null && tstream.hasFile()) { - // File based -- send file directly to PDFBox - pdfDocument = PDDocument.load(tstream.getPath().toFile(), password); - } else { - pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); - } - metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); - - metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); - extractMetadata(pdfDocument, metadata, context); - - AccessChecker checker = localConfig.getAccessChecker(); - checker.check(metadata); - if (handler != null) { - if (shouldHandleXFAOnly(pdfDocument, localConfig)) { - handleXFAOnly(pdfDocument, handler, metadata, context); - } else if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) { - metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); - OCR2XHTML.process(pdfDocument, handler, context, metadata, localConfig); - } else { - if (localConfig.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) { - metadata.add("X-Parsed-By", TesseractOCRParser.class.toString()); - } - PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); - } - - } - } catch (InvalidPasswordException e) { - metadata.set("pdf:encrypted", "true"); - throw new EncryptedDocumentException(e); - } finally { - if (pdfDocument != null) { - pdfDocument.close(); - } - } - } - - private String getPassword(Metadata metadata, ParseContext context) { - String password = null; - - // Did they supply a new style Password Provider? - PasswordProvider passwordProvider = context.get(PasswordProvider.class); - if (passwordProvider != null) { - password = passwordProvider.getPassword(metadata); - } - - // Fall back on the old style metadata if set - if (password == null && metadata.get(PASSWORD) != null) { - password = metadata.get(PASSWORD); - } - - // If no password is given, use an empty string as the default - if (password == null) { - password = ""; - } - return password; - } - - - private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context) - throws TikaException { - - //first extract AccessPermissions - AccessPermission ap = document.getCurrentAccessPermission(); - metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, - Boolean.toString(ap.canExtractForAccessibility())); - metadata.set(AccessPermissions.EXTRACT_CONTENT, - Boolean.toString(ap.canExtractContent())); - metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, - Boolean.toString(ap.canAssembleDocument())); - metadata.set(AccessPermissions.FILL_IN_FORM, - Boolean.toString(ap.canFillInForm())); - metadata.set(AccessPermissions.CAN_MODIFY, - Boolean.toString(ap.canModify())); - metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, - Boolean.toString(ap.canModifyAnnotations())); - metadata.set(AccessPermissions.CAN_PRINT, - Boolean.toString(ap.canPrint())); - metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, - Boolean.toString(ap.canPrintDegraded())); - - - //now go for the XMP - Document dom = loadDOM(document.getDocumentCatalog().getMetadata(), context); - - XMPMetadata xmp = null; - if (dom != null) { - xmp = new XMPMetadata(dom); - } - XMPSchemaDublinCore dcSchema = null; - try { - if (document.getDocumentCatalog().getMetadata() != null) { - InputStream xmpIs = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); - xmp = XMPMetadata.load(xmpIs); - } - } catch (IOException e) {} - - if (xmp != null) { - try { - dcSchema = xmp.getDublinCoreSchema(); - } catch (IOException e) {} - - JempboxExtractor.extractXMPMM(xmp, metadata); - } - - PDDocumentInformation info = document.getDocumentInformation(); - metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); - extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); - extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); - addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); - addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); - addMetadata(metadata, "producer", info.getProducer()); - extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); - - // TODO: Move to description in Tika 2.0 - addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); - addMetadata(metadata, "trapped", info.getTrapped()); - // TODO Remove these in Tika 2.0 - addMetadata(metadata, "created", info.getCreationDate()); - addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); - Calendar modified = info.getModificationDate(); - addMetadata(metadata, Metadata.LAST_MODIFIED, modified); - addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); - - // All remaining metadata is custom - // Copy this over as-is - List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", - "Keywords", "Producer", "Subject", "Title", "Trapped"); - for (COSName key : info.getCOSObject().keySet()) { - String name = key.getName(); - if (!handledMetadata.contains(name)) { - addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key)); - } - } - - //try to get the various versions - //Caveats: - // there is currently a fair amount of redundancy - // TikaCoreProperties.FORMAT can be multivalued - // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion - metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); - metadata.add(TikaCoreProperties.FORMAT.getName(), - MEDIA_TYPE.toString() + "; version=" + - Float.toString(document.getDocument().getVersion())); - - try { - if (xmp != null) { - xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); - XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); - if (pdfaxmp != null) { - if (pdfaxmp.getPart() != null) { - metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); - } - if (pdfaxmp.getConformance() != null) { - metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); - String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); - metadata.set("pdfa:PDFVersion", version); - metadata.add(TikaCoreProperties.FORMAT.getName(), - MEDIA_TYPE.toString() + "; version=\"" + version + "\""); - } - } - // TODO WARN if this XMP version is inconsistent with document header version? - } - } catch (IOException e) { - metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); - } - //TODO: Let's try to move this into PDFBox. - //Attempt to determine Adobe extension level, if present: - COSDictionary root = document.getDocumentCatalog().getCOSObject(); - COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); - if (extensions != null) { - for (COSName extName : extensions.keySet()) { - // If it's an Adobe one, interpret it to determine the extension level: - if (extName.equals(COSName.getPDFName("ADBE"))) { - COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); - if (adobeExt != null) { - String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); - int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); - //-1 is sentinel value that something went wrong in getInt - if (el != -1) { - metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); - metadata.add(TikaCoreProperties.FORMAT.getName(), - MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); - } - } - } else { - // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format'. - metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); - } - } - } - } - - /** - * Try to extract all multilingual items from the XMPSchema - * <p/> - * This relies on the property having a valid xmp getName() - * <p/> - * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295) - * - * @param metadata - * @param property - * @param pdfBoxBaseline - * @param schema - */ - private void extractMultilingualItems(Metadata metadata, Property property, - String pdfBoxBaseline, XMPSchema schema) { - //if schema is null, just go with pdfBoxBaseline - if (schema == null) { - if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { - addMetadata(metadata, property, pdfBoxBaseline); - } - return; - } - - for (String lang : schema.getLanguagePropertyLanguages(property.getName())) { - String value = schema.getLanguageProperty(property.getName(), lang); - - if (value != null && value.length() > 0) { - //if you're going to add it below in the baseline addition, don't add it now - if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) { - continue; - } - addMetadata(metadata, property, value); - if (!property.isMultiValuePermitted()) { - return; - } - } - } - - if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { - //if we've already added something above and multivalue is not permitted - //return. - if (!property.isMultiValuePermitted()) { - if (metadata.get(property) != null) { - return; - } - } - addMetadata(metadata, property, pdfBoxBaseline); - } - } - - - /** - * This tries to read a list from a particular property in - * XMPSchemaDublinCore. - * If it can't find the information, it falls back to the - * pdfboxBaseline. The pdfboxBaseline should be the value - * that pdfbox returns from its PDDocumentInformation object - * (e.g. getAuthor()) This method is designed include the pdfboxBaseline, - * and it should not duplicate the pdfboxBaseline. - * <p/> - * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this - * on dates! - * <p/> - * This relies on the property having a DublinCore compliant getName() - * - * @param property - * @param pdfBoxBaseline - * @param dc - * @param metadata - */ - private void extractDublinCoreListItems(Metadata metadata, Property property, - String pdfBoxBaseline, XMPSchemaDublinCore dc) { - //if no dc, add baseline and return - if (dc == null) { - if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { - addMetadata(metadata, property, pdfBoxBaseline); - } - return; - } - List<String> items = getXMPBagOrSeqList(dc, property.getName()); - if (items == null) { - if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { - addMetadata(metadata, property, pdfBoxBaseline); - } - return; - } - for (String item : items) { - if (pdfBoxBaseline != null && !item.equals(pdfBoxBaseline)) { - addMetadata(metadata, property, item); - } - } - //finally, add the baseline - if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { - addMetadata(metadata, property, pdfBoxBaseline); - } - } - - /** - * As of this writing, XMPSchema can contain bags or sequence lists - * for some attributes...despite standards documentation. - * JempBox expects one or the other for specific attributes. - * Until more flexibility is added to JempBox, Tika will have to handle both. - * - * @param schema - * @param name - * @return list of values or null - */ - private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) { - List<String> ret = schema.getBagList(name); - if (ret == null) { - ret = schema.getSequenceList(name); - } - return ret; - } - - private void addMetadata(Metadata metadata, Property property, String value) { - if (value != null) { - String decoded = decode(value); - if (property.isMultiValuePermitted() || metadata.get(property) == null) { - metadata.add(property, decoded); - } - //silently skip adding property that already exists if multiple values are not permitted - } - } - - private void addMetadata(Metadata metadata, String name, String value) { - if (value != null) { - metadata.add(name, decode(value)); - } - } - - private String decode(String value) { - if (PDFEncodedStringDecoder.shouldDecode(value)) { - PDFEncodedStringDecoder d = new PDFEncodedStringDecoder(); - return d.decode(value); - } - return value; - } - - private void addMetadata(Metadata metadata, String name, Calendar value) { - if (value != null) { - metadata.set(name, value.getTime().toString()); - } - } - - private void addMetadata(Metadata metadata, Property property, Calendar value) { - if (value != null) { - metadata.set(property, value.getTime()); - } - } - - /** - * Used when processing custom metadata entries, as PDFBox won't do - * the conversion for us in the way it does for the standard ones - */ - private void addMetadata(Metadata metadata, String name, COSBase value) { - if (value instanceof COSArray) { - for (Object v : ((COSArray) value).toList()) { - addMetadata(metadata, name, ((COSBase) v)); - } - } else if (value instanceof COSString) { - addMetadata(metadata, name, ((COSString) value).getString()); - } - // Avoid calling COSDictionary#toString, since it can lead to infinite - // recursion. See TIKA-1038 and PDFBOX-1835. - else if (value != null && !(value instanceof COSDictionary)) { - addMetadata(metadata, name, value.toString()); - } - } - - - private boolean shouldHandleXFAOnly(PDDocument pdDocument, PDFParserConfig config) { - if (config.getIfXFAExtractOnlyXFA() && - pdDocument.getDocumentCatalog() != null && - pdDocument.getDocumentCatalog().getAcroForm() != null && - pdDocument.getDocumentCatalog().getAcroForm().getXFA() != null) { - return true; - } - return false; - } - - private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, - Metadata metadata, ParseContext context) - throws SAXException, IOException, TikaException { - XFAExtractor ex = new XFAExtractor(); - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - xhtml.startDocument(); - try (InputStream is = new ByteArrayInputStream( - pdDocument.getDocumentCatalog().getAcroForm().getXFA().getBytes())) { - ex.extract(is, xhtml, metadata, context); - } catch (XMLStreamException e) { - throw new TikaException("XML error in XFA", e); - } - xhtml.endDocument(); - } - - public PDFParserConfig getPDFParserConfig() { - return defaultConfig; - } - - public void setPDFParserConfig(PDFParserConfig config) { - this.defaultConfig = config; - } - - /** - * @see #setEnableAutoSpace(boolean) - * @deprecated use {@link #getPDFParserConfig()} - */ - public boolean getEnableAutoSpace() { - return defaultConfig.getEnableAutoSpace(); - } - - /** - * If true (the default), the parser should estimate - * where spaces should be inserted between words. For - * many PDFs this is necessary as they do not include - * explicit whitespace characters. - * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} - */ - public void setEnableAutoSpace(boolean v) { - defaultConfig.setEnableAutoSpace(v); - } - - /** - * If true, text in annotations will be extracted. - * - * @deprecated use {@link #getPDFParserConfig()} - */ - public boolean getExtractAnnotationText() { - return defaultConfig.getExtractAnnotationText(); - } - - /** - * If true (the default), text in annotations will be - * extracted. - * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} - */ - public void setExtractAnnotationText(boolean v) { - defaultConfig.setExtractAnnotationText(v); - } - - /** - * @see #setSuppressDuplicateOverlappingText(boolean) - * @deprecated use {@link #getPDFParserConfig()} - */ - public boolean getSuppressDuplicateOverlappingText() { - return defaultConfig.getSuppressDuplicateOverlappingText(); - } - - /** - * If true, the parser should try to remove duplicated - * text over the same region. This is needed for some - * PDFs that achieve bolding by re-writing the same - * text in the same area. Note that this can - * slow down extraction substantially (PDFBOX-956) and - * sometimes remove characters that were not in fact - * duplicated (PDFBOX-1155). By default this is disabled. - * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} - */ - public void setSuppressDuplicateOverlappingText(boolean v) { - defaultConfig.setSuppressDuplicateOverlappingText(v); - } - - /** - * @see #setSortByPosition(boolean) - * @deprecated use {@link #getPDFParserConfig()} - */ - public boolean getSortByPosition() { - return defaultConfig.getSortByPosition(); - } - - /** - * If true, sort text tokens by their x/y position - * before extracting text. This may be necessary for - * some PDFs (if the text tokens are not rendered "in - * order"), while for other PDFs it can produce the - * wrong result (for example if there are 2 columns, - * the text will be interleaved). Default is false. - * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} - */ - public void setSortByPosition(boolean v) { - defaultConfig.setSortByPosition(v); - } - - - //can return null! - private Document loadDOM(PDMetadata pdMetadata, ParseContext context) { - if (pdMetadata == null) { - return null; - } - try (InputStream is = pdMetadata.exportXMPMetadata()) { - DocumentBuilder documentBuilder = context.getDocumentBuilder(); - documentBuilder.setErrorHandler((ErrorHandler)null); - return documentBuilder.parse(is); - } catch (IOException|SAXException|TikaException e) { - //swallow - } - return null; - - } - -} http://git-wip-us.apache.org/repos/asf/tika/blob/59e0ca0f/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java deleted file mode 100644 index 296b191..0000000 --- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ /dev/null @@ -1,614 +0,0 @@ -package org.apache.tika.parser.pdf; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.InputStream; -import java.io.Serializable; -import java.util.Locale; -import java.util.Properties; - -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.text.PDFTextStripper; - -/** - * Config for PDFParser. - * <p/> - * This allows parameters to be set programmatically: - * <ol> - * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li> - * <li>Constructor of PDFParser</li> - * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li> - * </ol> - * <p/> - * Parameters can also be set by modifying the PDFParserConfig.properties file, - * which lives in the expected places, in trunk: - * tika-parsers/src/main/resources/org/apache/tika/parser/pdf - * <p/> - * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar: - * org/apache/tika/parser/pdf - */ -public class PDFParserConfig implements Serializable { - - public enum OCR_STRATEGY { - NO_OCR, - OCR_ONLY, - OCR_AND_TEXT_EXTRACTION; - - private static OCR_STRATEGY parse(String s) { - if (s == null) { - return NO_OCR; - } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) { - return NO_OCR; - } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) { - return OCR_ONLY; - } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) { - return OCR_AND_TEXT_EXTRACTION; - } - //default -- no ocr - return NO_OCR; - } - } - - private static final long serialVersionUID = 6492570218190936986L; - - // True if we let PDFBox "guess" where spaces should go: - private boolean enableAutoSpace = true; - - // True if we let PDFBox remove duplicate overlapping text: - private boolean suppressDuplicateOverlappingText; - - // True if we extract annotation text ourselves - // (workaround for PDFBOX-1143): - private boolean extractAnnotationText = true; - - // True if we should sort text tokens by position - // (necessary for some PDFs, but messes up other PDFs): - private boolean sortByPosition = false; - - //True if acroform content should be extracted - private boolean extractAcroFormContent = true; - - //True if inline PDXImage objects should be extracted - private boolean extractInlineImages = false; - - //True if inline images (as identified by their object id within - //a pdf file) should only be extracted once. - private boolean extractUniqueInlineImagesOnly = true; - - //The character width-based tolerance value used to estimate where spaces in text should be added - private Float averageCharTolerance; - - //The space width-based tolerance value used to estimate where spaces in text should be added - private Float spacingTolerance; - - //If the PDF has an XFA element, process only that and skip extracting - //content from elsewhere in the document. - private boolean ifXFAExtractOnlyXFA = false; - - private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.NO_OCR; - - private int ocrDPI = 200; - private ImageType ocrImageType = ImageType.GRAY; - private String ocrImageFormatName = "png"; - - private AccessChecker accessChecker; - - //The PDFParser can throw IOExceptions if there is a problem - //with a streams. If this is set to true, Tika's - //parser catches these exceptions, reports them in the metadata - //and then throws the first stored exception after the parse has completed. - private boolean isCatchIntermediateIOExceptions = true; - - public PDFParserConfig() { - init(this.getClass().getResourceAsStream("PDFParser.properties")); - } - - /** - * Loads properties from InputStream and then tries to close InputStream. - * If there is an IOException, this silently swallows the exception - * and goes back to the default. - * - * @param is - */ - public PDFParserConfig(InputStream is) { - init(is); - } - - //initializes object and then tries to close inputstream - private void init(InputStream is) { - - if (is == null) { - return; - } - Properties props = new Properties(); - try { - props.load(is); - } catch (IOException e) { - } finally { - if (is != null) { - try { - is.close(); - } catch (IOException e) { - //swallow - } - } - } - setEnableAutoSpace( - getBooleanProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); - setSuppressDuplicateOverlappingText( - getBooleanProp(props.getProperty("suppressDuplicateOverlappingText"), - getSuppressDuplicateOverlappingText())); - setExtractAnnotationText( - getBooleanProp(props.getProperty("extractAnnotationText"), - getExtractAnnotationText())); - setSortByPosition( - getBooleanProp(props.getProperty("sortByPosition"), - getSortByPosition())); - setExtractAcroFormContent( - getBooleanProp(props.getProperty("extractAcroFormContent"), - getExtractAcroFormContent())); - setExtractInlineImages( - getBooleanProp(props.getProperty("extractInlineImages"), - getExtractInlineImages())); - setExtractUniqueInlineImagesOnly( - getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"), - getExtractUniqueInlineImagesOnly())); - - setIfXFAExtractOnlyXFA( - getBooleanProp(props.getProperty("ifXFAExtractOnlyXFA"), - getIfXFAExtractOnlyXFA())); - - setCatchIntermediateIOExceptions( - getBooleanProp(props.getProperty("catchIntermediateIOExceptions"), - isCatchIntermediateIOExceptions())); - - setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy"))); - - setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI())); - - setOCRImageFormatName(props.getProperty("ocrImageFormatName")); - - setOCRImageType(parseImageType(props.getProperty("ocrImageType"))); - - - boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false); - boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true); - - if (checkExtractAccessPermission == false) { - //silently ignore the crazy configuration of checkExtractAccessPermission = false, - //but allowExtractionForAccessibility=false - accessChecker = new AccessChecker(); - } else { - accessChecker = new AccessChecker(allowExtractionForAccessibility); - } - } - - /** - * Configures the given pdf2XHTML. - * - * @param pdf2XHTML - */ - public void configure(PDF2XHTML pdf2XHTML) { - pdf2XHTML.setSortByPosition(getSortByPosition()); - if (getEnableAutoSpace()) { - pdf2XHTML.setWordSeparator(" "); - } else { - pdf2XHTML.setWordSeparator(""); - } - if (getAverageCharTolerance() != null) { - pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance()); - } - if (getSpacingTolerance() != null) { - pdf2XHTML.setSpacingTolerance(getSpacingTolerance()); - } - pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText()); - } - - /** - * @see #setExtractAcroFormContent(boolean) - */ - public boolean getExtractAcroFormContent() { - return extractAcroFormContent; - } - - /** - * If true (the default), extract content from AcroForms - * at the end of the document. If an XFA is found, - * try to process that, otherwise, process the AcroForm. - * - * @param extractAcroFormContent - */ - public void setExtractAcroFormContent(boolean extractAcroFormContent) { - this.extractAcroFormContent = extractAcroFormContent; - - } - - /** - * @see #setIfXFAExtractOnlyXFA(boolean) - * @return how to handle XFA data if it exists - */ - public boolean getIfXFAExtractOnlyXFA() { - return ifXFAExtractOnlyXFA; - } - - /** - * If false (the default), extract content from the full PDF - * as well as the XFA form. This will likely lead to some duplicative - * content. - * - * @param ifXFAExtractOnlyXFA - */ - public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) { - this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA; - } - - - /** - * @see #setExtractInlineImages(boolean) - */ - public boolean getExtractInlineImages() { - return extractInlineImages; - } - - /** - * If true, extract inline embedded OBXImages. - * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain - * thousands of embedded images totaling > 2.5 GB. Also, at least as of PDFBox 1.8.5, - * there can be surprisingly large memory consumption and/or out of memory errors. - * Set to <code>true</code> with caution. - * <p/> - * The default is <code>false</code>. - * <p/> - * See also: {@see #setExtractUniqueInlineImagesOnly(boolean)}; - * - * @param extractInlineImages - */ - public void setExtractInlineImages(boolean extractInlineImages) { - this.extractInlineImages = extractInlineImages; - } - - /** - * @see #setExtractUniqueInlineImagesOnly(boolean) - */ - public boolean getExtractUniqueInlineImagesOnly() { - return extractUniqueInlineImagesOnly; - } - - /** - * Multiple pages within a PDF file might refer to the same underlying image. - * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the - * parser will call the EmbeddedExtractor each time the image appears on a page. - * This might be desired for some use cases. However, to avoid duplication of - * extracted images, set this to <code>true</code>. The default is <code>true</code>. - * <p/> - * Note that uniqueness is determined only by the underlying PDF COSObject id, not by - * file hash or similar equality metric. - * If the PDF actually contains multiple copies of the same image - * -- all with different object ids -- then all images will be extracted. - * <p/> - * For this parameter to have any effect, {@link #extractInlineImages} must be - * set to <code>true</code>. - * <p> - * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting - * of this parameter, the extractor will only pull out one copy of each image per - * page. This parameter tries to capture uniqueness across the entire document. - * - * @param extractUniqueInlineImagesOnly - */ - public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { - this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly; - - } - - /** - * @see #setEnableAutoSpace(boolean) - */ - public boolean getEnableAutoSpace() { - return enableAutoSpace; - } - - /** - * If true (the default), the parser should estimate - * where spaces should be inserted between words. For - * many PDFs this is necessary as they do not include - * explicit whitespace characters. - */ - public void setEnableAutoSpace(boolean enableAutoSpace) { - this.enableAutoSpace = enableAutoSpace; - } - - /** - * @see #setSuppressDuplicateOverlappingText(boolean) - */ - public boolean getSuppressDuplicateOverlappingText() { - return suppressDuplicateOverlappingText; - } - - /** - * If true, the parser should try to remove duplicated - * text over the same region. This is needed for some - * PDFs that achieve bolding by re-writing the same - * text in the same area. Note that this can - * slow down extraction substantially (PDFBOX-956) and - * sometimes remove characters that were not in fact - * duplicated (PDFBOX-1155). By default this is disabled. - */ - public void setSuppressDuplicateOverlappingText( - boolean suppressDuplicateOverlappingText) { - this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText; - } - - /** - * @see #setExtractAnnotationText(boolean) - */ - public boolean getExtractAnnotationText() { - return extractAnnotationText; - } - - /** - * If true (the default), text in annotations will be - * extracted. - */ - public void setExtractAnnotationText(boolean extractAnnotationText) { - this.extractAnnotationText = extractAnnotationText; - } - - /** - * @see #setSortByPosition(boolean) - */ - public boolean getSortByPosition() { - return sortByPosition; - } - - /** - * If true, sort text tokens by their x/y position - * before extracting text. This may be necessary for - * some PDFs (if the text tokens are not rendered "in - * order"), while for other PDFs it can produce the - * wrong result (for example if there are 2 columns, - * the text will be interleaved). Default is false. - */ - public void setSortByPosition(boolean sortByPosition) { - this.sortByPosition = sortByPosition; - } - - /** - * @see #setAverageCharTolerance(Float) - */ - public Float getAverageCharTolerance() { - return averageCharTolerance; - } - - /** - * See {@link PDFTextStripper#setAverageCharTolerance(float)} - */ - public void setAverageCharTolerance(Float averageCharTolerance) { - this.averageCharTolerance = averageCharTolerance; - } - - /** - * @see #setSpacingTolerance(Float) - */ - public Float getSpacingTolerance() { - return spacingTolerance; - } - - /** - * See {@link PDFTextStripper#setSpacingTolerance(float)} - */ - public void setSpacingTolerance(Float spacingTolerance) { - this.spacingTolerance = spacingTolerance; - } - - public AccessChecker getAccessChecker() { - return accessChecker; - } - - public void setAccessChecker(AccessChecker accessChecker) { - this.accessChecker = accessChecker; - } - - /** - * See {@link #setCatchIntermediateIOExceptions(boolean)} - * @return whether or not to catch IOExceptions - */ - public boolean isCatchIntermediateIOExceptions() { - return isCatchIntermediateIOExceptions; - } - - /** - * The PDFBox parser will throw an IOException if there is - * a problem with a stream. If this is set to <code>true</code>, - * Tika's PDFParser will catch these exceptions and try to parse - * the rest of the document. After the parse is completed, - * Tika's PDFParser will throw the first caught exception. - * @param catchIntermediateIOExceptions - */ - public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) { - isCatchIntermediateIOExceptions = catchIntermediateIOExceptions; - } - - /** - * Which strategy to use for OCR - * @param ocrStrategy - */ - public void setOCRStrategy(OCR_STRATEGY ocrStrategy) { - this.ocrStrategy = ocrStrategy; - } - - /** - * - * @return strategy to use for OCR - */ - public OCR_STRATEGY getOCRStrategy() { - return ocrStrategy; - } - - private boolean getBooleanProp(String p, boolean defaultMissing) { - if (p == null) { - return defaultMissing; - } - if (p.toLowerCase(Locale.ROOT).equals("true")) { - return true; - } else if (p.toLowerCase(Locale.ROOT).equals("false")) { - return false; - } else { - return defaultMissing; - } - } - //throws NumberFormatException if there's a non-null unparseable - //string passed in - private int getIntProp(String p, int defaultMissing) { - if (p == null) { - return defaultMissing; - } - - return Integer.parseInt(p); - } - - /** - * String representation of the image format used to render - * the page image for OCR (examples: png, tiff, jpeg) - * @return - */ - public String getOCRImageFormatName() { - return ocrImageFormatName; - } - - /** - * @see #getOCRImageFormatName() - * - * @param ocrImageFormatName name of image format used to render - * page image - */ - public void setOCRImageFormatName(String ocrImageFormatName) { - this.ocrImageFormatName = ocrImageFormatName; - } - - /** - * Image type used to render the page image for OCR. - * @see #setOCRImageType(ImageType) - * @return image type - */ - public ImageType getOCRImageType() { - return ocrImageType; - } - - /** - * Image type used to render the page image for OCR. - * @param ocrImageType - */ - public void setOCRImageType(ImageType ocrImageType) { - this.ocrImageType = ocrImageType; - } - - /** - * Dots per inch used to render the page image for OCR - * @return dots per inch - */ - public int getOCRDPI() { - return ocrDPI; - } - - /** - * Dots per inche used to render the page image for OCR - * @param ocrDPI - */ - public void setOCRDPI(int ocrDPI) { - this.ocrDPI = ocrDPI; - } - - private ImageType parseImageType(String ocrImageType) { - for (ImageType t : ImageType.values()) { - if (ocrImageType.equalsIgnoreCase(t.toString())) { - return t; - } - } - return null; - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof PDFParserConfig)) return false; - - PDFParserConfig config = (PDFParserConfig) o; - - if (getEnableAutoSpace() != config.getEnableAutoSpace()) return false; - if (getSuppressDuplicateOverlappingText() != config.getSuppressDuplicateOverlappingText()) return false; - if (getExtractAnnotationText() != config.getExtractAnnotationText()) return false; - if (getSortByPosition() != config.getSortByPosition()) return false; - if (getExtractAcroFormContent() != config.getExtractAcroFormContent()) return false; - if (getExtractInlineImages() != config.getExtractInlineImages()) return false; - if (getExtractUniqueInlineImagesOnly() != config.getExtractUniqueInlineImagesOnly()) return false; - if (getIfXFAExtractOnlyXFA() != config.getIfXFAExtractOnlyXFA()) return false; - if (getOCRDPI() != config.getOCRDPI()) return false; - if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) return false; - if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false; - if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false; - if (!getOCRStrategy().equals(config.getOCRStrategy())) return false; - if (getOCRImageType() != config.getOCRImageType()) return false; - if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false; - return getAccessChecker().equals(config.getAccessChecker()); - - } - - @Override - public int hashCode() { - int result = (getEnableAutoSpace() ? 1 : 0); - result = 31 * result + (getSuppressDuplicateOverlappingText() ? 1 : 0); - result = 31 * result + (getExtractAnnotationText() ? 1 : 0); - result = 31 * result + (getSortByPosition() ? 1 : 0); - result = 31 * result + (getExtractAcroFormContent() ? 1 : 0); - result = 31 * result + (getExtractInlineImages() ? 1 : 0); - result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0); - result = 31 * result + getAverageCharTolerance().hashCode(); - result = 31 * result + getSpacingTolerance().hashCode(); - result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0); - result = 31 * result + ocrStrategy.hashCode(); - result = 31 * result + getOCRDPI(); - result = 31 * result + getOCRImageType().hashCode(); - result = 31 * result + getOCRImageFormatName().hashCode(); - result = 31 * result + getAccessChecker().hashCode(); - result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0); - return result; - } - - @Override - public String toString() { - return "PDFParserConfig{" + - "enableAutoSpace=" + enableAutoSpace + - ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText + - ", extractAnnotationText=" + extractAnnotationText + - ", sortByPosition=" + sortByPosition + - ", extractAcroFormContent=" + extractAcroFormContent + - ", extractInlineImages=" + extractInlineImages + - ", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly + - ", averageCharTolerance=" + averageCharTolerance + - ", spacingTolerance=" + spacingTolerance + - ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + - ", ocrStrategy=" + ocrStrategy + - ", ocrDPI=" + ocrDPI + - ", ocrImageType=" + ocrImageType + - ", ocrImageFormatName='" + ocrImageFormatName + '\'' + - ", accessChecker=" + accessChecker + - ", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions + - '}'; - } -}
