Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,721 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.pdf;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDCcitt;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * Utility class that overrides the {@link PDFTextStripper} functionality
 * to produce a semi-structured XHTML SAX events instead of a plain text
 * stream.
 * <p>
 * Instances are created only through
 * {@link #process(PDDocument, ContentHandler, ParseContext, Metadata, PDFParserConfig)};
 * each instance carries per-document state (image counter, processed image map).
 */
class PDF2XHTML extends PDFTextStripper {

    /**
     * Maximum recursive depth during AcroForm processing.
     * Prevents theoretical AcroForm recursion bomb.
     */
    private static final int MAX_ACROFORM_RECURSIONS = 10;

    /**
     * Format used for signature dates
     * TODO Make this thread-safe
     */
    private final SimpleDateFormat dateFormat =
            new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);

    /** The handler as supplied by the caller, before it was wrapped for XHTML output. */
    private final ContentHandler originalHandler;
    private final ParseContext context;
    /** XHTML wrapper around {@link #originalHandler}; all markup is written through it. */
    private final XHTMLContentHandler handler;
    private final PDFParserConfig config;

    /**
     * This keeps track of the pdf object ids for inline
     * images that have been processed.
     * If {@link PDFParserConfig#getExtractUniqueInlineImagesOnly()}
     * is true, this will be checked before extracting an embedded image.
     * The integer keeps track of the inlineImageCounter for that image.
     * This integer is used to identify images in the markup.
     * <p>
     * This is used across the document.  To avoid infinite recursion
     * TIKA-1742, we're limiting the export to one image per page.
     */
    private final Map<String, Integer> processedInlineImages = new HashMap<>();
    private int inlineImageCounter = 0;

    private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
                      PDFParserConfig config)
            throws IOException {
        //source of config (derives from context or PDFParser?) is
        //already determined in PDFParser.  No need to check context here.
        this.config = config;
        this.originalHandler = handler;
        this.context = context;
        this.handler = new XHTMLContentHandler(handler, metadata);
    }

    /**
     * Converts the given PDF document (and related metadata) to a stream
     * of XHTML SAX events sent to the given content handler.
     *
     * @param document PDF document
     * @param handler  SAX content handler
     * @param context  parse context; consulted for an {@link EmbeddedDocumentExtractor}
     * @param metadata PDF metadata
     * @param config   parser configuration; it is applied to the stripper via
     *                 {@code config.configure(...)} and read for the extraction switches
     * @throws SAXException  if the content handler fails to process SAX events
     * @throws TikaException if the PDF document can not be processed
     */
    public static void process(
            PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
            PDFParserConfig config)
            throws SAXException, TikaException {
        try {
            // Extract text using a dummy Writer as we override the
            // key methods to output to the given content
            // handler.
            PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata, config);

            config.configure(pdf2XHTML);

            pdf2XHTML.writeText(document, new Writer() {
                @Override
                public void write(char[] cbuf, int off, int len) {
                }

                @Override
                public void flush() {
                }

                @Override
                public void close() {
                }
            });

        } catch (IOException e) {
            // The overridden callbacks tunnel SAXExceptions through IOException;
            // unwrap them so callers see the original failure.
            if (e.getCause() instanceof SAXException) {
                throw (SAXException) e.getCause();
            } else {
                throw new TikaException("Unable to extract PDF content", e);
            }
        }
    }

    /**
     * Writes the document outline (bookmarks), if there is one, as nested lists.
     */
    void extractBookmarkText() throws SAXException {
        PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
        if (outline != null) {
            extractBookmarkText(outline);
        }
    }

    /**
     * Writes the children of the given outline node as a {@code <ul>} of titles,
     * recursing into each child.  Nothing is written when the node has no children.
     */
    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
        PDOutlineItem current = bookmark.getFirstChild();
        if (current == null) {
            return;
        }
        handler.startElement("ul");
        while (current != null) {
            handler.startElement("li");
            handler.characters(current.getTitle());
            handler.endElement("li");
            // Recurse:
            extractBookmarkText(current);
            current = current.getNextSibling();
        }
        handler.endElement("ul");
    }

    @Override
    protected void startDocument(PDDocument pdf) throws IOException {
        try {
            handler.startDocument();
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to start a document", e);
        }
    }

    @Override
    protected void endDocument(PDDocument pdf) throws IOException {
        try {
            // Extract text for any bookmarks:
            extractBookmarkText();
            extractEmbeddedDocuments(pdf, originalHandler);

            //extract acroform data at end of doc
            if (config.getExtractAcroFormContent()) {
                extractAcroForm(pdf, handler);
            }
            handler.endDocument();
        } catch (TikaException | SAXException e) {
            throw new IOExceptionWithCause("Unable to end a document", e);
        }
    }

    @Override
    protected void startPage(PDPage page) throws IOException {
        try {
            handler.startElement("div", "class", "page");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to start a page", e);
        }
        writeParagraphStart();
    }

    /**
     * Closes the current paragraph, then writes the page's images, file
     * attachments and (when enabled in the config) link and markup
     * annotations, and finally closes the page {@code <div>}.
     */
    @Override
    protected void endPage(PDPage page) throws IOException {
        try {
            writeParagraphEnd();

            extractImages(page.getResources(), new HashSet<COSBase>());

            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
            for (PDAnnotation annotation : page.getAnnotations()) {

                if (annotation instanceof PDAnnotationFileAttachment) {
                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                    // Only a complex (dictionary based) specification exposes the embedded
                    // file streams read below.  Anything else is skipped instead of
                    // being cast blindly.
                    Object fileSpec = fann.getFile();
                    if (fileSpec instanceof PDComplexFileSpecification) {
                        try {
                            extractMultiOSPDEmbeddedFiles("",
                                    (PDComplexFileSpecification) fileSpec, extractor);
                        } catch (SAXException e) {
                            throw new IOExceptionWithCause(
                                    "file embedded in annotation sax exception", e);
                        } catch (TikaException e) {
                            throw new IOExceptionWithCause(
                                    "file embedded in annotation tika exception", e);
                        }
                    }
                }
                // TODO: remove once PDFBOX-1143 is fixed:
                if (config.getExtractAnnotationText()) {
                    if (annotation instanceof PDAnnotationLink) {
                        PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
                        PDAction action = annotationlink.getAction();
                        if (action instanceof PDActionURI) {
                            String link = ((PDActionURI) action).getURI();
                            if (link != null) {
                                handler.startElement("div", "class", "annotation");
                                handler.startElement("a", "href", link);
                                handler.endElement("a");
                                handler.endElement("div");
                            }
                        }
                    }

                    if (annotation instanceof PDAnnotationMarkup) {
                        PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                        String title = annotationMarkup.getTitlePopup();
                        String subject = annotationMarkup.getSubject();
                        String contents = annotationMarkup.getContents();
                        // TODO: maybe also annotationMarkup.getRichContents()?
                        if (title != null || subject != null || contents != null) {
                            handler.startElement("div", "class", "annotation");

                            if (title != null) {
                                handler.startElement("div", "class", "annotationTitle");
                                handler.characters(title);
                                handler.endElement("div");
                            }

                            if (subject != null) {
                                handler.startElement("div", "class", "annotationSubject");
                                handler.characters(subject);
                                handler.endElement("div");
                            }

                            if (contents != null) {
                                handler.startElement("div", "class", "annotationContents");
                                handler.characters(contents);
                                handler.endElement("div");
                            }

                            handler.endElement("div");
                        }
                    }
                }
            }

            handler.endElement("div");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to end a page", e);
        }
        page.clear();
    }

    /**
     * Writes an {@code <img>} tag for every image XObject reachable from the
     * given resources (recursing into form XObjects) and hands the image bytes
     * to the {@link EmbeddedDocumentExtractor}.  Does nothing when inline image
     * extraction is disabled in the config.
     *
     * @param resources    resources to scan; may be {@code null}
     * @param seenThisPage COS objects already visited on the current page; used
     *                     to break reference cycles (TIKA-1742)
     */
    private void extractImages(PDResources resources, Set<COSBase> seenThisPage)
            throws SAXException {
        if (resources == null || !config.getExtractInlineImages()) {
            return;
        }

        Map<String, PDXObject> xObjects = resources.getXObjects();
        if (xObjects == null) {
            return;
        }

        for (Map.Entry<String, PDXObject> entry : xObjects.entrySet()) {

            PDXObject object = entry.getValue();
            if (object == null) {
                continue;
            }
            COSBase cosObject = object.getCOSObject();
            if (seenThisPage.contains(cosObject)) {
                //avoid infinite recursion TIKA-1742
                continue;
            }
            seenThisPage.add(cosObject);

            if (object instanceof PDXObjectForm) {
                extractImages(((PDXObjectForm) object).getResources(), seenThisPage);
            } else if (object instanceof PDXObjectImage) {

                PDXObjectImage image = (PDXObjectImage) object;

                Metadata metadata = new Metadata();
                String extension = "";
                if (image instanceof PDJpeg) {
                    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
                    extension = ".jpg";
                } else if (image instanceof PDCcitt) {
                    metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
                    extension = ".tif";
                } else if (image instanceof PDPixelMap) {
                    metadata.set(Metadata.CONTENT_TYPE, "image/png");
                    extension = ".png";
                }

                // NOTE(review): the map is keyed by the XObject's name in its resource
                // dictionary (entry.getKey()); confirm those names identify an image
                // uniquely across the whole document.
                Integer imageNumber = processedInlineImages.get(entry.getKey());
                if (imageNumber == null) {
                    imageNumber = inlineImageCounter++;
                }
                String fileName = "image" + imageNumber + extension;
                metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

                // Output the img tag
                AttributesImpl attr = new AttributesImpl();
                attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
                attr.addAttribute("", "alt", "alt", "CDATA", fileName);
                handler.startElement("img", attr);
                handler.endElement("img");

                //Do we only want to process unique COSObject ids?
                //If so, have we already processed this one?
                if (config.getExtractUniqueInlineImagesOnly()) {
                    String cosObjectId = entry.getKey();
                    if (processedInlineImages.containsKey(cosObjectId)) {
                        continue;
                    }
                    processedInlineImages.put(cosObjectId, imageNumber);
                }

                metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                        TikaCoreProperties.EmbeddedResourceType.INLINE.toString());

                EmbeddedDocumentExtractor extractor =
                        getEmbeddedDocumentExtractor();
                if (extractor.shouldParseEmbedded(metadata)) {
                    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                    try {
                        image.write2OutputStream(buffer);
                        image.clear();
                        extractor.parseEmbedded(
                                new ByteArrayInputStream(buffer.toByteArray()),
                                new EmbeddedContentHandler(handler),
                                metadata, false);
                    } catch (IOException ignored) {
                        // could not extract this image, so just skip it...
                    }
                }
            }
        }
        resources.clear();
    }

    /**
     * Returns the {@link EmbeddedDocumentExtractor} registered on the parse
     * context, or a new {@link ParsingEmbeddedDocumentExtractor} if none is set.
     */
    protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
        EmbeddedDocumentExtractor extractor =
                context.get(EmbeddedDocumentExtractor.class);
        if (extractor == null) {
            extractor = new ParsingEmbeddedDocumentExtractor(context);
        }
        return extractor;
    }

    @Override
    protected void writeParagraphStart() throws IOException {
        super.writeParagraphStart();
        try {
            handler.startElement("p");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to start a paragraph", e);
        }
    }

    @Override
    protected void writeParagraphEnd() throws IOException {
        super.writeParagraphEnd();
        try {
            handler.endElement("p");
        } catch (SAXException e) {
            throw new IOExceptionWithCause("Unable to end a paragraph", e);
        }
    }

    @Override
    protected void writeString(String text) throws IOException {
        try {
            handler.characters(text);
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a string: " + text, e);
        }
    }

    @Override
    protected void writeCharacters(TextPosition text) throws IOException {
        try {
            handler.characters(text.getCharacter());
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a character: " + text.getCharacter(), e);
        }
    }

    @Override
    protected void writeWordSeparator() throws IOException {
        try {
            handler.characters(getWordSeparator());
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a space character", e);
        }
    }

    @Override
    protected void writeLineSeparator() throws IOException {
        try {
            handler.newline();
        } catch (SAXException e) {
            throw new IOExceptionWithCause(
                    "Unable to write a newline character", e);
        }
    }

    /**
     * Extracts the files listed in the catalog's embedded-files name tree,
     * looking at the root node's names or, failing that, at the names of its
     * direct kids.
     * <p>
     * Note that the {@code handler} parameter is not read; output goes to the
     * XHTML handler held by this instance.
     */
    private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
            throws IOException, SAXException, TikaException {
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDDocumentNameDictionary names = catalog.getNames();
        if (names == null) {
            return;
        }
        PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();

        if (embeddedFiles == null) {
            return;
        }

        Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
        //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
        //If there is a need we could add a fully recursive search to find a non-null
        //Map<String, COSObjectable> that contains the doc info.
        if (embeddedFileNames != null) {
            processEmbeddedDocNames(embeddedFileNames);
        } else {
            List<PDNameTreeNode> kids = embeddedFiles.getKids();
            if (kids == null) {
                return;
            }
            for (PDNameTreeNode n : kids) {
                Map<String, COSObjectable> childNames = n.getNames();
                if (childNames != null) {
                    processEmbeddedDocNames(childNames);
                }
            }
        }
    }


    /**
     * Extracts every file specification in the given name map, using the map
     * key as the default file name.
     */
    private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames)
            throws IOException, SAXException, TikaException {
        if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
            return;
        }

        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
        for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) {
            PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
        }
    }

    /**
     * Extracts the generic, Mac, DOS and Unix variants of an embedded file
     * specification; variants that are absent are skipped.
     *
     * @param defaultName name to use when a variant carries no file name
     * @param spec        file specification; {@code null} is ignored
     * @param extractor   extractor that receives the file contents
     */
    private void extractMultiOSPDEmbeddedFiles(String defaultName,
                                               PDComplexFileSpecification spec,
                                               EmbeddedDocumentExtractor extractor)
            throws IOException, SAXException, TikaException {

        if (spec == null) {
            return;
        }
        //current strategy is to pull all, not just first non-null
        extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
        extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
        extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
    }

    /**
     * Passes one embedded file to the extractor and then writes an empty
     * {@code <div class="embedded" id="fileName"/>} marker.
     *
     * @param defaultName name used when {@code fileName} is {@code null}
     * @param fileName    file name from the specification; may be {@code null}
     * @param file        embedded file; {@code null} is skipped silently
     * @param extractor   extractor that receives the file contents
     */
    private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
                                       EmbeddedDocumentExtractor extractor)
            throws SAXException, IOException, TikaException {

        if (file == null) {
            //skip silently
            return;
        }

        fileName = (fileName == null) ? defaultName : fileName;

        // TODO: other metadata?
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());

        if (extractor.shouldParseEmbedded(metadata)) {
            TikaInputStream stream = null;
            try {
                stream = TikaInputStream.get(file.createInputStream());
                extractor.parseEmbedded(
                        stream,
                        new EmbeddedContentHandler(handler),
                        metadata, false);

                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", fileName);
                handler.startElement("div", attributes);
                handler.endElement("div");
            } finally {
                IOUtils.closeQuietly(stream);
            }
        }
    }

    /**
     * Writes the document's AcroForm fields as an ordered list inside
     * {@code <div class="acroform">}.  Nothing is written when the document has
     * no catalog, no form, or a {@code null} field list.
     */
    private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException,
            SAXException {
        //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
        //this code derives from Ben's code
        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
        if (catalog == null) {
            return;
        }

        PDAcroForm form = catalog.getAcroForm();
        if (form == null) {
            return;
        }

        List<?> fields = form.getFields();
        if (fields == null) {
            return;
        }

        handler.startElement("div", "class", "acroform");
        handler.startElement("ol");

        for (Object obj : fields) {
            if (obj instanceof PDField) {
                processAcroField((PDField) obj, handler, 0);
            }
        }
        handler.endElement("ol");
        handler.endElement("div");
    }

    /**
     * Writes one form field and, in a nested list, its kids.
     *
     * @param field                 field to write
     * @param handler               output handler
     * @param currentRecursiveDepth depth of this call; processing stops once it
     *                              reaches {@link #MAX_ACROFORM_RECURSIONS}
     */
    private void processAcroField(PDField field, XHTMLContentHandler handler,
                                  final int currentRecursiveDepth)
            throws SAXException, IOException {

        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
            return;
        }

        addFieldString(field, handler);

        List<COSObjectable> kids = field.getKids();
        if (kids != null) {

            int r = currentRecursiveDepth + 1;
            handler.startElement("ol");
            //TODO: can generate <ol/>. Rework to avoid that.
            for (COSObjectable pdfObj : kids) {
                if (pdfObj instanceof PDField) {
                    //recurse
                    processAcroField((PDField) pdfObj, handler, r);
                }
            }
            handler.endElement("ol");
        }
    }

    /**
     * Writes a single field as {@code <li altName="...">partialName: value</li>}.
     * Signature fields are delegated to
     * {@link #handleSignature(AttributesImpl, PDSignatureField, XHTMLContentHandler)}.
     * Nothing is written when there is neither an attribute nor any text.
     */
    private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException {
        //Pick partial name to present in content and altName for attribute
        //Ignoring FullyQualifiedName for now
        String partName = field.getPartialName();
        String altName = field.getAlternateFieldName();

        StringBuilder sb = new StringBuilder();
        AttributesImpl attrs = new AttributesImpl();

        if (partName != null) {
            sb.append(partName).append(": ");
        }
        if (altName != null) {
            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
        }
        //return early if PDSignature field
        if (field instanceof PDSignatureField) {
            handleSignature(attrs, (PDSignatureField) field, handler);
            return;
        }
        try {
            //getValue can throw an IOException if there is no value
            String value = field.getValue();
            if (value != null && !value.equals("null")) {
                sb.append(value);
            }
        } catch (IOException ignored) {
            //deliberately ignored: a field without a value is written without one
        }

        if (attrs.getLength() > 0 || sb.length() > 0) {
            handler.startElement("li", attrs);
            handler.characters(sb.toString());
            handler.endElement("li");
        }
    }

    /**
     * Writes the name, contact info, location, reason and signing date of a
     * signature field as a nested {@code <ol type="signaturedata">}.  Entries
     * that are {@code null} or empty are left out, and nothing at all is
     * written when the field has no signature or every entry is empty.
     *
     * @param parentAttributes attributes for the enclosing {@code <li>}
     * @param sigField         signature field to describe
     * @param handler          output handler
     */
    private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField,
                                 XHTMLContentHandler handler) throws SAXException {

        PDSignature sig = sigField.getSignature();
        if (sig == null) {
            return;
        }
        Map<String, String> vals = new TreeMap<String, String>();
        vals.put("name", sig.getName());
        vals.put("contactInfo", sig.getContactInfo());
        vals.put("location", sig.getLocation());
        vals.put("reason", sig.getReason());

        Calendar cal = sig.getSignDate();
        if (cal != null) {
            dateFormat.setTimeZone(cal.getTimeZone());
            vals.put("date", dateFormat.format(cal.getTime()));
        }

        //see if there is any data; the values, not the (always present) keys, decide
        boolean hasData = false;
        for (String val : vals.values()) {
            if (val != null && !val.equals("")) {
                hasData = true;
                break;
            }
        }
        if (!hasData) {
            return;
        }

        //there is, so process it
        handler.startElement("li", parentAttributes);

        AttributesImpl attrs = new AttributesImpl();
        attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");

        handler.startElement("ol", attrs);
        for (Map.Entry<String, String> e : vals.entrySet()) {
            if (e.getValue() == null || e.getValue().equals("")) {
                continue;
            }
            attrs = new AttributesImpl();
            attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
            handler.startElement("li", attrs);
            handler.characters(e.getValue());
            handler.endElement("li");
        }
        handler.endElement("ol");
        handler.endElement("li");
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,117 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.parser.pdf;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.BaseParser;

import static java.nio.charset.StandardCharsets.ISO_8859_1;

/**
 * In fairly rare cases, a PDF's XMP will contain a string that
 * has incorrectly been encoded with PDFEncoding: an octal for non-ascii and
 * ascii for ascii, e.g. "\376\377\000M\000i\000c\000r\000o\000s\000o\000f\000t\000"
 * <p>
 * This class can be used to decode those strings.
 * <p>
 * See TIKA-1678.  Many thanks to Andrew Jackson for raising this issue
 * and Tilman Hausherr for the solution.
 * <p>
 * As of this writing, we are only handling strings that start with
 * an encoded BOM.  Andrew Jackson found a handful of other examples (e.g.
 * this ISO-8859-7 string:
 * "Microsoft Word - \\323\\365\\354\\354\\345\\364\\357\\367\\336
 * \\364\\347\\362 PRAKSIS \\363\\364\\357")
 * that we aren't currently handling.
 */
class PDFEncodedStringDecoder {

    /** Octal-escaped byte order marks, as they appear literally in the text. */
    private static final String[] PDF_ENCODING_BOMS = {
            "\\376\\377", //UTF-16BE
            "\\377\\376", //UTF-16LE
            "\\357\\273\\277"//UTF-8
    };

    /**
     * Does this string contain an octal-encoded UTF BOM?
     * Call this statically to determine if you should bother creating a new parser to parse it.
     *
     * @param s candidate string; may be {@code null}
     * @return {@code true} if {@code s} is at least 8 characters long and starts
     *         with one of the octal-escaped BOMs, {@code false} otherwise
     */
    static boolean shouldDecode(String s) {
        // 8 characters is the length of the shortest escaped BOM ("\376\377")
        if (s == null || s.length() < 8) {
            return false;
        }
        for (String bom : PDF_ENCODING_BOMS) {
            if (s.startsWith(bom)) {
                return true;
            }
        }
        return false;
    }

    /**
     * This assumes that {@link #shouldDecode(String)} has been called
     * and has returned true.  If you run this on a non-octal encoded string,
     * disaster will happen!
     *
     * @param value octal-escaped string to decode
     * @return the decoded string, or {@code value} itself if parsing failed
     */
    String decode(String value) {
        try {
            // Wrap in parentheses so the text reads as a PDF literal string.
            byte[] bytes = ("(" + value + ")").getBytes(ISO_8859_1);
            InputStream is = new ByteArrayInputStream(bytes);
            COSStringParser p = new COSStringParser(is);
            String parsed = p.myParseCOSString();
            if (parsed != null) {
                return parsed;
            }
        } catch (IOException ignored) {
            //oh well, we tried; fall through and return the input unchanged
        }
        //just return value if something went wrong
        return value;
    }

    /**
     * Thin {@link BaseParser} subclass that exposes COS string parsing.
     * Declared static because it never touches the enclosing decoder instance.
     */
    static class COSStringParser extends BaseParser {

        COSStringParser(InputStream buffer) throws IOException {
            super(buffer);
        }

        /**
         * Parses the stream handed to the constructor as a single COS string.
         *
         * @return parsed string or null if something went wrong.
         */
        String myParseCOSString() {
            try {
                COSString cosString = parseCOSString();
                if (cosString != null) {
                    return cosString.getString();
                }
            } catch (IOException ignored) {
                //reported to the caller as null
            }
            return null;
        }
    }
}
Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,609 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.pdf; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.Calendar; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.jempbox.xmp.XMPSchema; +import org.apache.jempbox.xmp.XMPSchemaDublinCore; +import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; +import org.apache.pdfbox.exceptions.CryptographyException; +import org.apache.pdfbox.io.RandomAccess; +import org.apache.pdfbox.io.RandomAccessBuffer; +import org.apache.pdfbox.io.RandomAccessFile; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.encryption.AccessPermission; +import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.tika.exception.EncryptedDocumentException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.AccessPermissions; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.PagedText; +import org.apache.tika.metadata.Property; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.PasswordProvider; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * PDF parser. 
+ * <p/> + * This parser can process also encrypted PDF documents if the required + * password is given as a part of the input metadata associated with a + * document. If no password is given, then this parser will try decrypting + * the document using the empty password that's often used with PDFs. If + * the PDF contains any embedded documents (for example as part of a PDF + * package) then this parser will use the {@link EmbeddedDocumentExtractor} + * to handle them. + * <p/> + * As of Tika 1.6, it is possible to extract inline images with + * the {@link EmbeddedDocumentExtractor} as if they were regular + * attachments. By default, this feature is turned off because of + * the potentially enormous number and size of inline images. To + * turn this feature on, see + * {@link PDFParserConfig#setExtractInlineImages(boolean)}. + */ +public class PDFParser extends AbstractParser { + + + /** + * Metadata key for giving the document password to the parser. + * + * @since Apache Tika 0.5 + * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead + */ + public static final String PASSWORD = "org.apache.tika.parser.pdf.password"; + private static final MediaType MEDIA_TYPE = MediaType.application("pdf"); + /** + * Serial version UID + */ + private static final long serialVersionUID = -752276948656079347L; + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MEDIA_TYPE); + private PDFParserConfig defaultConfig = new PDFParserConfig(); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + public void parse( + InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + PDDocument pdfDocument = null; + TemporaryResources tmp = new TemporaryResources(); + //config from context, or default if not set via context + PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig); + 
String password = ""; + try { + // PDFBox can process entirely in memory, or can use a temp file + // for unpacked / processed resources + // Decide which to do based on if we're reading from a file or not already + TikaInputStream tstream = TikaInputStream.cast(stream); + password = getPassword(metadata, context); + if (tstream != null && tstream.hasFile()) { + // File based, take that as a cue to use a temporary file + RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); + if (localConfig.getUseNonSequentialParser() == true) { + pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password); + } else { + pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); + } + } else { + // Go for the normal, stream based in-memory parsing + if (localConfig.getUseNonSequentialParser() == true) { + pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer(), password); + } else { + pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); + } + } + metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); + + //if using the classic parser and the doc is encrypted, we must manually decrypt + if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { + pdfDocument.decrypt(password); + } + + metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); + extractMetadata(pdfDocument, metadata); + + AccessChecker checker = localConfig.getAccessChecker(); + checker.check(metadata); + if (handler != null) { + PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); + } + + } catch (CryptographyException e) { + //seq parser throws CryptographyException for bad password + throw new EncryptedDocumentException(e); + } catch (IOException e) { + //nonseq parser throws IOException for bad password + //At the Tika level, we want the same exception to be thrown + if (e.getMessage() != null && + 
e.getMessage().contains("Error (CryptographyException)")) { + metadata.set("pdf:encrypted", Boolean.toString(true)); + throw new EncryptedDocumentException(e); + } + //rethrow any other IOExceptions + throw e; + } finally { + if (pdfDocument != null) { + pdfDocument.close(); + } + tmp.dispose(); + //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200) + PDFont.clearResources(); + } + } + + private String getPassword(Metadata metadata, ParseContext context) { + String password = null; + + // Did they supply a new style Password Provider? + PasswordProvider passwordProvider = context.get(PasswordProvider.class); + if (passwordProvider != null) { + password = passwordProvider.getPassword(metadata); + } + + // Fall back on the old style metadata if set + if (password == null && metadata.get(PASSWORD) != null) { + password = metadata.get(PASSWORD); + } + + // If no password is given, use an empty string as the default + if (password == null) { + password = ""; + } + return password; + } + + + private void extractMetadata(PDDocument document, Metadata metadata) + throws TikaException { + + //first extract AccessPermissions + AccessPermission ap = document.getCurrentAccessPermission(); + metadata.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, + Boolean.toString(ap.canExtractForAccessibility())); + metadata.set(AccessPermissions.EXTRACT_CONTENT, + Boolean.toString(ap.canExtractContent())); + metadata.set(AccessPermissions.ASSEMBLE_DOCUMENT, + Boolean.toString(ap.canAssembleDocument())); + metadata.set(AccessPermissions.FILL_IN_FORM, + Boolean.toString(ap.canFillInForm())); + metadata.set(AccessPermissions.CAN_MODIFY, + Boolean.toString(ap.canModify())); + metadata.set(AccessPermissions.CAN_MODIFY_ANNOTATIONS, + Boolean.toString(ap.canModifyAnnotations())); + metadata.set(AccessPermissions.CAN_PRINT, + Boolean.toString(ap.canPrint())); + metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, + Boolean.toString(ap.canPrintDegraded())); + + + //now go for the XMP 
stuff + org.apache.jempbox.xmp.XMPMetadata xmp = null; + XMPSchemaDublinCore dcSchema = null; + try { + if (document.getDocumentCatalog().getMetadata() != null) { + xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); + } + if (xmp != null) { + dcSchema = xmp.getDublinCoreSchema(); + } + } catch (IOException e) { + //swallow + } + PDDocumentInformation info = document.getDocumentInformation(); + metadata.set(PagedText.N_PAGES, document.getNumberOfPages()); + extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema); + extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, info.getAuthor(), dcSchema); + extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, null, dcSchema); + addMetadata(metadata, TikaCoreProperties.CREATOR_TOOL, info.getCreator()); + addMetadata(metadata, TikaCoreProperties.KEYWORDS, info.getKeywords()); + addMetadata(metadata, "producer", info.getProducer()); + extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema); + + // TODO: Move to description in Tika 2.0 + addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); + addMetadata(metadata, "trapped", info.getTrapped()); + try { + // TODO Remove these in Tika 2.0 + addMetadata(metadata, "created", info.getCreationDate()); + addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); + } catch (IOException e) { + // Invalid date format, just ignore + } + try { + Calendar modified = info.getModificationDate(); + addMetadata(metadata, Metadata.LAST_MODIFIED, modified); + addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); + } catch (IOException e) { + // Invalid date format, just ignore + } + + // All remaining metadata is custom + // Copy this over as-is + List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", + "Keywords", "Producer", "Subject", "Title", "Trapped"); + for (COSName key : 
info.getDictionary().keySet()) { + String name = key.getName(); + if (!handledMetadata.contains(name)) { + addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); + } + } + + //try to get the various versions + //Caveats: + // there is currently a fair amount of redundancy + // TikaCoreProperties.FORMAT can be multivalued + // There are also three potential pdf specific version keys: pdf:PDFVersion, pdfa:PDFVersion, pdf:PDFExtensionVersion + metadata.set("pdf:PDFVersion", Float.toString(document.getDocument().getVersion())); + metadata.add(TikaCoreProperties.FORMAT.getName(), + MEDIA_TYPE.toString() + "; version=" + + Float.toString(document.getDocument().getVersion())); + + try { + if (xmp != null) { + xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class); + XMPSchemaPDFAId pdfaxmp = (XMPSchemaPDFAId) xmp.getSchemaByClass(XMPSchemaPDFAId.class); + if (pdfaxmp != null) { + if (pdfaxmp.getPart() != null) { + metadata.set("pdfaid:part", Integer.toString(pdfaxmp.getPart())); + } + if (pdfaxmp.getConformance() != null) { + metadata.set("pdfaid:conformance", pdfaxmp.getConformance()); + String version = "A-" + pdfaxmp.getPart() + pdfaxmp.getConformance().toLowerCase(Locale.ROOT); + metadata.set("pdfa:PDFVersion", version); + metadata.add(TikaCoreProperties.FORMAT.getName(), + MEDIA_TYPE.toString() + "; version=\"" + version + "\""); + } + } + // TODO WARN if this XMP version is inconsistent with document header version? + } + } catch (IOException e) { + metadata.set(TikaCoreProperties.TIKA_META_PREFIX + "pdf:metadata-xmp-parse-failed", "" + e); + } + //TODO: Let's try to move this into PDFBox. 
+ //Attempt to determine Adobe extension level, if present: + COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); + COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); + if (extensions != null) { + for (COSName extName : extensions.keySet()) { + // If it's an Adobe one, interpret it to determine the extension level: + if (extName.equals(COSName.getPDFName("ADBE"))) { + COSDictionary adobeExt = (COSDictionary) extensions.getDictionaryObject(extName); + if (adobeExt != null) { + String baseVersion = adobeExt.getNameAsString(COSName.getPDFName("BaseVersion")); + int el = adobeExt.getInt(COSName.getPDFName("ExtensionLevel")); + //-1 is sentinel value that something went wrong in getInt + if (el != -1) { + metadata.set("pdf:PDFExtensionVersion", baseVersion + " Adobe Extension Level " + el); + metadata.add(TikaCoreProperties.FORMAT.getName(), + MEDIA_TYPE.toString() + "; version=\"" + baseVersion + " Adobe Extension Level " + el + "\""); + } + } + } else { + // WARN that there is an Extension, but it's not Adobe's, and so is a 'new' format.
+ metadata.set("pdf:foundNonAdobeExtensionName", extName.getName()); + } + } + } + } + + /** + * Try to extract all multilingual items from the XMPSchema + * <p/> + * This relies on the property having a valid xmp getName() + * <p/> + * For now, this only extracts the first language if the property does not allow multiple values (see TIKA-1295) + * + * @param metadata + * @param property + * @param pdfBoxBaseline + * @param schema + */ + private void extractMultilingualItems(Metadata metadata, Property property, + String pdfBoxBaseline, XMPSchema schema) { + //if schema is null, just go with pdfBoxBaseline + if (schema == null) { + if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { + addMetadata(metadata, property, pdfBoxBaseline); + } + return; + } + + for (String lang : schema.getLanguagePropertyLanguages(property.getName())) { + String value = schema.getLanguageProperty(property.getName(), lang); + + if (value != null && value.length() > 0) { + //if you're going to add it below in the baseline addition, don't add it now + if (pdfBoxBaseline != null && value.equals(pdfBoxBaseline)) { + continue; + } + addMetadata(metadata, property, value); + if (!property.isMultiValuePermitted()) { + return; + } + } + } + + if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { + //if we've already added something above and multivalue is not permitted + //return. + if (!property.isMultiValuePermitted()) { + if (metadata.get(property) != null) { + return; + } + } + addMetadata(metadata, property, pdfBoxBaseline); + } + } + + + /** + * This tries to read a list from a particular property in + * XMPSchemaDublinCore. + * If it can't find the information, it falls back to the + * pdfboxBaseline. The pdfboxBaseline should be the value + * that pdfbox returns from its PDDocumentInformation object + * (e.g. getAuthor()). This method is designed to include the pdfboxBaseline, + * and it should not duplicate the pdfboxBaseline.
+ * <p/> + * Until PDFBOX-1803/TIKA-1233 are fixed, do not call this + * on dates! + * <p/> + * This relies on the property having a DublinCore compliant getName() + * + * @param property + * @param pdfBoxBaseline + * @param dc + * @param metadata + */ + private void extractDublinCoreListItems(Metadata metadata, Property property, + String pdfBoxBaseline, XMPSchemaDublinCore dc) { + //if no dc, add baseline and return + if (dc == null) { + if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { + addMetadata(metadata, property, pdfBoxBaseline); + } + return; + } + List<String> items = getXMPBagOrSeqList(dc, property.getName()); + if (items == null) { + if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { + addMetadata(metadata, property, pdfBoxBaseline); + } + return; + } + for (String item : items) { + if (pdfBoxBaseline != null && !item.equals(pdfBoxBaseline)) { + addMetadata(metadata, property, item); + } + } + //finally, add the baseline + if (pdfBoxBaseline != null && pdfBoxBaseline.length() > 0) { + addMetadata(metadata, property, pdfBoxBaseline); + } + } + + /** + * As of this writing, XMPSchema can contain bags or sequence lists + * for some attributes...despite standards documentation. + * JempBox expects one or the other for specific attributes. + * Until more flexibility is added to JempBox, Tika will have to handle both. 
+ * + * @param schema + * @param name + * @return list of values or null + */ + private List<String> getXMPBagOrSeqList(XMPSchema schema, String name) { + List<String> ret = schema.getBagList(name); + if (ret == null) { + ret = schema.getSequenceList(name); + } + return ret; + } + + private void addMetadata(Metadata metadata, Property property, String value) { + if (value != null) { + String decoded = decode(value); + if (property.isMultiValuePermitted() || metadata.get(property) == null) { + metadata.add(property, decoded); + } + //silently skip adding property that already exists if multiple values are not permitted + } + } + + private void addMetadata(Metadata metadata, String name, String value) { + if (value != null) { + metadata.add(name, decode(value)); + } + } + + private String decode(String value) { + if (PDFEncodedStringDecoder.shouldDecode(value)) { + PDFEncodedStringDecoder d = new PDFEncodedStringDecoder(); + return d.decode(value); + } + return value; + } + + private void addMetadata(Metadata metadata, String name, Calendar value) { + if (value != null) { + metadata.set(name, value.getTime().toString()); + } + } + + private void addMetadata(Metadata metadata, Property property, Calendar value) { + if (value != null) { + metadata.set(property, value.getTime()); + } + } + + /** + * Used when processing custom metadata entries, as PDFBox won't do + * the conversion for us in the way it does for the standard ones + */ + private void addMetadata(Metadata metadata, String name, COSBase value) { + if (value instanceof COSArray) { + for (Object v : ((COSArray) value).toList()) { + addMetadata(metadata, name, ((COSBase) v)); + } + } else if (value instanceof COSString) { + addMetadata(metadata, name, ((COSString) value).getString()); + } + // Avoid calling COSDictionary#toString, since it can lead to infinite + // recursion. See TIKA-1038 and PDFBOX-1835. 
+ else if (value != null && !(value instanceof COSDictionary)) { + addMetadata(metadata, name, value.toString()); + } + } + + public PDFParserConfig getPDFParserConfig() { + return defaultConfig; + } + + public void setPDFParserConfig(PDFParserConfig config) { + this.defaultConfig = config; + } + + /** + * @see #setUseNonSequentialParser(boolean) + * @deprecated use {@link #getPDFParserConfig()} + */ + public boolean getUseNonSequentialParser() { + return defaultConfig.getUseNonSequentialParser(); + } + + /** + * If true, the parser will use the NonSequentialParser. This may + * be faster than the full doc parser. + * If false (default), this will use the full doc parser. + * + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + */ + public void setUseNonSequentialParser(boolean v) { + defaultConfig.setUseNonSequentialParser(v); + } + + /** + * @see #setEnableAutoSpace(boolean) + * @deprecated use {@link #getPDFParserConfig()} + */ + public boolean getEnableAutoSpace() { + return defaultConfig.getEnableAutoSpace(); + } + + /** + * If true (the default), the parser should estimate + * where spaces should be inserted between words. For + * many PDFs this is necessary as they do not include + * explicit whitespace characters. + * + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + */ + public void setEnableAutoSpace(boolean v) { + defaultConfig.setEnableAutoSpace(v); + } + + /** + * If true, text in annotations will be extracted. + * + * @deprecated use {@link #getPDFParserConfig()} + */ + public boolean getExtractAnnotationText() { + return defaultConfig.getExtractAnnotationText(); + } + + /** + * If true (the default), text in annotations will be + * extracted. 
+ * + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + */ + public void setExtractAnnotationText(boolean v) { + defaultConfig.setExtractAnnotationText(v); + } + + /** + * @see #setSuppressDuplicateOverlappingText(boolean) + * @deprecated use {@link #getPDFParserConfig()} + */ + public boolean getSuppressDuplicateOverlappingText() { + return defaultConfig.getSuppressDuplicateOverlappingText(); + } + + /** + * If true, the parser should try to remove duplicated + * text over the same region. This is needed for some + * PDFs that achieve bolding by re-writing the same + * text in the same area. Note that this can + * slow down extraction substantially (PDFBOX-956) and + * sometimes remove characters that were not in fact + * duplicated (PDFBOX-1155). By default this is disabled. + * + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + */ + public void setSuppressDuplicateOverlappingText(boolean v) { + defaultConfig.setSuppressDuplicateOverlappingText(v); + } + + /** + * @see #setSortByPosition(boolean) + * @deprecated use {@link #getPDFParserConfig()} + */ + public boolean getSortByPosition() { + return defaultConfig.getSortByPosition(); + } + + /** + * If true, sort text tokens by their x/y position + * before extracting text. This may be necessary for + * some PDFs (if the text tokens are not rendered "in + * order"), while for other PDFs it can produce the + * wrong result (for example if there are 2 columns, + * the text will be interleaved). Default is false. 
+ * + * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} + */ + public void setSortByPosition(boolean v) { + defaultConfig.setSortByPosition(v); + } + +} Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,469 @@ +package org.apache.tika.parser.pdf; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Locale; +import java.util.Properties; + +import org.apache.pdfbox.util.PDFTextStripper; + +/** + * Config for PDFParser. 
+ * <p/> + * This allows parameters to be set programmatically: + * <ol> + * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li> + * <li>Constructor of PDFParser</li> + * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li> + * </ol> + * <p/> + * Parameters can also be set by modifying the PDFParserConfig.properties file, + * which lives in the expected places, in trunk: + * tika-parsers/src/main/resources/org/apache/tika/parser/pdf + * <p/> + * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar: + * org/apache/tika/parser/pdf + */ +public class PDFParserConfig implements Serializable { + + private static final long serialVersionUID = 6492570218190936986L; + + // True if we let PDFBox "guess" where spaces should go: + private boolean enableAutoSpace = true; + + // True if we let PDFBox remove duplicate overlapping text: + private boolean suppressDuplicateOverlappingText; + + // True if we extract annotation text ourselves + // (workaround for PDFBOX-1143): + private boolean extractAnnotationText = true; + + // True if we should sort text tokens by position + // (necessary for some PDFs, but messes up other PDFs): + private boolean sortByPosition = false; + + //True if we should use PDFBox's NonSequentialParser + private boolean useNonSequentialParser = false; + + //True if acroform content should be extracted + private boolean extractAcroFormContent = true; + + //True if inline PDXImage objects should be extracted + private boolean extractInlineImages = false; + + //True if inline images (as identified by their object id within + //a pdf file) should only be extracted once. 
+ private boolean extractUniqueInlineImagesOnly = true; + + //The character width-based tolerance value used to estimate where spaces in text should be added + private Float averageCharTolerance; + + //The space width-based tolerance value used to estimate where spaces in text should be added + private Float spacingTolerance; + + private AccessChecker accessChecker; + + public PDFParserConfig() { + init(this.getClass().getResourceAsStream("PDFParser.properties")); + } + + /** + * Loads properties from InputStream and then tries to close InputStream. + * If there is an IOException, this silently swallows the exception + * and goes back to the default. + * + * @param is + */ + public PDFParserConfig(InputStream is) { + init(is); + } + + //initializes object and then tries to close inputstream + private void init(InputStream is) { + + if (is == null) { + return; + } + Properties props = new Properties(); + try { + props.load(is); + } catch (IOException e) { + } finally { + if (is != null) { + try { + is.close(); + } catch (IOException e) { + //swallow + } + } + } + setEnableAutoSpace( + getProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); + setSuppressDuplicateOverlappingText( + getProp(props.getProperty("suppressDuplicateOverlappingText"), + getSuppressDuplicateOverlappingText())); + setExtractAnnotationText( + getProp(props.getProperty("extractAnnotationText"), + getExtractAnnotationText())); + setSortByPosition( + getProp(props.getProperty("sortByPosition"), + getSortByPosition())); + setUseNonSequentialParser( + getProp(props.getProperty("useNonSequentialParser"), + getUseNonSequentialParser())); + setExtractAcroFormContent( + getProp(props.getProperty("extractAcroFormContent"), + getExtractAcroFormContent())); + setExtractInlineImages( + getProp(props.getProperty("extractInlineImages"), + getExtractInlineImages())); + setExtractUniqueInlineImagesOnly( + getProp(props.getProperty("extractUniqueInlineImagesOnly"), + 
getExtractUniqueInlineImagesOnly())); + + boolean checkExtractAccessPermission = getProp(props.getProperty("checkExtractAccessPermission"), false); + boolean allowExtractionForAccessibility = getProp(props.getProperty("allowExtractionForAccessibility"), true); + + if (checkExtractAccessPermission == false) { + //silently ignore the crazy configuration of checkExtractAccessPermission = false, + //but allowExtractionForAccessibility=false + accessChecker = new AccessChecker(); + } else { + accessChecker = new AccessChecker(allowExtractionForAccessibility); + } + } + + /** + * Configures the given pdf2XHTML. + * + * @param pdf2XHTML + */ + public void configure(PDF2XHTML pdf2XHTML) { + pdf2XHTML.setForceParsing(true); + pdf2XHTML.setSortByPosition(getSortByPosition()); + if (getEnableAutoSpace()) { + pdf2XHTML.setWordSeparator(" "); + } else { + pdf2XHTML.setWordSeparator(""); + } + if (getAverageCharTolerance() != null) { + pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance()); + } + if (getSpacingTolerance() != null) { + pdf2XHTML.setSpacingTolerance(getSpacingTolerance()); + } + pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText()); + } + + /** + * @see #setExtractAcroFormContent(boolean) + */ + public boolean getExtractAcroFormContent() { + return extractAcroFormContent; + } + + /** + * If true (the default), extract content from AcroForms + * at the end of the document. + * + * @param extractAcroFormContent + */ + public void setExtractAcroFormContent(boolean extractAcroFormContent) { + this.extractAcroFormContent = extractAcroFormContent; + + } + + /** + * @see #setExtractInlineImages(boolean) + */ + public boolean getExtractInlineImages() { + return extractInlineImages; + } + + /** + * If true, extract inline embedded PDXImage objects. + * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain + * thousands of embedded images totaling > 2.5 GB.
Also, at least as of PDFBox 1.8.5, + * there can be surprisingly large memory consumption and/or out of memory errors. + * Set to <code>true</code> with caution. + * <p/> + * The default is <code>false</code>. + * <p/> + * See also: {@link #setExtractUniqueInlineImagesOnly(boolean)}. + * + * @param extractInlineImages + */ + public void setExtractInlineImages(boolean extractInlineImages) { + this.extractInlineImages = extractInlineImages; + } + + /** + * @see #setExtractUniqueInlineImagesOnly(boolean) + */ + public boolean getExtractUniqueInlineImagesOnly() { + return extractUniqueInlineImagesOnly; + } + + /** + * Multiple pages within a PDF file might refer to the same underlying image. + * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the + * parser will call the EmbeddedExtractor each time the image appears on a page. + * This might be desired for some use cases. However, to avoid duplication of + * extracted images, set this to <code>true</code>. The default is <code>true</code>. + * <p/> + * Note that uniqueness is determined only by the underlying PDF COSObject id, not by + * file hash or similar equality metric. + * If the PDF actually contains multiple copies of the same image + * -- all with different object ids -- then all images will be extracted. + * <p/> + * For this parameter to have any effect, {@link #extractInlineImages} must be + * set to <code>true</code>. + * <p/> + * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting + * of this parameter, the extractor will only pull out one copy of each image per + * page. This parameter tries to capture uniqueness across the entire document.
+ * + * @param extractUniqueInlineImagesOnly + */ + public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { + this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly; + + } + + /** + * @see #setEnableAutoSpace(boolean) + */ + public boolean getEnableAutoSpace() { + return enableAutoSpace; + } + + /** + * If true (the default), the parser should estimate + * where spaces should be inserted between words. For + * many PDFs this is necessary as they do not include + * explicit whitespace characters. + */ + public void setEnableAutoSpace(boolean enableAutoSpace) { + this.enableAutoSpace = enableAutoSpace; + } + + /** + * @see #setSuppressDuplicateOverlappingText(boolean) + */ + public boolean getSuppressDuplicateOverlappingText() { + return suppressDuplicateOverlappingText; + } + + /** + * If true, the parser should try to remove duplicated + * text over the same region. This is needed for some + * PDFs that achieve bolding by re-writing the same + * text in the same area. Note that this can + * slow down extraction substantially (PDFBOX-956) and + * sometimes remove characters that were not in fact + * duplicated (PDFBOX-1155). By default this is disabled. + */ + public void setSuppressDuplicateOverlappingText( + boolean suppressDuplicateOverlappingText) { + this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText; + } + + /** + * @see #setExtractAnnotationText(boolean) + */ + public boolean getExtractAnnotationText() { + return extractAnnotationText; + } + + /** + * If true (the default), text in annotations will be + * extracted. + */ + public void setExtractAnnotationText(boolean extractAnnotationText) { + this.extractAnnotationText = extractAnnotationText; + } + + /** + * @see #setSortByPosition(boolean) + */ + public boolean getSortByPosition() { + return sortByPosition; + } + + /** + * If true, sort text tokens by their x/y position + * before extracting text. 
This may be necessary for + * some PDFs (if the text tokens are not rendered "in + * order"), while for other PDFs it can produce the + * wrong result (for example if there are 2 columns, + * the text will be interleaved). Default is false. + */ + public void setSortByPosition(boolean sortByPosition) { + this.sortByPosition = sortByPosition; + } + + /** + * @see #setUseNonSequentialParser(boolean) + */ + public boolean getUseNonSequentialParser() { + return useNonSequentialParser; + } + + /** + * If true, uses PDFBox's non-sequential parser. + * The non-sequential parser should be much faster than the traditional + * full doc parser. However, until PDFBOX-XXX is fixed, + * the non-sequential parser fails + * to extract some document metadata. + * <p/> + * Default is false (use the traditional parser) + * + * @param useNonSequentialParser + */ + public void setUseNonSequentialParser(boolean useNonSequentialParser) { + this.useNonSequentialParser = useNonSequentialParser; + } + + /** + * @see #setAverageCharTolerance(Float) + */ + public Float getAverageCharTolerance() { + return averageCharTolerance; + } + + /** + * See {@link PDFTextStripper#setAverageCharTolerance(float)} + */ + public void setAverageCharTolerance(Float averageCharTolerance) { + this.averageCharTolerance = averageCharTolerance; + } + + /** + * @see #setSpacingTolerance(Float) + */ + public Float getSpacingTolerance() { + return spacingTolerance; + } + + /** + * See {@link PDFTextStripper#setSpacingTolerance(float)} + */ + public void setSpacingTolerance(Float spacingTolerance) { + this.spacingTolerance = spacingTolerance; + } + + public AccessChecker getAccessChecker() { + return accessChecker; + } + + public void setAccessChecker(AccessChecker accessChecker) { + this.accessChecker = accessChecker; + } + + private boolean getProp(String p, boolean defaultMissing) { + if (p == null) { + return defaultMissing; + } + if (p.toLowerCase(Locale.ROOT).equals("true")) { + return true; + } else if 
(p.toLowerCase(Locale.ROOT).equals("false")) { + return false; + } else { + return defaultMissing; + } + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime + * result + + ((averageCharTolerance == null) ? 0 : averageCharTolerance + .hashCode()); + result = prime * result + (enableAutoSpace ? 1231 : 1237); + result = prime * result + (extractAcroFormContent ? 1231 : 1237); + result = prime * result + (extractAnnotationText ? 1231 : 1237); + result = prime * result + (extractInlineImages ? 1231 : 1237); + result = prime * result + (extractUniqueInlineImagesOnly ? 1231 : 1237); + result = prime * result + (sortByPosition ? 1231 : 1237); + result = prime + * result + + ((spacingTolerance == null) ? 0 : spacingTolerance.hashCode()); + result = prime * result + + (suppressDuplicateOverlappingText ? 1231 : 1237); + result = prime * result + (useNonSequentialParser ? 1231 : 1237); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + PDFParserConfig other = (PDFParserConfig) obj; + if (averageCharTolerance == null) { + if (other.averageCharTolerance != null) + return false; + } else if (!averageCharTolerance.equals(other.averageCharTolerance)) + return false; + if (enableAutoSpace != other.enableAutoSpace) + return false; + if (extractAcroFormContent != other.extractAcroFormContent) + return false; + if (extractAnnotationText != other.extractAnnotationText) + return false; + if (extractInlineImages != other.extractInlineImages) + return false; + if (extractUniqueInlineImagesOnly != other.extractUniqueInlineImagesOnly) + return false; + if (sortByPosition != other.sortByPosition) + return false; + if (spacingTolerance == null) { + if (other.spacingTolerance != null) + return false; + } else if (!spacingTolerance.equals(other.spacingTolerance)) + return false; + if 
(suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText) + return false; + if (useNonSequentialParser != other.useNonSequentialParser) + return false; + return true; + } + + @Override + public String toString() { + return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace + + ", suppressDuplicateOverlappingText=" + + suppressDuplicateOverlappingText + ", extractAnnotationText=" + + extractAnnotationText + ", sortByPosition=" + sortByPosition + + ", useNonSequentialParser=" + useNonSequentialParser + + ", extractAcroFormContent=" + extractAcroFormContent + + ", extractInlineImages=" + extractInlineImages + + ", extractUniqueInlineImagesOnly=" + + extractUniqueInlineImagesOnly + ", averageCharTolerance=" + + averageCharTolerance + ", spacingTolerance=" + + spacingTolerance + "]"; + } +} Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Jan 6 03:50:50 2016 @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +org.apache.tika.parser.pdf.PDFParser Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties Wed Jan 6 03:50:50 2016 @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +enableAutospace true +extractAnnotationText true +sortByPosition false +suppressDuplicateOverlappingText false +useNonSequentialParser false +extractAcroFormContent true +extractInlineImages false +extractUniqueInlineImagesOnly true +checkExtractAccessPermission false +allowExtractionForAccessibility true Added: tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java?rev=1723223&view=auto ============================================================================== --- tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java (added) +++ tika/branches/2.x/tika-parser-modules/tika-pdf-module/src/test/java/org/apache/tika/parser/pdf/AccessCheckerTest.java Wed Jan 6 03:50:50 2016 @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
package org.apache.tika.parser.pdf;


import static org.junit.Assert.assertTrue;

import org.apache.tika.exception.AccessPermissionException;
import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PropertyTypeException;
import org.junit.Test;

/**
 * Exercises {@link AccessChecker#check(Metadata)} against metadata carrying
 * the {@link AccessPermissions#EXTRACT_CONTENT} and
 * {@link AccessPermissions#EXTRACT_FOR_ACCESSIBILITY} flags.
 */
public class AccessCheckerTest {

    /**
     * A checker built with the no-arg constructor must accept every
     * combination of the two permission flags tried here; any
     * {@link AccessPermissionException} fails the test.
     */
    @Test
    public void testLegacy() throws AccessPermissionException {
        //legacy behavior; don't bother checking
        AccessChecker checker = new AccessChecker();

        // Each "no exception" marker follows the call it refers to, so it is
        // only reached when check() returned normally.
        checker.check(getMetadata(false, false));
        assertTrue("no exception", true);

        checker.check(getMetadata(false, true));
        assertTrue("no exception", true);

        checker.check(getMetadata(true, true));
        assertTrue("no exception", true);
    }

    /**
     * A checker built with <code>false</code> must reject documents that
     * forbid content extraction, whether or not the document itself permits
     * extraction for accessibility.
     */
    @Test
    public void testNoExtraction() {
        //allow nothing
        AccessChecker checker = new AccessChecker(false);

        assertTrue("correct exception with no extraction, no extract for accessibility",
                isDenied(checker, getMetadata(false, false)));

        //document allows extraction for accessibility,
        //but application is not an accessibility application
        assertTrue("correct exception with no extraction, extract for accessibility"
                        + " allowed by the document but not by the checker",
                isDenied(checker, getMetadata(false, true)));
    }

    /**
     * A checker built with <code>true</code> must accept a document that
     * permits extraction for accessibility only, and must still reject a
     * document that permits neither kind of extraction.
     */
    @Test
    public void testExtractOnlyForAccessibility() throws AccessPermissionException {
        //allow accessibility
        AccessChecker checker = new AccessChecker(true);

        checker.check(getMetadata(false, true));
        assertTrue("no exception", true);

        assertTrue("correct exception", isDenied(checker, getMetadata(false, false)));
    }

    /**
     * When the document permits content extraction outright, both kinds of
     * checker must accept it regardless of the accessibility flag.
     */
    @Test
    public void testCrazyExtractNotForAccessibility() throws AccessPermissionException {
        Metadata m = getMetadata(true, false);

        //allow accessibility
        AccessChecker checker = new AccessChecker(true);
        checker.check(m);
        assertTrue("no exception", true);

        //don't extract for accessibility
        checker = new AccessChecker(false);
        //if extract content is allowed, the checker shouldn't
        //check the value of extract for accessibility
        checker.check(m);
        assertTrue("no exception", true);
    }

    /**
     * Adding a second value for either permission key must be refused with a
     * {@link PropertyTypeException}.
     */
    @Test
    public void testCantAddMultiplesToMetadata() {
        Metadata m = new Metadata();
        boolean ex = false;
        m.add(AccessPermissions.EXTRACT_CONTENT, "true");
        try {
            m.add(AccessPermissions.EXTRACT_CONTENT, "false");
        } catch (PropertyTypeException e) {
            ex = true;
        }
        assertTrue("can't add multiple values", ex);

        m = new Metadata();
        ex = false;
        m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "true");
        try {
            m.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, "false");
        } catch (PropertyTypeException e) {
            ex = true;
        }
        assertTrue("can't add multiple values", ex);
    }

    /**
     * Runs the checker against the metadata and reports whether it refused.
     *
     * @return true if {@link AccessChecker#check(Metadata)} threw an
     * {@link AccessPermissionException}, false if it returned normally
     */
    private static boolean isDenied(AccessChecker checker, Metadata m) {
        try {
            checker.check(m);
        } catch (AccessPermissionException e) {
            return true;
        }
        return false;
    }

    /**
     * Builds metadata whose two permission entries are set to the string
     * forms of the given booleans.
     */
    private Metadata getMetadata(boolean allowExtraction, boolean allowExtractionForAccessibility) {
        Metadata m = new Metadata();
        m.set(AccessPermissions.EXTRACT_CONTENT, Boolean.toString(allowExtraction));
        m.set(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY, Boolean.toString(allowExtractionForAccessibility));
        return m;
    }
}
