Repository: tika Updated Branches: refs/heads/master 1924c3f3f -> 98eb56ec7
TIKA-1285 -- upgrade to PDFBox 2.0.0 Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/98eb56ec Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/98eb56ec Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/98eb56ec Branch: refs/heads/master Commit: 98eb56ec78f2e1d27de644f4f6647ea1cfbc930b Parents: 1924c3f Author: tballison <[email protected]> Authored: Mon Mar 21 22:30:37 2016 -0400 Committer: tballison <[email protected]> Committed: Mon Mar 21 22:30:37 2016 -0400 ---------------------------------------------------------------------- CHANGES.txt | 6 + tika-bundle/pom.xml | 2 +- tika-parsers/pom.xml | 23 ++- .../tika/parser/font/AdobeFontMetricParser.java | 16 +- .../apache/tika/parser/font/TrueTypeParser.java | 4 +- .../org/apache/tika/parser/pdf/PDF2XHTML.java | 178 ++++++++++--------- .../parser/pdf/PDFEncodedStringDecoder.java | 14 +- .../org/apache/tika/parser/pdf/PDFParser.java | 93 ++-------- .../apache/tika/parser/pdf/PDFParserConfig.java | 35 +--- .../apache/tika/parser/pdf/PDFParser.properties | 1 - .../apache/tika/parser/pdf/PDFParserTest.java | 113 ------------ 11 files changed, 162 insertions(+), 323 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index ed825cb..0ca8494 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,11 @@ Release 1.13 - ??? + * Upgrade to PDFBox 2.0.0 (TIKA-1285). MAJOR CHANGES in PDFParser: + * The classic sequential parser is no longer available. + * Tiff files are no longer extracted by default. See + https://pdfbox.apache.org/2.0/dependencies.html#optional-components + for optional components to process Tiff files. + * Add XMPMM support to PDFParser and JpegParser via Jempbox (TIKA-1894). * Move serialization of TikaConfig to tika-core and enable dumping http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-bundle/pom.xml ---------------------------------------------------------------------- diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml index 20cb07c..5b8f214 100644 --- a/tika-bundle/pom.xml +++ b/tika-bundle/pom.xml @@ -127,7 +127,7 @@ tika-parsers;inline=true, commons-compress, xz, commons-codec, commons-csv, commons-io, commons-exec, junrar, - pdfbox,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on, + pdfbox,pdfbox-tools,pdfbox-debugger,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on, poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas, curvesapi, xmlbeans, http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/pom.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index 8c67c68..bd0a734 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -42,7 +42,8 @@ <tukaani.version>1.5</tukaani.version> <mime4j.version>0.7.2</mime4j.version> <vorbis.version>0.8</vorbis.version> - <pdfbox.version>1.8.11</pdfbox.version> + <pdfbox.version>2.0.0</pdfbox.version> + <jempbox.version>1.8.11</jempbox.version> <netcdf-java.version>4.5.5</netcdf-java.version> <cxf.version>3.0.3</cxf.version> <sis.version>0.6</sis.version> @@ -133,6 +134,16 @@ <artifactId>pdfbox</artifactId> <version>${pdfbox.version}</version> </dependency> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>pdfbox-tools</artifactId> + <version>${pdfbox.version}</version> + </dependency> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>jempbox</artifactId> + <version>${jempbox.version}</version> + </dependency> <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies as optional, but we prefer to have them always to avoid problems with encrypted PDFs. --> @@ -298,7 +309,15 @@ <artifactId>slf4j-log4j12</artifactId> <scope>test</scope> </dependency> - + <!-- Copied from PDFBox: + For legal reasons (incompatible license), jai-imageio-core is to be used + only in the tests and may not be distributed. See also LEGAL-195 --> + <dependency> + <groupId>com.github.jai-imageio</groupId> + <artifactId>jai-imageio-core</artifactId> + <version>1.3.1</version> + <scope>test</scope> + </dependency> <!-- edu.ucar dependencies --> <dependency> <groupId>edu.ucar</groupId> http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java index e4bdca7..000ff10 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/font/AdobeFontMetricParser.java @@ -18,12 +18,13 @@ package org.apache.tika.parser.font; import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import org.apache.fontbox.afm.AFMParser; -import org.apache.fontbox.afm.FontMetric; +import org.apache.fontbox.afm.FontMetrics; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -67,16 +68,19 @@ public class AdobeFontMetricParser extends AbstractParser { public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - FontMetric fontMetrics; + FontMetrics fontMetrics; AFMParser parser = new AFMParser( stream ); // Have FontBox process the file - parser.parse(); - fontMetrics = parser.getResult(); + fontMetrics = parser.parse(); // Get the comments in the file to display in xhtml - List<String> comments = fontMetrics.getComments(); - + List<String> unModifiableComments = fontMetrics.getComments(); + //have to copy because we modify list in extractCreationDate + List<String> comments = new ArrayList<>(); + for (String comment : unModifiableComments) { + comments.add(comment); + } // Get the creation date extractCreationDate( metadata, comments ); http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java index 26c1368..c207e0b 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/font/TrueTypeParser.java @@ -64,9 +64,9 @@ public class TrueTypeParser extends AbstractParser { TrueTypeFont font; TTFParser parser = new TTFParser(); if (tis != null && tis.hasFile()) { - font = parser.parseTTF(tis.getFile()); + font = parser.parse(tis.getFile()); } else { - font = parser.parseTTF(stream); + font = parser.parse(stream); } // Report the details of the font http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index d656d5a..2790d47 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -17,10 +17,13 @@ package org.apache.tika.parser.pdf; import javax.xml.stream.XMLStreamException; +import java.awt.image.BufferedImage; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.io.Writer; import java.text.SimpleDateFormat; import java.util.Calendar; @@ -36,24 +39,23 @@ import java.util.TreeMap; import org.apache.commons.io.IOExceptionWithCause; import org.apache.commons.io.IOUtils; import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary; import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; -import org.apache.pdfbox.pdmodel.common.COSObjectable; import org.apache.pdfbox.pdmodel.common.PDNameTreeNode; import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification; import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile; -import org.apache.pdfbox.pdmodel.graphics.xobject.PDCcitt; -import org.apache.pdfbox.pdmodel.graphics.xobject.PDJpeg; -import org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap; -import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject; -import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm; -import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage; -import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction; -import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI; +import org.apache.pdfbox.pdmodel.graphics.PDXObject; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; +import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; +import org.apache.pdfbox.pdmodel.interactive.action.PDAction; +import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; @@ -64,10 +66,12 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; import org.apache.pdfbox.pdmodel.interactive.form.PDField; +import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField; import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; -import org.apache.pdfbox.pdmodel.interactive.form.PDXFA; -import org.apache.pdfbox.util.PDFTextStripper; -import org.apache.pdfbox.util.TextPosition; +import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.TextPosition; +import org.apache.pdfbox.tools.imageio.ImageIOUtil; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; @@ -311,22 +315,16 @@ class PDF2XHTML extends PDFTextStripper { } catch (SAXException e) { throw new IOExceptionWithCause("Unable to end a page", e); } - page.clear(); } - private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException { + private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException { if (resources == null || config.getExtractInlineImages() == false) { return; } - Map<String, PDXObject> xObjects = resources.getXObjects(); - if (xObjects == null) { - return; - } + for (COSName name : resources.getXObjectNames()) { - for (Map.Entry<String, PDXObject> entry : xObjects.entrySet()) { - - PDXObject object = entry.getValue(); + PDXObject object = resources.getXObject(name); if (object == null) { continue; } @@ -337,30 +335,32 @@ class PDF2XHTML extends PDFTextStripper { } seenThisPage.add(cosObject); - if (object instanceof PDXObjectForm) { - extractImages(((PDXObjectForm) object).getResources(), seenThisPage); - } else if (object instanceof PDXObjectImage) { + if (object instanceof PDFormXObject) { + extractImages(((PDFormXObject) object).getResources(), seenThisPage); + } else if (object instanceof PDImageXObject) { - PDXObjectImage image = (PDXObjectImage) object; + PDImageXObject image = (PDImageXObject) object; Metadata metadata = new Metadata(); - String extension = ""; - if (image instanceof PDJpeg) { + String extension = image.getSuffix(); + if (extension == null) { + metadata.set(Metadata.CONTENT_TYPE, "image/png"); + extension = "png"; + } else if (extension.equals("jpg")) { metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); - extension = ".jpg"; - } else if (image instanceof PDCcitt) { + } else if (extension.equals("tiff")) { metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); - extension = ".tif"; - } else if (image instanceof PDPixelMap) { - metadata.set(Metadata.CONTENT_TYPE, "image/png"); - extension = ".png"; + extension = "tif"; + } else { + //TODO: determine if we need to add more image types + //throw new RuntimeException("EXTEN:" + extension); } - Integer imageNumber = processedInlineImages.get(entry.getKey()); + Integer imageNumber = processedInlineImages.get(name.getName()); if (imageNumber == null) { imageNumber = inlineImageCounter++; } - String fileName = "image" + imageNumber + extension; + String fileName = "image" + imageNumber + "."+extension; metadata.set(Metadata.RESOURCE_NAME_KEY, fileName); // Output the img tag @@ -373,7 +373,7 @@ class PDF2XHTML extends PDFTextStripper { //Do we only want to process unique COSObject ids? //If so, have we already processed this one? if (config.getExtractUniqueInlineImagesOnly() == true) { - String cosObjectId = entry.getKey(); + String cosObjectId = name.getName(); if (processedInlineImages.containsKey(cosObjectId)) { continue; } @@ -388,8 +388,8 @@ class PDF2XHTML extends PDFTextStripper { if (extractor.shouldParseEmbedded(metadata)) { ByteArrayOutputStream buffer = new ByteArrayOutputStream(); try { - image.write2OutputStream(buffer); - image.clear(); + //TODO: handle image.getMetadata()? + writeToBuffer(image, extension, buffer); extractor.parseEmbedded( new ByteArrayInputStream(buffer.toByteArray()), new EmbeddedContentHandler(handler), @@ -400,7 +400,35 @@ class PDF2XHTML extends PDFTextStripper { } } } - resources.clear(); + } + + //nearly directly copied from PDFBox ExtractImages + private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out) + throws IOException { + + BufferedImage image = pdImage.getImage(); + if (image != null) { + if ("jpg".equals(suffix)) { + String colorSpaceName = pdImage.getColorSpace().getName(); + //TODO: figure out if we want directJPEG as a configuration + //previously: if (directJPeg || PDDeviceGray.... + if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || + PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) { + // RGB or Gray colorspace: get and write the unmodifiedJPEG stream + //TODO: shouldn't need to do this: should be able to call createInputStream directly?! + //version clash somewhere?! + InputStream data = pdImage.getStream().createInputStream(); + org.apache.pdfbox.io.IOUtils.copy(data, out); + org.apache.pdfbox.io.IOUtils.closeQuietly(data); + } else { + // for CMYK and other "unusual" colorspaces, the JPEG will be converted + ImageIOUtil.writeImage(image, suffix, out); + } + } else { + ImageIOUtil.writeImage(image, suffix, out); + } + } + out.flush(); } protected EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() { @@ -445,10 +473,10 @@ class PDF2XHTML extends PDFTextStripper { @Override protected void writeCharacters(TextPosition text) throws IOException { try { - handler.characters(text.getCharacter()); + handler.characters(text.getUnicode()); } catch (SAXException e) { throw new IOExceptionWithCause( - "Unable to write a character: " + text.getCharacter(), e); + "Unable to write a character: " + text.getUnicode(), e); } } @@ -474,18 +502,14 @@ class PDF2XHTML extends PDFTextStripper { private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler) throws IOException, SAXException, TikaException { - PDDocumentCatalog catalog = document.getDocumentCatalog(); - PDDocumentNameDictionary names = catalog.getNames(); - if (names == null) { + PDDocumentNameDictionary namesDictionary = + new PDDocumentNameDictionary( document.getDocumentCatalog() ); + PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); + if (efTree == null) { return; } - PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles(); - if (embeddedFiles == null) { - return; - } - - Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames(); + Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames(); //For now, try to get the embeddedFileNames out of embeddedFiles or its kids. //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java //If there is a need we could add a fully recursive search to find a non-null @@ -493,35 +517,35 @@ class PDF2XHTML extends PDFTextStripper { if (embeddedFileNames != null) { processEmbeddedDocNames(embeddedFileNames); } else { - List<PDNameTreeNode> kids = embeddedFiles.getKids(); + List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids(); if (kids == null) { return; } - for (PDNameTreeNode n : kids) { - Map<String, COSObjectable> childNames = n.getNames(); - if (childNames != null) { - processEmbeddedDocNames(childNames); + for (PDNameTreeNode<PDComplexFileSpecification> node : kids) { + embeddedFileNames = node.getNames(); + if (embeddedFileNames != null) { + processEmbeddedDocNames(embeddedFileNames); } } } } - - private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames) + private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames) throws IOException, SAXException, TikaException { if (embeddedFileNames == null || embeddedFileNames.isEmpty()) { return; } EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor(); - for (Map.Entry<String, COSObjectable> ent : embeddedFileNames.entrySet()) { - PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue(); + for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) { + PDComplexFileSpecification spec = ent.getValue(); extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor); } } private void extractMultiOSPDEmbeddedFiles(String defaultName, - PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) throws IOException, + PDComplexFileSpecification spec, + EmbeddedDocumentExtractor extractor) throws IOException, SAXException, TikaException { if (spec == null) { @@ -589,7 +613,8 @@ class PDF2XHTML extends PDFTextStripper { //if it has xfa, try that. //if it doesn't exist or there's an exception, //go with traditional AcroForm - PDXFA pdxfa = form.getXFA(); + PDXFAResource pdxfa = form.getXFA(); + if (pdxfa != null) { XFAExtractor xfaExtractor = new XFAExtractor(); try { @@ -626,27 +651,19 @@ class PDF2XHTML extends PDFTextStripper { handler.endElement("div"); } - private void processAcroField(PDField field, XHTMLContentHandler handler, final int currentRecursiveDepth) + private void processAcroField(PDField field, + XHTMLContentHandler handler, final int currentRecursiveDepth) throws SAXException, IOException { if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) { return; } - addFieldString(field, handler); - - List<COSObjectable> kids = field.getKids(); - if (kids != null) { - + if (field instanceof PDNonTerminalField) { int r = currentRecursiveDepth + 1; handler.startElement("ol"); - //TODO: can generate <ol/>. Rework to avoid that. - for (COSObjectable pdfObj : kids) { - if (pdfObj != null && pdfObj instanceof PDField) { - PDField kid = (PDField) pdfObj; - //recurse - processAcroField(kid, handler, r); - } + for (PDField child : ((PDNonTerminalField)field).getChildren()) { + processAcroField(child, handler, r); } handler.endElement("ol"); } @@ -672,14 +689,9 @@ class PDF2XHTML extends PDFTextStripper { handleSignature(attrs, (PDSignatureField) field, handler); return; } - try { - //getValue can throw an IOException if there is no value - String value = field.getValue(); - if (value != null && !value.equals("null")) { - sb.append(value); - } - } catch (IOException e) { - //swallow + String value = field.getValueAsString(); + if (value != null && !value.equals("null")) { + sb.append(value); } if (attrs.getLength() > 0 || sb.length() > 0) { @@ -697,7 +709,7 @@ class PDF2XHTML extends PDFTextStripper { if (sig == null) { return; } - Map<String, String> vals = new TreeMap<String, String>(); + Map<String, String> vals = new TreeMap<>(); vals.put("name", sig.getName()); vals.put("contactInfo", sig.getContactInfo()); vals.put("location", sig.getLocation()); http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java index 0d7e3ba..057f833 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFEncodedStringDecoder.java @@ -17,14 +17,16 @@ package org.apache.tika.parser.pdf; +import static java.nio.charset.StandardCharsets.ISO_8859_1; + import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import org.apache.pdfbox.cos.COSString; -import org.apache.pdfbox.pdfparser.BaseParser; - -import static java.nio.charset.StandardCharsets.ISO_8859_1; +import org.apache.pdfbox.io.RandomAccessBuffer; +import org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.pdfparser.COSParser; /** * In fairly rare cases, a PDF's XMP will contain a string that @@ -81,7 +83,7 @@ class PDFEncodedStringDecoder { try { byte[] bytes = new String("(" + value + ")").getBytes(ISO_8859_1); InputStream is = new ByteArrayInputStream(bytes); - COSStringParser p = new COSStringParser(is); + COSStringParser p = new COSStringParser(new RandomAccessBuffer(is)); String parsed = p.myParseCOSString(); if (parsed != null) { return parsed; @@ -93,9 +95,9 @@ class PDFEncodedStringDecoder { return value; } - class COSStringParser extends BaseParser { + class COSStringParser extends COSParser { - COSStringParser(InputStream buffer) throws IOException { + COSStringParser(RandomAccessRead buffer) throws IOException { super(buffer); } http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 8cb1b98..17785c9 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -28,6 +28,7 @@ import java.util.Locale; import java.util.Set; import org.apache.commons.io.input.CloseShieldInputStream; +import org.apache.jempbox.xmp.XMPMetadata; import org.apache.jempbox.xmp.XMPSchema; import org.apache.jempbox.xmp.XMPSchemaDublinCore; import org.apache.jempbox.xmp.XMPSchemaMediaManagement; @@ -37,14 +38,10 @@ import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSString; -import org.apache.pdfbox.exceptions.CryptographyException; -import org.apache.pdfbox.io.RandomAccess; -import org.apache.pdfbox.io.RandomAccessBuffer; -import org.apache.pdfbox.io.RandomAccessFile; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; -import org.apache.pdfbox.pdmodel.font.PDFont; +import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; @@ -119,31 +116,17 @@ public class PDFParser extends AbstractParser { // PDFBox can process entirely in memory, or can use a temp file // for unpacked / processed resources // Decide which to do based on if we're reading from a file or not already + //TODO: make this configurable via MemoryUsageSetting TikaInputStream tstream = TikaInputStream.cast(stream); password = getPassword(metadata, context); if (tstream != null && tstream.hasFile()) { - // File based, take that as a cue to use a temporary file - RandomAccess scratchFile = new RandomAccessFile(tmp.createTemporaryFile(), "rw"); - if (localConfig.getUseNonSequentialParser() == true) { - pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), scratchFile, password); - } else { - pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), scratchFile, true); - } + // File based -- send file directly to PDFBox + pdfDocument = PDDocument.load(tstream.getPath().toFile(), password); } else { - // Go for the normal, stream based in-memory parsing - if (localConfig.getUseNonSequentialParser() == true) { - pdfDocument = PDDocument.loadNonSeq(new CloseShieldInputStream(stream), new RandomAccessBuffer(), password); - } else { - pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), true); - } + pdfDocument = PDDocument.load(new CloseShieldInputStream(stream), password); } metadata.set("pdf:encrypted", Boolean.toString(pdfDocument.isEncrypted())); - //if using the classic parser and the doc is encrypted, we must manually decrypt - if (!localConfig.getUseNonSequentialParser() && pdfDocument.isEncrypted()) { - pdfDocument.decrypt(password); - } - metadata.set(Metadata.CONTENT_TYPE, "application/pdf"); extractMetadata(pdfDocument, metadata); @@ -156,27 +139,13 @@ public class PDFParser extends AbstractParser { PDF2XHTML.process(pdfDocument, handler, context, metadata, localConfig); } } - - } catch (CryptographyException e) { - //seq parser throws CryptographyException for bad password + } catch (InvalidPasswordException e) { + metadata.set("pdf:encrypted", "true"); throw new EncryptedDocumentException(e); - } catch (IOException e) { - //nonseq parser throws IOException for bad password - //At the Tika level, we want the same exception to be thrown - if (e.getMessage() != null && - e.getMessage().contains("Error (CryptographyException)")) { - metadata.set("pdf:encrypted", Boolean.toString(true)); - throw new EncryptedDocumentException(e); - } - //rethrow any other IOExceptions - throw e; } finally { if (pdfDocument != null) { pdfDocument.close(); } - tmp.dispose(); - //TODO: once we migrate to PDFBox 2.0, remove this (PDFBOX-2200) - PDFont.clearResources(); } } @@ -231,7 +200,8 @@ public class PDFParser extends AbstractParser { XMPSchemaMediaManagement mmSchema = null; try { if (document.getDocumentCatalog().getMetadata() != null) { - xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); + InputStream xmpIs = document.getDocumentCatalog().getMetadata().exportXMPMetadata(); + xmp = XMPMetadata.load(xmpIs); } } catch (IOException e) {} @@ -256,29 +226,21 @@ public class PDFParser extends AbstractParser { // TODO: Move to description in Tika 2.0 addMetadata(metadata, TikaCoreProperties.TRANSITION_SUBJECT_TO_OO_SUBJECT, info.getSubject()); addMetadata(metadata, "trapped", info.getTrapped()); - try { // TODO Remove these in Tika 2.0 - addMetadata(metadata, "created", info.getCreationDate()); - addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); - } catch (IOException e) { - // Invalid date format, just ignore - } - try { - Calendar modified = info.getModificationDate(); - addMetadata(metadata, Metadata.LAST_MODIFIED, modified); - addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); - } catch (IOException e) { - // Invalid date format, just ignore - } + addMetadata(metadata, "created", info.getCreationDate()); + addMetadata(metadata, TikaCoreProperties.CREATED, info.getCreationDate()); + Calendar modified = info.getModificationDate(); + addMetadata(metadata, Metadata.LAST_MODIFIED, modified); + addMetadata(metadata, TikaCoreProperties.MODIFIED, modified); // All remaining metadata is custom // Copy this over as-is List<String> handledMetadata = Arrays.asList("Author", "Creator", "CreationDate", "ModDate", "Keywords", "Producer", "Subject", "Title", "Trapped"); - for (COSName key : info.getDictionary().keySet()) { + for (COSName key : info.getCOSObject().keySet()) { String name = key.getName(); if (!handledMetadata.contains(name)) { - addMetadata(metadata, name, info.getDictionary().getDictionaryObject(key)); + addMetadata(metadata, name, info.getCOSObject().getDictionaryObject(key)); } } @@ -315,7 +277,7 @@ public class PDFParser extends AbstractParser { } //TODO: Let's try to move this into PDFBox. //Attempt to determine Adobe extension level, if present: - COSDictionary root = document.getDocumentCatalog().getCOSDictionary(); + COSDictionary root = document.getDocumentCatalog().getCOSObject(); COSDictionary extensions = (COSDictionary) root.getDictionaryObject(COSName.getPDFName("Extensions")); if (extensions != null) { for (COSName extName : extensions.keySet()) { @@ -544,25 +506,6 @@ public class PDFParser extends AbstractParser { } /** - * @see #setUseNonSequentialParser(boolean) - * @deprecated use {@link #getPDFParserConfig()} - */ - public boolean getUseNonSequentialParser() { - return defaultConfig.getUseNonSequentialParser(); - } - - /** - * If true, the parser will use the NonSequentialParser. This may - * be faster than the full doc parser. - * If false (default), this will use the full doc parser. - * - * @deprecated use {@link #setPDFParserConfig(PDFParserConfig)} - */ - public void setUseNonSequentialParser(boolean v) { - defaultConfig.setUseNonSequentialParser(v); - } - - /** * @see #setEnableAutoSpace(boolean) * @deprecated use {@link #getPDFParserConfig()} */ http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index 2a650dd..ea43761 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -23,7 +23,7 @@ import java.io.Serializable; import java.util.Locale; import java.util.Properties; -import org.apache.pdfbox.util.PDFTextStripper; +import org.apache.pdfbox.text.PDFTextStripper; /** * Config for PDFParser. @@ -60,9 +60,6 @@ public class PDFParserConfig implements Serializable { // (necessary for some PDFs, but messes up other PDFs): private boolean sortByPosition = false; - //True if we should use PDFBox's NonSequentialParser - private boolean useNonSequentialParser = false; - //True if acroform content should be extracted private boolean extractAcroFormContent = true; @@ -130,9 +127,6 @@ public class PDFParserConfig implements Serializable { setSortByPosition( getProp(props.getProperty("sortByPosition"), getSortByPosition())); - setUseNonSequentialParser( - getProp(props.getProperty("useNonSequentialParser"), - getUseNonSequentialParser())); setExtractAcroFormContent( getProp(props.getProperty("extractAcroFormContent"), getExtractAcroFormContent())); @@ -165,7 +159,6 @@ public class PDFParserConfig implements Serializable { * @param pdf2XHTML */ public void configure(PDF2XHTML pdf2XHTML) { - pdf2XHTML.setForceParsing(true); pdf2XHTML.setSortByPosition(getSortByPosition()); if (getEnableAutoSpace()) { pdf2XHTML.setWordSeparator(" "); @@ -350,28 +343,6 @@ public class PDFParserConfig implements Serializable { } /** - * @see #setUseNonSequentialParser(boolean) - */ - public boolean getUseNonSequentialParser() { - return useNonSequentialParser; - } - - /** - * If true, uses PDFBox's non-sequential parser. - * The non-sequential parser should be much faster than the traditional - * full doc parser. However, until PDFBOX-XXX is fixed, - * the non-sequential parser fails - * to extract some document metadata. - * <p/> - * Default is false (use the traditional parser) - * - * @param useNonSequentialParser - */ - public void setUseNonSequentialParser(boolean useNonSequentialParser) { - this.useNonSequentialParser = useNonSequentialParser; - } - - /** * @see #setAverageCharTolerance(Float) */ public Float getAverageCharTolerance() { @@ -439,7 +410,6 @@ public class PDFParserConfig implements Serializable { + ((spacingTolerance == null) ? 0 : spacingTolerance.hashCode()); result = prime * result + (suppressDuplicateOverlappingText ? 1231 : 1237); - result = prime * result + (useNonSequentialParser ? 1231 : 1237); result = prime * result + (ifXFAExtractOnlyXFA ? 1231 : 1237); return result; } @@ -477,8 +447,6 @@ public class PDFParserConfig implements Serializable { return false; if (suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText) return false; - if (useNonSequentialParser != other.useNonSequentialParser) - return false; if (ifXFAExtractOnlyXFA != other.ifXFAExtractOnlyXFA) return false; @@ -491,7 +459,6 @@ public class PDFParserConfig implements Serializable { + ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText + ", extractAnnotationText=" + extractAnnotationText + ", sortByPosition=" + sortByPosition - + ", useNonSequentialParser=" + useNonSequentialParser + ", extractAcroFormContent=" + extractAcroFormContent + ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + ", extractInlineImages=" + extractInlineImages http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties index bcfe1c6..153950e 100644 --- a/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties +++ b/tika-parsers/src/main/resources/org/apache/tika/parser/pdf/PDFParser.properties @@ -17,7 +17,6 @@ enableAutoSpace true extractAnnotationText true sortByPosition false suppressDuplicateOverlappingText false -useNonSequentialParser false extractAcroFormContent true extractInlineImages false extractUniqueInlineImagesOnly true http://git-wip-us.apache.org/repos/asf/tika/blob/98eb56ec/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 47f3e0a..82a000c 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -22,14 +22,11 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; -import java.io.File; -import java.io.FileInputStream; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; @@ -39,7 +36,6 @@ import org.apache.log4j.Logger; import org.apache.tika.TikaTest; import org.apache.tika.exception.AccessPermissionException; import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.DocumentSelector; import org.apache.tika.extractor.ParserContainerExtractor; @@ -258,36 +254,6 @@ public class PDFParserTest extends TikaTest { //pdf:encrypted, X-Parsed-By and Content-Type assertEquals("very little metadata should be parsed", 3, metadata.names().length); assertEquals(0, content.length()); - - //now test wrong password with non sequential parser - handler = new BodyContentHandler(); - metadata = new Metadata(); - context = new ParseContext(); - context.set(PasswordProvider.class, new PasswordProvider() { - public String getPassword(Metadata metadata) { - return "WRONG!!!!"; - } - }); - PDFParserConfig config = new PDFParserConfig(); - config.setUseNonSequentialParser(true); - context.set(PDFParserConfig.class, config); - - ; - ex = false; - try (InputStream stream = PDFParserTest.class.getResourceAsStream( - "/test-documents/testPDF_protected.pdf")) { - parser.parse(stream, handler, metadata, context); - } catch (EncryptedDocumentException e) { - ex = true; - } - content = handler.toString(); - assertTrue("encryption exception", ex); - assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); - assertEquals("true", metadata.get("pdf:encrypted")); - - //pdf:encrypted, X-Parsed-By and Content-Type - assertEquals("very little metadata should be parsed", 3, metadata.names().length); - assertEquals(0, content.length()); } @Test @@ -612,85 +578,6 @@ public class PDFParserTest extends TikaTest { assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2)); } - /** - * tests for equality between traditional sequential parser - * and newer nonsequential parser. - * <p/> - * TODO: more testing - */ - @Test - public void testSequentialParser() throws Exception { - - Parser sequentialParser = new AutoDetectParser(); - Parser nonSequentialParser = new AutoDetectParser(); - - ParseContext seqContext = new ParseContext(); - PDFParserConfig seqConfig = new PDFParserConfig(); - seqConfig.setUseNonSequentialParser(false); - seqContext.set(PDFParserConfig.class, seqConfig); - - ParseContext nonSeqContext = new ParseContext(); - PDFParserConfig nonSeqConfig = new PDFParserConfig(); - nonSeqConfig.setUseNonSequentialParser(true); - nonSeqContext.set(PDFParserConfig.class, nonSeqConfig); - - File testDocs = new File(this.getClass().getResource("/test-documents").toURI()); - int pdfs = 0; - //empty as of PDFBox 1.8.11 - //leave this in for the 1.8.x series in case something new happens - Set<String> knownMetadataDiffs = new HashSet<String>(); - - //empty for now - Set<String> knownContentDiffs = new HashSet<String>(); - - for (File f : testDocs.listFiles()) { - if (!f.getName().toLowerCase(Locale.ROOT).endsWith(".pdf")) { - continue; - } - - String sequentialContent = null; - Metadata sequentialMetadata = new Metadata(); - try { - sequentialContent = getText(new FileInputStream(f), - sequentialParser, seqContext, sequentialMetadata); - } catch (EncryptedDocumentException e) { - //silently skip a file that requires a user password - continue; - } catch (Exception e) { - throw new TikaException("Sequential Parser failed on test file " + f, e); - } - - pdfs++; - - String nonSequentialContent = null; - Metadata nonSequentialMetadata = new Metadata(); - try { - nonSequentialContent = getText(new FileInputStream(f), - nonSequentialParser, nonSeqContext, nonSequentialMetadata); - } catch (Exception e) { - throw new TikaException("Non-Sequential Parser failed on test file " + f, e); - } - - if (knownContentDiffs.contains(f.getName())) { - assertFalse(f.getName(), sequentialContent.equals(nonSequentialContent)); - } else { - assertEquals(f.getName(), sequentialContent, nonSequentialContent); - } - - //skip this one file. - if (knownMetadataDiffs.contains(f.getName())) { - assertFalse(f.getName(), sequentialMetadata.equals(nonSequentialMetadata)); - } else { - assertEquals(f.getName(), sequentialMetadata, nonSequentialMetadata); - } - } - //make sure nothing went wrong with getting the resource to test-documents - //must have tested >= 15 pdfs - boolean ge15 = (pdfs >= 15); - assertTrue("Number of pdf files tested >= 15 in non-sequential parser test", ge15); - } - - // TIKA-973 //commented out until test documents that are unambiguously //consistent with Apache License v2.0 are contributed.
