Author: tallison
Date: Tue Dec 10 01:14:11 2013
New Revision: 1549727
URL: http://svn.apache.org/r1549727
Log:
TIKA-973 added basic extraction of pdf AcroForm content. Many thanks to Ben
Litchfield for org.apache.pdfbox.examples.fdf.PrintFields, on which this patch
relies.
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm1.pdf
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm2.pdf
(with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1549727&r1=1549726&r2=1549727&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
Tue Dec 10 01:14:11 2013
@@ -18,6 +18,9 @@ package org.apache.tika.parser.pdf;
import java.io.IOException;
import java.io.Writer;
+import java.util.Iterator;
+import java.util.List;
+import java.util.ListIterator;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -35,6 +38,8 @@ import org.apache.pdfbox.pdmodel.interac
import
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.tika.exception.TikaException;
@@ -48,6 +53,7 @@ import org.apache.tika.sax.EmbeddedConte
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -55,6 +61,13 @@ import org.xml.sax.SAXException;
* stream.
*/
class PDF2XHTML extends PDFTextStripper {
+
+ /**
+ * Maximum recursive depth during AcroForm processing.
+ * Prevents theoretical AcroForm recursion bomb.
+ */
+ private final static int MAX_ACROFORM_RECURSIONS = 10;
+
// TODO: remove once PDFBOX-1130 is fixed:
private boolean inParagraph = false;
@@ -165,6 +178,11 @@ class PDF2XHTML extends PDFTextStripper
// Extract text for any bookmarks:
extractBookmarkText();
extractEmbeddedDocuments(pdf, originalHandler);
+
+ //extract acroform data at end of doc
+ if (config.getExtractAcroFormContent() == true){
+ extractAcroForm(pdf, handler);
+ }
handler.endDocument();
} catch (TikaException e){
throw new IOExceptionWithCause("Unable to end a document", e);
@@ -360,4 +378,104 @@ class PDF2XHTML extends PDFTextStripper
}
}
}
+ private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler)
throws IOException,
+ SAXException {
+ //Thank you, Ben Litchfield, for
org.apache.pdfbox.examples.fdf.PrintFields
+ //this code derives from Ben's code
+ PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+
+ if (catalog == null)
+ return;
+
+ PDAcroForm form = catalog.getAcroForm();
+ if (form == null)
+ return;
+
+ @SuppressWarnings("rawtypes")
+ List fields = form.getFields();
+
+ if (fields == null)
+ return;
+
+ @SuppressWarnings("rawtypes")
+ ListIterator itr = fields.listIterator();
+
+ if (itr == null)
+ return;
+
+ handler.startElement("div", "class", "acroform");
+ handler.startElement("ol");
+ while (itr.hasNext()){
+ Object obj = itr.next();
+ if (obj != null && obj instanceof PDField){
+ processAcroField((PDField)obj, handler, 0);
+ }
+ }
+ handler.endElement("ol");
+ handler.endElement("div");
+ }
+
+ private void processAcroField(PDField field, XHTMLContentHandler handler,
final int recurseDepth)
+ throws SAXException, IOException {
+
+ if (recurseDepth >= MAX_ACROFORM_RECURSIONS){
+ return;
+ }
+
+ addFieldString(field, handler);
+
+ @SuppressWarnings("rawtypes")
+ List kids = field.getKids();
+ if(kids != null){
+
+ @SuppressWarnings("rawtypes")
+ Iterator kidsIter = kids.iterator();
+ if (kidsIter == null){
+ return;
+ }
+ int r = recurseDepth+1;
+ handler.startElement("ol");
+ while(kidsIter.hasNext()){
+ Object pdfObj = kidsIter.next();
+ if(pdfObj != null && pdfObj instanceof PDField){
+ PDField kid = (PDField)pdfObj;
+ //recurse
+ processAcroField(kid, handler, r);
+ }
+ }
+ handler.endElement("ol");
+ }
+ }
+
+ private void addFieldString(PDField field, XHTMLContentHandler handler)
throws SAXException{
+ //Pick partial name to present in content and altName for attribute
+ //Ignoring FullyQualifiedName for now
+ String partName = field.getPartialName();
+ String altName = field.getAlternateFieldName();
+
+ StringBuilder sb = new StringBuilder();
+ AttributesImpl attrs = new AttributesImpl();
+
+ if (partName != null){
+ sb.append(partName).append(": ");
+ }
+ if (altName != null){
+ attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+ }
+ String value = "";
+ try {
+ value = field.getValue();
+ } catch (IOException e) {
+ //swallow: a field whose value can't be read is reported by name only
+ }
+
+ if (value != null && ! value.equals("null")){
+ sb.append(value);
+ }
+ if (attrs.getLength() > 0 || sb.length() > 0){
+ handler.startElement("li", attrs);
+ handler.characters(sb.toString());
+ handler.endElement("li");
+ }
+ }
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java?rev=1549727&r1=1549726&r2=1549727&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
Tue Dec 10 01:14:11 2013
@@ -59,6 +59,9 @@ public class PDFParserConfig implements
//True if we should use PDFBox's NonSequentialParser
private boolean useNonSequentialParser = false;
+
+ //True if acroform content should be extracted
+ private boolean extractAcroFormContent = true;
public PDFParserConfig(){
init(this.getClass().getResourceAsStream("PDFParser.properties"));
@@ -108,6 +111,26 @@ public class PDFParserConfig implements
setUseNonSequentialParser(
getProp(props.getProperty("useNonSequentialParser"),
getUseNonSequentialParser()));
+ setExtractAcroFormContent(
+ getProp(props.getProperty("extractAcroFormContent"),
+ getExtractAcroFormContent()));
+ }
+
+
+ /**
+ * If true (the default), extract content from AcroForms
+ * at the end of the document.
+ *
+ * @param extractAcroFormContent true to extract AcroForm content, false to skip it
+ */
+ public void setExtractAcroFormContent(boolean extractAcroFormContent) {
+ this.extractAcroFormContent = extractAcroFormContent;
+
+ }
+
+ /** @see #setExtractAcroFormContent(boolean) */
+ public boolean getExtractAcroFormContent() {
+ return extractAcroFormContent;
}
/** @see #setEnableAutoSpace. */
@@ -210,6 +233,7 @@ public class PDFParserConfig implements
final int prime = 31;
int result = 1;
result = prime * result + (enableAutoSpace ? 1231 : 1237);
+ result = prime * result + (extractAcroFormContent ? 1231 : 1237);
result = prime * result + (extractAnnotationText ? 1231 : 1237);
result = prime * result + (sortByPosition ? 1231 : 1237);
result = prime * result
@@ -229,6 +253,8 @@ public class PDFParserConfig implements
PDFParserConfig other = (PDFParserConfig) obj;
if (enableAutoSpace != other.enableAutoSpace)
return false;
+ if (extractAcroFormContent != other.extractAcroFormContent)
+ return false;
if (extractAnnotationText != other.extractAnnotationText)
return false;
if (sortByPosition != other.sortByPosition)
@@ -246,7 +272,10 @@ public class PDFParserConfig implements
+ ", suppressDuplicateOverlappingText="
+ suppressDuplicateOverlappingText + ", extractAnnotationText="
+ extractAnnotationText + ", sortByPosition=" + sortByPosition
- + ", useNonSequentialParser=" + useNonSequentialParser + "]";
+ + ", useNonSequentialParser=" + useNonSequentialParser
+ + ", extractAcroFormContent=" + extractAcroFormContent + "]";
}
+
+
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1549727&r1=1549726&r2=1549727&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Tue Dec 10 01:14:11 2013
@@ -502,33 +502,36 @@ public class PDFParserTest extends TikaT
PDFParserConfig config = new PDFParserConfig();
config.setUseNonSequentialParser(true);
context.set(PDFParserConfig.class, config);
-
+
File testDocs = new
File(this.getClass().getResource("/test-documents").toURI());
int pdfs = 0;
Set<String> knownMetadataDiffs = new HashSet<String>();
//PDFBox-1792/Tika-1203
knownMetadataDiffs.add("testAnnotations.pdf");
-
+ //PDFBox-1806
+ knownMetadataDiffs.add("test_acroForm2.pdf");
+
//empty for now
Set<String> knownContentDiffs = new HashSet<String>();
-
+
for (File f : testDocs.listFiles()){
if (! f.getName().toLowerCase().endsWith(".pdf")){
continue;
}
+
pdfs++;
Metadata defaultMetadata = new Metadata();
String defaultContent = getText(new FileInputStream(f),
defaultParser, defaultMetadata);
Metadata sequentialMetadata = new Metadata();
String sequentialContent = getText(new FileInputStream(f),
sequentialParser, context, sequentialMetadata);
-
+
if (knownContentDiffs.contains(f.getName())){
assertFalse(f.getName(),
defaultContent.equals(sequentialContent));
} else {
assertEquals(f.getName(), defaultContent, sequentialContent);
}
-
+
//skip this one file.
if (knownMetadataDiffs.contains(f.getName())){
assertFalse(f.getName(),
defaultMetadata.equals(sequentialMetadata));
@@ -537,8 +540,53 @@ public class PDFParserTest extends TikaT
}
}
//make sure nothing went wrong with getting the resource to
test-documents
- assertEquals("Number of pdf files tested", 14, pdfs);
+ assertEquals("Number of pdf files tested", 16, pdfs);
}
+ // TIKA-973
+ public void testAcroForm() throws Exception{
+ Parser p = new AutoDetectParser();
+ ParseContext context = new ParseContext();
+ InputStream stream =
getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+ String txt = getText(stream, p, context);
+ stream.close();
+
+ //simple first level form contents
+ assertContains("to: John Doe", txt);
+ //checkbox
+ assertContains("xpackaging: Yes", txt);
+
+ //this verifies that the form processor
+ //recursed into child fields at least once...i.e. it didn't just
+ //stop at the top-level fields
+ stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+ txt = getText(stream, p, context);
+ stream.close();
+ assertContains("123 Main St.", txt);
+
+
+ //now test with nonsequential parser
+ PDFParserConfig config = new PDFParserConfig();
+ config.setUseNonSequentialParser(true);
+ context.set(PDFParserConfig.class, config);
+ stream = getResourceAsStream("/test-documents/testPDF_acroForm1.pdf");
+ txt = getText(stream, p, context);
+ stream.close();
+
+ //simple first level form contents
+ assertContains("to: John Doe", txt);
+ //checkbox
+ assertContains("xpackaging: Yes", txt);
+
+ //this verifies that the form processor
+ //recursed into child fields at least once...i.e. it didn't just
+ //stop at the top-level fields
+ stream = getResourceAsStream("/test-documents/testPDF_acroForm2.pdf");
+ txt = getText(stream, p, context);
+ assertContains("123 Main St.", txt);
+ stream.close();
+
+
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm1.pdf
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm1.pdf?rev=1549727&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm1.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm2.pdf
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm2.pdf?rev=1549727&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroForm2.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream