Author: tallison
Date: Mon Jan 27 13:09:16 2014
New Revision: 1561661

URL: http://svn.apache.org/r1561661
Log:
TIKA-1226: PDF TextStripper fails when it encounters PDSignature Field.
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf (with props) Modified: tika/trunk/CHANGES.txt tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Modified: tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1561661&r1=1561660&r2=1561661&view=diff ============================================================================== --- tika/trunk/CHANGES.txt (original) +++ tika/trunk/CHANGES.txt Mon Jan 27 13:09:16 2014 @@ -1,5 +1,9 @@ Release 1.5 - Current Development + * Added option to use alternate NonSequentialPDFParser (TIKA-1201). + + * Content from PDF AcroForms is now extracted (TIKA-973). + * Fixed invalid asterisks from master slide in PPT (TIKA-1171). * Added test cases to confirm handling of auto-date in PPT and PPTX (TIKA-817). @@ -14,7 +18,7 @@ Release 1.5 - Current Development * Upgraded POI to 3.10-beta2 (TIKA-1173). - * Upgraded PDFBox to 1.8.2 (TIKA-1153). + * Upgraded PDFBox to 1.8.3 (TIKA-1200). * Made HtmlEncodingDetector more flexible in finding meta header charset (TIKA-1001). @@ -24,7 +28,7 @@ Release 1.5 - Current Development * Fixed bug that prevented attachments within a PDF from being processed if the PDF itself was an attachment (TIKA-1124). - * Text from paragraph-level structured document tags is now extracted (TIKA-1130). + * Text from paragraph-level structured document tags in DOCX files is now extracted (TIKA-1130). * RTF: Fixed ArrayIndexOutOfBoundsException when parsing list override (TIKA-1192). 
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1561661&r1=1561660&r2=1561661&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Jan 27 13:09:16 2014 @@ -18,10 +18,13 @@ package org.apache.tika.parser.pdf; import java.io.IOException; import java.io.Writer; +import java.text.SimpleDateFormat; +import java.util.Calendar; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.Map; +import java.util.TreeMap; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; @@ -35,11 +38,13 @@ import org.apache.pdfbox.pdmodel.interac import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup; +import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem; import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode; import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; import org.apache.pdfbox.pdmodel.interactive.form.PDField; +import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.pdfbox.util.TextPosition; import org.apache.tika.exception.TikaException; @@ -63,6 +68,11 @@ import org.xml.sax.helpers.AttributesImp class PDF2XHTML extends PDFTextStripper { /** + * format 
used for signature dates + */ + private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ"); + + /** * Maximum recursive depth during AcroForm processing. * Prevents theoretical AcroForm recursion bomb. */ @@ -446,36 +456,90 @@ class PDF2XHTML extends PDFTextStripper handler.endElement("ol"); } } + private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException{ + //Pick partial name to present in content and altName for attribute + //Ignoring FullyQualifiedName for now + String partName = field.getPartialName(); + String altName = field.getAlternateFieldName(); + + StringBuilder sb = new StringBuilder(); + AttributesImpl attrs = new AttributesImpl(); + + if (partName != null){ + sb.append(partName).append(": "); + } + if (altName != null){ + attrs.addAttribute("", "altName", "altName", "CDATA", altName); + } + //return early if PDSignature field + if (field instanceof PDSignatureField){ + handleSignature(attrs, (PDSignatureField)field, handler); + return; + } + try { + //getValue can throw an IOException if there is no value + String value = field.getValue(); + if (value != null && ! 
value.equals("null")){ + sb.append(value); + } + } catch (IOException e) { + //swallow + } - private void addFieldString(PDField field, XHTMLContentHandler handler) throws SAXException{ - //Pick partial name to present in content and altName for attribute - //Ignoring FullyQualifiedName for now - String partName = field.getPartialName(); - String altName = field.getAlternateFieldName(); + if (attrs.getLength() > 0 || sb.length() > 0){ + handler.startElement("li", attrs); + handler.characters(sb.toString()); + handler.endElement("li"); + } + } - StringBuilder sb = new StringBuilder(); - AttributesImpl attrs = new AttributesImpl(); + private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField, + XHTMLContentHandler handler) throws SAXException{ + - if (partName != null){ - sb.append(partName).append(": "); - } - if (altName != null){ - attrs.addAttribute("", "altName", "altName", "CDATA", altName); - } - String value = ""; - try { - value = field.getValue(); - } catch (IOException e) { - //swallow - } - - if (value != null && ! value.equals("null")){ - sb.append(value); - } - if (attrs.getLength() > 0 || sb.length() > 0){ - handler.startElement("li", attrs); - handler.characters(sb.toString()); - handler.endElement("li"); - } - } + PDSignature sig = sigField.getSignature(); + if (sig == null){ + return; + } + Map<String, String> vals= new TreeMap<String, String>(); + vals.put("name", sig.getName()); + vals.put("contactInfo", sig.getContactInfo()); + vals.put("location", sig.getLocation()); + vals.put("reason", sig.getReason()); + + Calendar cal = sig.getSignDate(); + if (cal != null){ + dateFormat.setTimeZone(cal.getTimeZone()); + vals.put("date", dateFormat.format(cal.getTime())); + } + //see if there is any data + int nonNull = 0; + for (String val : vals.keySet()){ + if (val != null && ! 
val.equals("")){ + nonNull++; + } + } + //if there is, process it + if (nonNull > 0){ + handler.startElement("li", parentAttributes); + + AttributesImpl attrs = new AttributesImpl(); + attrs.addAttribute("", "type", "type", "CDATA", "signaturedata"); + + handler.startElement("ol", attrs); + for (Map.Entry<String, String> e : vals.entrySet()){ + if (e.getValue() == null || e.getValue().equals("")){ + continue; + } + attrs = new AttributesImpl(); + attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey()); + handler.startElement("li", attrs); + handler.characters(e.getValue()); + handler.endElement("li"); + System.out.println("SIG DATA: " + e.getKey() + " : " + e.getValue()); + } + handler.endElement("ol"); + handler.endElement("li"); + } + } } Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1561661&r1=1561660&r2=1561661&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Mon Jan 27 13:09:16 2014 @@ -564,7 +564,7 @@ public class PDFParserTest extends TikaT //make sure nothing went wrong with getting the resource to test-documents //This will require modification with each new pdf test. //If this is too annoying, we can turn it off. - assertEquals("Number of pdf files tested", 14, pdfs); + assertEquals("Number of pdf files tested", 15, pdfs); } @@ -616,4 +616,13 @@ public class PDFParserTest extends TikaT stream.close(); } */ + + //TIKA-1226 + public void testSignatureInAcroForm() throws Exception{ + //The current test doc does not contain any content in the signature area. + //This just tests that a RuntimeException is not thrown. 
+ //TODO: find a better test file for this issue. + String xml = getXML("/testPDF_acroform3.pdf").xml; + assertTrue("found", (xml.indexOf("<li>aTextField: TIKA-1226</li>") > -1)); + } } Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf?rev=1561661&view=auto ============================================================================== Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream