svn commit: r1561661 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/resources/test-documents/

tallison Mon, 27 Jan 2014 05:11:12 -0800

Author: tallison
Date: Mon Jan 27 13:09:16 2014
New Revision: 1561661

URL: http://svn.apache.org/r1561661
Log:
TIKA-1226: PDF TextStripper fails when it encounters PDSignature Field.


Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf   (with props)
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1561661&r1=1561660&r2=1561661&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Jan 27 13:09:16 2014
@@ -1,5 +1,9 @@
 Release 1.5 - Current Development
 
+  * Added option to use alternate NonSequentialPDFParser (TIKA-1201).
+
+  * Content from PDF AcroForms is now extracted (TIKA-973).
+
   * Fixed invalid asterisks from master slide in PPT (TIKA-1171).
 
   * Added test cases to confirm handling of auto-date in PPT and PPTX (TIKA-817).
@@ -14,7 +18,7 @@ Release 1.5 - Current Development
 
   * Upgraded POI to 3.10-beta2 (TIKA-1173).
 
-  * Upgraded PDFBox to 1.8.2 (TIKA-1153).
+  * Upgraded PDFBox to 1.8.3 (TIKA-1200).
 
   * Made HtmlEncodingDetector more flexible in finding meta 
     header charset (TIKA-1001).
@@ -24,7 +28,7 @@ Release 1.5 - Current Development
   * Fixed bug that prevented attachments within a PDF from being processed
     if the PDF itself was an attachment (TIKA-1124).
 
-  * Text from paragraph-level structured document tags is now extracted (TIKA-1130).
+  * Text from paragraph-level structured document tags in DOCX files is now extracted (TIKA-1130).
 
   * RTF: Fixed ArrayIndexOutOfBoundsException when parsing list override (TIKA-1192).
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1561661&r1=1561660&r2=1561661&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Jan 27 13:09:16 2014
@@ -18,10 +18,13 @@ package org.apache.tika.parser.pdf;
 
 import java.io.IOException;
 import java.io.Writer;
+import java.text.SimpleDateFormat;
+import java.util.Calendar;
 import java.util.Iterator;
 import java.util.List;
 import java.util.ListIterator;
 import java.util.Map;
+import java.util.TreeMap;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
@@ -35,11 +38,13 @@ import org.apache.pdfbox.pdmodel.interac
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
 import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
 import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import 
org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
 import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
 import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
 import org.apache.pdfbox.util.PDFTextStripper;
 import org.apache.pdfbox.util.TextPosition;
 import org.apache.tika.exception.TikaException;
@@ -63,6 +68,11 @@ import org.xml.sax.helpers.AttributesImp
 class PDF2XHTML extends PDFTextStripper {
     
     /**
+     * format used for signature dates
+     */
+    private final SimpleDateFormat dateFormat = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
+ 
+    /**
      * Maximum recursive depth during AcroForm processing.
      * Prevents theoretical AcroForm recursion bomb. 
      */
@@ -446,36 +456,90 @@ class PDF2XHTML extends PDFTextStripper 
              handler.endElement("ol");
           }
       }
+    private void addFieldString(PDField field, XHTMLContentHandler handler) 
throws SAXException{
+        //Pick partial name to present in content and altName for attribute
+        //Ignoring FullyQualifiedName for now
+        String partName = field.getPartialName();
+        String altName = field.getAlternateFieldName();
+
+        StringBuilder sb = new StringBuilder();
+        AttributesImpl attrs = new AttributesImpl();
+
+        if (partName != null){
+            sb.append(partName).append(": ");
+        }
+        if (altName != null){
+            attrs.addAttribute("", "altName", "altName", "CDATA", altName);
+        }
+        //return early if PDSignature field
+        if (field instanceof PDSignatureField){
+            handleSignature(attrs, (PDSignatureField)field, handler);
+            return;
+        }
+        try {
+            //getValue can throw an IOException if there is no value
+            String value = field.getValue();
+            if (value != null && ! value.equals("null")){
+                sb.append(value);
+            }
+        } catch (IOException e) {
+            //swallow
+        }
 
-      private void addFieldString(PDField field, XHTMLContentHandler handler) 
throws SAXException{
-          //Pick partial name to present in content and altName for attribute
-          //Ignoring FullyQualifiedName for now
-          String partName = field.getPartialName();
-          String altName = field.getAlternateFieldName();
+        if (attrs.getLength() > 0 || sb.length() > 0){
+            handler.startElement("li", attrs);
+            handler.characters(sb.toString());
+            handler.endElement("li");
+        }
+    }
 
-          StringBuilder sb = new StringBuilder();
-          AttributesImpl attrs = new AttributesImpl();
+    private void handleSignature(AttributesImpl parentAttributes, 
PDSignatureField sigField,
+            XHTMLContentHandler handler) throws SAXException{
+       
 
-          if (partName != null){
-             sb.append(partName).append(": ");
-          }
-          if (altName != null){
-             attrs.addAttribute("", "altName", "altName", "CDATA", altName);
-          }
-          String value = "";
-          try {
-              value = field.getValue();
-          } catch (IOException e) {
-               //swallow
-          }
-          
-          if (value != null && ! value.equals("null")){
-              sb.append(value);
-          }
-          if (attrs.getLength() > 0 || sb.length() > 0){
-              handler.startElement("li", attrs);
-              handler.characters(sb.toString());
-              handler.endElement("li");
-          }
-      }
+        PDSignature sig = sigField.getSignature();
+        if (sig == null){
+            return;
+        }
+        Map<String, String> vals= new TreeMap<String, String>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null){
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+        //see if there is any data
+        int nonNull = 0;
+        for (String val : vals.keySet()){
+            if (val != null && ! val.equals("")){
+                nonNull++;
+            }
+        }
+        //if there is, process it
+        if (nonNull > 0){
+            handler.startElement("li", parentAttributes);
+
+            AttributesImpl attrs = new AttributesImpl();
+            attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+            handler.startElement("ol", attrs);
+            for (Map.Entry<String, String> e : vals.entrySet()){
+                if (e.getValue() == null || e.getValue().equals("")){
+                    continue;
+                }
+                attrs = new AttributesImpl();
+                attrs.addAttribute("", "signdata", "signdata", "CDATA", 
e.getKey());
+                handler.startElement("li", attrs);
+                handler.characters(e.getValue());
+                handler.endElement("li");
+                System.out.println("SIG DATA: " + e.getKey() + " : " + 
e.getValue());
+            }
+            handler.endElement("ol");
+            handler.endElement("li");
+        }
+    }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1561661&r1=1561660&r2=1561661&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Mon Jan 27 13:09:16 2014
@@ -564,7 +564,7 @@ public class PDFParserTest extends TikaT
         //make sure nothing went wrong with getting the resource to 
test-documents
         //This will require modification with each new pdf test.
         //If this is too annoying, we can turn it off.
-        assertEquals("Number of pdf files tested", 14, pdfs);
+        assertEquals("Number of pdf files tested", 15, pdfs);
     }
 
 
@@ -616,4 +616,13 @@ public class PDFParserTest extends TikaT
        stream.close();     
     }
 */
+
+    //TIKA-1226
+    public void testSignatureInAcroForm() throws Exception{
+        //The current test doc does not contain any content in the signature 
area.
+        //This just tests that a RuntimeException is not thrown.
+        //TODO: find a better test file for this issue.
+        String xml = getXML("/testPDF_acroform3.pdf").xml;
+        assertTrue("found", (xml.indexOf("<li>aTextField: TIKA-1226</li>") > 
-1));
+    }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf?rev=1561661&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_acroform3.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

svn commit: r1561661 - in /tika/trunk: ./ tika-parsers/src/main/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/java/org/apache/tika/parser/pdf/ tika-parsers/src/test/resources/test-documents/

Reply via email to