VisualSignatureParser.java

adam Fri, 15 Apr 2011 19:52:54 -0700

Author: adam
Date: Sat Apr 16 02:52:25 2011
New Revision: 1092856

URL: http://svn.apache.org/viewvc?rev=1092856&view=rev
Log:
PDFBOX-912: PDF signing interface and improvements


Added:
    
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java

Added: 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java?rev=1092856&view=auto
==============================================================================
--- 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java
 (added)
+++ 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java
 Sat Apr 16 02:52:25 2011
@@ -0,0 +1,228 @@
+package org.apache.pdfbox.pdfparser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.pdfwriter.COSWriter;
+import org.apache.pdfbox.persistence.util.COSObjectKey;
+
+public class VisualSignatureParser extends BaseParser {
+
+    /**
+     * Log instance. It is registered under this parser's own class so that
+     * the warnings emitted while parsing are attributed to
+     * VisualSignatureParser rather than to PDFParser.
+     */
+    private static final Log log = LogFactory.getLog(VisualSignatureParser.class);
+
+    /**
+     * Creates a parser that reads from the given stream.
+     *
+     * @param input the stream that is passed on to the BaseParser constructor.
+     * @throws IOException if the BaseParser constructor fails.
+     */
+    public VisualSignatureParser(InputStream input) throws IOException {
+        super(input);
+    }
+
+    /**
+     * Parses the input into a freshly created COSDocument.
+     *
+     * The stream is first advanced to the next recognised keyword or object
+     * header, then objects are parsed one after another until parseObject()
+     * reports the end of the file or the stream is exhausted. An object that
+     * fails to parse is logged as a warning and skipped.
+     *
+     * @throws IOException if reading fails outside of a single object and the
+     * end of the file has not been reached yet.
+     */
+    public void parse() throws IOException {
+        document = new COSDocument();
+        skipToNextObj();
+
+        boolean reachedEndOfFile = false;
+        try {
+            // stop as soon as the EOF marker was parsed or no data is left
+            while(!reachedEndOfFile && !pdfSource.isEOF()) {
+                try {
+                    reachedEndOfFile = parseObject();
+                } catch(IOException parseFailure) {
+                    /*
+                     * A warning is written to the log that we skipped over
+                     * an object, then parsing resumes at the next object.
+                     */
+                    log.warn("Parsing Error, Skipping Object", parseFailure);
+                    skipToNextObj();
+                }
+                skipSpaces();
+            }
+        } catch(IOException e) {
+            /*
+             * PDF files may have random data after the EOF marker. Errors
+             * are only propagated if the last object processed was not EOF.
+             */
+            if(!reachedEndOfFile) {
+                throw e;
+            }
+        }
+    }
+
+    private void skipToNextObj() throws IOException {
+        byte[] b = new byte[16];
+        Pattern p = Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);
+        /* Read a buffer of data each time to see if it starts with a
+         * known keyword. This is not the most efficient design, but we should
+         * rarely be needing this function. We could update this to use the
+         * circular buffer, like in readUntilEndStream().
+         */
+        while(!pdfSource.isEOF()) {
+            int l = pdfSource.read(b);
+            if(l < 1) {
+                break;
+            }
+            String s = new String(b, "US-ASCII");
+            if(s.startsWith("trailer")
+                    || s.startsWith("xref")
+                    || s.startsWith("startxref")
+                    || s.startsWith("stream")
+                    || p.matcher(s).matches()) {
+                pdfSource.unread(b);
+                break;
+            } else {
+                pdfSource.unread(b, 1, l - 1);
+            }
+        }
+    }
+
+    /**
+     * Parses the next element found in the stream.
+     *
+     * Leading "endobj"/"endstream" style tokens (anything starting with 'e')
+     * are skipped. An xref or trailer section ends parsing without being
+     * read. A token starting with 's' is handled as the startxref section
+     * and must be followed by the "%%EOF" marker. Everything else is parsed
+     * as an object, optionally preceded by "number generation obj", and
+     * stored in the document's object pool flagged as needing an update.
+     *
+     * @return true if an xref section, a trailer or the end of file marker
+     * was reached, false otherwise (including when the stream is exhausted).
+     * @throws IOException if the data does not have the expected structure
+     * or the stream cannot be read.
+     */
+    private boolean parseObject() throws IOException {
+        boolean isEndOfFile = false;
+        skipSpaces();
+        //peek at the next character to determine the type of object we are parsing
+        char peekedChar = (char) pdfSource.peek();
+
+        //ignore endobj and endstream sections.
+        while(peekedChar == 'e') {
+            //there are times when there are multiple endobj, so lets
+            //just read them and move on.
+            readString();
+            skipSpaces();
+            peekedChar = (char) pdfSource.peek();
+        }
+        if(pdfSource.isEOF()) {
+            // end of file: nothing left to parse, fall through and return false.
+        } else if(peekedChar == 'x' || peekedChar == 't') {
+            //xref table or trailer. Note: their contents are currently ignored
+            return true;
+        } else if(peekedChar == 's') {
+            // Note: startxref can occur in either a trailer section or by itself
+            skipToNextObj();
+            //verify that EOF exists
+            String eof = readExpectedString("%%EOF");
+            if(eof.indexOf("%%EOF") == -1 && !pdfSource.isEOF()) {
+                throw new IOException("expected='%%EOF' actual='" + eof + "' next=" + readString()
+                        + " next=" + readString());
+            }
+            isEndOfFile = true;
+        } else {
+            //we are going to parse a normal object
+            int number = -1;
+            int genNum = -1;
+            boolean missingObjectNumber = false;
+            try {
+                char peeked = (char) pdfSource.peek();
+                if(peeked == '<') {
+                    // the object starts directly with its content, there is
+                    // no "number generation obj" header; the key stays -1/-1
+                    missingObjectNumber = true;
+                } else {
+                    number = readInt();
+                }
+            } catch(IOException e) {
+                //ok for some reason "GNU Ghostscript 5.10" puts two endobj
+                //statements after an object, of course this is nonsense
+                //but because we want to support as many PDFs as possible
+                //we will simply try again
+                number = readInt();
+            }
+            if(!missingObjectNumber) {
+                skipSpaces();
+                genNum = readInt();
+
+                String objectKey = readString(3);
+                if(!objectKey.equals("obj")) {
+                    throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
+                }
+            }
+
+            skipSpaces();
+            COSBase pb = parseDirObject();
+            String endObjectKey = readString();
+
+            if(endObjectKey.equals("stream")) {
+                // push the keyword (preceded by a space) back so that
+                // parseCOSStream() can read it itself. The bytes are encoded
+                // with ISO-8859-1, which maps every char below 256 to a
+                // single byte, instead of the platform default charset.
+                pdfSource.unread(endObjectKey.getBytes("ISO-8859-1"));
+                pdfSource.unread(' ');
+                if(pb instanceof COSDictionary) {
+                    pb = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile());
+                } else {
+                    // this is not legal
+                    // the combination of a dict and the stream/endstream forms a complete stream object
+                    throw new IOException("stream not preceded by dictionary");
+                }
+                endObjectKey = readString();
+            }
+
+            COSObjectKey key = new COSObjectKey(number, genNum);
+            COSObject pdfObject = document.getObjectFromPool(key);
+            pb.setNeedToBeUpdate(true);
+            pdfObject.setObject(pb);
+
+            if(!endObjectKey.equals("endobj")) {
+                if(endObjectKey.startsWith("endobj")) {
+                    /*
+                     * Some PDF files don't contain a new line after endobj so we
+                     * need to make sure that the next object number is getting read
+                     * separately and not part of the endobj keyword. Ex. Some files
+                     * would have "endobj28" instead of "endobj"
+                     */
+                    pdfSource.unread(endObjectKey.substring(6).getBytes("ISO-8859-1"));
+                } else if(!pdfSource.isEOF()) {
+                    try {
+                        //It is possible that the endobj is missing, there
+                        //are several PDFs out there that do that so skip it and move on.
+                        //A numeric token is taken to be the start of the next
+                        //object and is pushed back followed by a space.
+                        Float.parseFloat(endObjectKey);
+                        pdfSource.unread(COSWriter.SPACE);
+                        pdfSource.unread(endObjectKey.getBytes("ISO-8859-1"));
+                    } catch(NumberFormatException e) {
+                        //we will try again in case there was some garbage which
+                        //some writers will leave behind.
+                        String secondEndObjectKey = readString();
+                        if(!secondEndObjectKey.equals("endobj")) {
+                            if(isClosing()) {
+                                //found a case with 17506.pdf object 41 that was like this
+                                //41 0 obj [/Pattern /DeviceGray] ] endobj
+                                //notice the second array close, here we are reading it
+                                //and ignoring and attempting to continue
+                                pdfSource.read();
+                            }
+                            skipSpaces();
+                            String thirdPossibleEndObj = readString();
+                            if(!thirdPossibleEndObj.equals("endobj")) {
+                                throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' "
+                                        + "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
+                            }
+                        }
+                    }
+                }
+            }
+            skipSpaces();
+        }
+        return isEndOfFile;
+    }
+
+    /**
+     * Returns the document that was built by parse().
+     *
+     * @return the parsed document, never null.
+     * @throws IOException if no document has been created yet, i.e. parse()
+     * was not called before.
+     */
+    public COSDocument getDocument() throws IOException {
+        if(document != null) {
+            return document;
+        }
+        throw new IOException("You must call parse() before calling getDocument()");
+    }
+}

svn commit: r1092856 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java

Reply via email to