Author: adam
Date: Sat Apr 16 02:52:25 2011
New Revision: 1092856
URL: http://svn.apache.org/viewvc?rev=1092856&view=rev
Log:
PDFBOX-912: PDF signing interface and improvements
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java?rev=1092856&view=auto
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java
(added)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/VisualSignatureParser.java
Sat Apr 16 02:52:25 2011
@@ -0,0 +1,228 @@
+package org.apache.pdfbox.pdfparser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.pdfwriter.COSWriter;
+import org.apache.pdfbox.persistence.util.COSObjectKey;
+
+/**
+ * Lenient, object-by-object parser introduced with the PDF signing work
+ * (PDFBOX-912).
+ *
+ * <p>{@link #parse()} reads the objects found in the input stream into a
+ * fresh {@link COSDocument}. Every parsed object is flagged via
+ * {@code setNeedToBeUpdate(true)} before it is stored in the document's
+ * object pool. Objects that fail to parse are logged and skipped.</p>
+ */
+public class VisualSignatureParser extends BaseParser {
+
+    /**
+     * Log instance.
+     */
+    private static final Log LOG = LogFactory.getLog(VisualSignatureParser.class);
+
+    /**
+     * Matches a buffer that begins with an object header such as
+     * {@code "12 0 obj"}. Compiled once because the pattern never changes.
+     */
+    private static final Pattern OBJECT_HEADER =
+            Pattern.compile("\\d+\\s+\\d+\\s+obj.*", Pattern.DOTALL);
+
+    /** Number of bytes inspected at a time while resynchronising. */
+    private static final int LOOKAHEAD_SIZE = 16;
+
+    /**
+     * Charset used when pushing previously read keywords back into the
+     * stream. A fixed single-byte charset is used instead of the platform
+     * default so the pushed-back bytes do not depend on JVM locale settings.
+     */
+    private static final String PUSHBACK_CHARSET = "ISO-8859-1";
+
+    /**
+     * Creates a parser reading from the given stream.
+     *
+     * @param input the stream to parse.
+     * @throws IOException if the base parser cannot be set up on the stream.
+     */
+    public VisualSignatureParser(InputStream input) throws IOException {
+        super(input);
+    }
+
+    /**
+     * Parses the stream into a new {@link COSDocument}, which is afterwards
+     * available through {@link #getDocument()}.
+     *
+     * <p>An object that cannot be parsed is logged as a warning and skipped;
+     * parsing resumes at the next recognisable keyword or object header.</p>
+     *
+     * @throws IOException if reading fails before the end-of-file marker
+     *         has been processed.
+     */
+    public void parse() throws IOException {
+        document = new COSDocument();
+        skipToNextObj();
+
+        boolean wasLastParsedObjectEOF = false;
+        try {
+            while (!wasLastParsedObjectEOF && !pdfSource.isEOF()) {
+                try {
+                    wasLastParsedObjectEOF = parseObject();
+                } catch (IOException e) {
+                    // Report that an object was skipped, then resynchronise
+                    // on the next keyword / object header.
+                    LOG.warn("Parsing Error, Skipping Object", e);
+                    skipToNextObj();
+                }
+                skipSpaces();
+            }
+        } catch (IOException e) {
+            // PDF files may have random data after the EOF marker. Ignore
+            // errors if the last object processed was the EOF marker.
+            if (!wasLastParsedObjectEOF) {
+                throw e;
+            }
+        }
+    }
+
+    /**
+     * Advances the stream, one byte at a time, until the upcoming bytes start
+     * with {@code trailer}, {@code xref}, {@code startxref}, {@code stream}
+     * or an object header ({@code "<num> <gen> obj"}), or until the end of
+     * the stream is reached. The matching bytes are left unread.
+     *
+     * <p>This is not the most efficient design, but the method is only
+     * needed for resynchronisation.</p>
+     *
+     * @throws IOException if reading from or pushing back into the stream fails.
+     */
+    private void skipToNextObj() throws IOException {
+        byte[] buffer = new byte[LOOKAHEAD_SIZE];
+        while (!pdfSource.isEOF()) {
+            int length = pdfSource.read(buffer);
+            if (length < 1) {
+                break;
+            }
+            // Only the first 'length' bytes were filled by this read; the
+            // rest of the buffer holds stale data from an earlier iteration
+            // and must neither be matched against nor pushed back.
+            String lookahead = new String(buffer, 0, length, "US-ASCII");
+            if (lookahead.startsWith("trailer")
+                    || lookahead.startsWith("xref")
+                    || lookahead.startsWith("startxref")
+                    || lookahead.startsWith("stream")
+                    || OBJECT_HEADER.matcher(lookahead).matches()) {
+                pdfSource.unread(buffer, 0, length);
+                break;
+            }
+            // No match: drop the first byte and retry from the next one.
+            pdfSource.unread(buffer, 1, length - 1);
+        }
+    }
+
+    /**
+     * Parses the next item in the stream.
+     *
+     * @return {@code true} when parsing should stop: an {@code xref} or
+     *         {@code trailer} section was reached, or a section starting
+     *         with {@code 's'} (e.g. {@code startxref}) was processed up to
+     *         the {@code %%EOF} marker; {@code false} otherwise.
+     * @throws IOException if the stream content cannot be parsed.
+     */
+    private boolean parseObject() throws IOException {
+        skipSpaces();
+        // peek at the next character to determine the type of object we are parsing
+        char peekedChar = (char) pdfSource.peek();
+
+        // ignore endobj and endstream sections.
+        while (peekedChar == 'e') {
+            // there are times when there are multiple endobj, so lets
+            // just read them and move on.
+            readString();
+            skipSpaces();
+            peekedChar = (char) pdfSource.peek();
+        }
+
+        if (pdfSource.isEOF()) {
+            // end of stream: nothing parsed, and no EOF marker was seen.
+            return false;
+        }
+        if (peekedChar == 'x' || peekedChar == 't') {
+            // xref table or trailer. Note: their contents are currently ignored.
+            return true;
+        }
+        if (peekedChar == 's') {
+            // Note: startxref can occur in either a trailer section or by itself
+            skipToNextObj();
+            // verify that EOF exists
+            String eof = readExpectedString("%%EOF");
+            if (eof.indexOf("%%EOF") == -1 && !pdfSource.isEOF()) {
+                throw new IOException("expected='%%EOF' actual='" + eof
+                        + "' next=" + readString()
+                        + " next=" + readString());
+            }
+            return true;
+        }
+
+        // we are going to parse a normal object
+        int number = -1;
+        int genNum = -1;
+        boolean missingObjectNumber = false;
+        try {
+            char peeked = (char) pdfSource.peek();
+            if (peeked == '<') {
+                // a bare dictionary without "<num> <gen> obj" in front of it
+                missingObjectNumber = true;
+            } else {
+                number = readInt();
+            }
+        } catch (IOException e) {
+            // ok for some reason "GNU Ghostscript 5.10" puts two endobj
+            // statements after an object, of course this is nonsense
+            // but because we want to support as many PDFs as possible
+            // we will simply try again
+            number = readInt();
+        }
+        if (!missingObjectNumber) {
+            skipSpaces();
+            genNum = readInt();
+
+            String objectKey = readString(3);
+            if (!objectKey.equals("obj")) {
+                throw new IOException("expected='obj' actual='" + objectKey + "' " + pdfSource);
+            }
+        }
+        // otherwise number and genNum keep their initial value of -1
+
+        skipSpaces();
+        COSBase pb = parseDirObject();
+        String endObjectKey = readString();
+
+        if (endObjectKey.equals("stream")) {
+            // push the keyword back (preceded by a space) so that the stream
+            // parser sees it again
+            pdfSource.unread(endObjectKey.getBytes(PUSHBACK_CHARSET));
+            pdfSource.unread(' ');
+            if (pb instanceof COSDictionary) {
+                pb = parseCOSStream((COSDictionary) pb, getDocument().getScratchFile());
+            } else {
+                // this is not legal
+                // the combination of a dict and the stream/endstream forms a complete stream object
+                throw new IOException("stream not preceded by dictionary");
+            }
+            endObjectKey = readString();
+        }
+
+        COSObjectKey key = new COSObjectKey(number, genNum);
+        COSObject pdfObject = document.getObjectFromPool(key);
+        pb.setNeedToBeUpdate(true);
+        pdfObject.setObject(pb);
+
+        consumeEndObj(endObjectKey);
+        skipSpaces();
+        return false;
+    }
+
+    /**
+     * Handles the token read after an object's content, tolerating the
+     * malformed object endings found in real-world files.
+     *
+     * @param endObjectKey the token that was read where {@code endobj} is expected.
+     * @throws IOException if no {@code endobj} can be found after the
+     *         tolerated amount of garbage, or if the stream cannot be read.
+     */
+    private void consumeEndObj(String endObjectKey) throws IOException {
+        if (endObjectKey.equals("endobj")) {
+            return;
+        }
+        if (endObjectKey.startsWith("endobj")) {
+            // Some PDF files don't contain a new line after endobj so we
+            // need to make sure that the next object number is getting read
+            // separately and not as part of the endobj keyword. Ex. some
+            // files would have "endobj28" instead of "endobj"
+            pdfSource.unread(endObjectKey.substring(6).getBytes(PUSHBACK_CHARSET));
+        } else if (!pdfSource.isEOF()) {
+            try {
+                // It is possible that the endobj is missing, there are
+                // several PDFs out there that do that, so if the token is a
+                // number push it back (it starts the next object) and move on.
+                Float.parseFloat(endObjectKey);
+                pdfSource.unread(COSWriter.SPACE);
+                pdfSource.unread(endObjectKey.getBytes(PUSHBACK_CHARSET));
+            } catch (NumberFormatException e) {
+                // we will try again in case there was some garbage which
+                // some writers will leave behind.
+                String secondEndObjectKey = readString();
+                if (!secondEndObjectKey.equals("endobj")) {
+                    if (isClosing()) {
+                        // found a case with 17506.pdf object 41 that was like this
+                        // 41 0 obj [/Pattern /DeviceGray] ] endobj
+                        // notice the second array close, here we are reading it
+                        // and ignoring and attempting to continue
+                        pdfSource.read();
+                    }
+                    skipSpaces();
+                    String thirdPossibleEndObj = readString();
+                    if (!thirdPossibleEndObj.equals("endobj")) {
+                        throw new IOException("expected='endobj' firstReadAttempt='" + endObjectKey + "' "
+                                + "secondReadAttempt='" + secondEndObjectKey + "' " + pdfSource);
+                    }
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the document built by {@link #parse()}.
+     *
+     * @return the parsed document.
+     * @throws IOException if no document is available yet.
+     */
+    public COSDocument getDocument() throws IOException {
+        if (document == null) {
+            throw new IOException("You must call parse() before calling getDocument()");
+        }
+        return document;
+    }
+}