pdf_full_grammar.jj

leleueri Sat, 27 Aug 2011 04:54:50 -0700

Author: leleueri
Date: Sat Aug 27 11:54:18 2011
New Revision: 1162326

URL: http://svn.apache.org/viewvc?rev=1162326&view=rev
Log:
[PDFBox-1101] Improve JavaCC Grammar and Preflight to manage Xref stream object 
+ Fix String Literal parsing error


Modified:
    pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java
    pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj

Modified: pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java?rev=1162326&r1=1162325&r2=1162326&view=diff
==============================================================================
--- pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java (original)
+++ pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java Sat Aug 27 11:54:18 2011
@@ -42,6 +42,7 @@ import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSObject;
 import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.persistence.util.COSObjectKey;
 
 /**
  * @author eric
@@ -54,87 +55,143 @@ public class TrailerValidationHelper ext
        super(cfg);
   }
 
-  /*
-   * (non-Javadoc)
-   * 
-   * @see
-   * net.awl.edoc.pdfa.validation.helpers.AbstractValidationHelper#validate(
-   * net.awl.edoc.pdfa.validation.DocumentHandler)
-   */
-  @Override
-  public List<ValidationError> innerValidate(DocumentHandler handler)
-      throws ValidationException {
-
-    List<ValidationError> result = new ArrayList<ValidationError>(0);
-    PDDocument pdfDoc = handler.getDocument();
-
-    COSDictionary linearizedDict = isLinearizedPdf(pdfDoc);
-    if (linearizedDict != null) {
-      // it is a linearized PDF, check the linearized dictionary
-      checkLinearizedDictionnary(linearizedDict, result);
-
-      // if the pdf is a linearized pdf. the first trailer must be checked
-      // and it must have the same ID than the last trailer.
-      List<String> lTrailers = handler.getPdfExtractor().getAllTrailers();
-      String firstTrailer = lTrailers.get(0);
-      String lastTrailer = lTrailers.get(lTrailers.size() - 1);
-
-      COSDictionary first = null;
-      COSDictionary last = null;
-      COSDocument cd = null;
-      try {
-        cd = new COSDocument();
-        PdfElementParser parser1 = new PdfElementParser(cd, firstTrailer
-            .getBytes());
-        first = parser1.parseAsDictionary();
-        PdfElementParser parser2 = new PdfElementParser(cd, lastTrailer
-            .getBytes());
-        last = parser2.parseAsDictionary();
-
-        checkMainTrailer(pdfDoc.getDocument(), first, result);
-        if (!compareIds(first, last, pdfDoc.getDocument())) {
-          result.add(new ValidationResult.ValidationError(
-              ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY,
-              "ID is different in the first and the last trailer"));
-        }
-
-      } catch (IOException e) {
-        result.add(new ValidationResult.ValidationError(
-            ValidationConstants.ERROR_SYNTAX_TRAILER,
-            "Unable to parse trailers of the linearized PDF"));
-      } finally {
-        COSUtils.closeDocumentQuietly(cd);
-      }
-
-    } else {
-      // If the PDF isn't a linearized one, only the last trailer must be
-      // checked
-      List<String> lTrailers = handler.getPdfExtractor().getAllTrailers();
-      String lastTrailer = lTrailers.get(lTrailers.size() - 1);
-
-      COSDocument cd = null;
-      try {
-        cd = new COSDocument();
-        PdfElementParser parser = new PdfElementParser(cd, lastTrailer
-            .getBytes());
-        COSDictionary trailer = parser.parseAsDictionary();
-        checkMainTrailer(pdfDoc.getDocument(), trailer, result);
-      } catch (IOException e) {
-        result.add(new ValidationResult.ValidationError(
-            ValidationConstants.ERROR_SYNTAX_TRAILER,
-            "The trailer dictionary is missing"));
-      } finally {
-        try {
-          cd.close();
-        } catch (IOException e) {
-          COSUtils.closeDocumentQuietly(cd);
-        }
-      }
-
-    }
-    return result;
-  }
-
+       /*
+        * (non-Javadoc)
+        * 
+        * @see
+        * 
net.awl.edoc.pdfa.validation.helpers.AbstractValidationHelper#validate(
+        * net.awl.edoc.pdfa.validation.DocumentHandler)
+        */
+       @Override
+       public List<ValidationError> innerValidate(DocumentHandler handler)
+       throws ValidationException {
+
+               List<ValidationError> result = new 
ArrayList<ValidationError>(0);
+               PDDocument pdfDoc = handler.getDocument();
+
+               COSDictionary linearizedDict = isLinearizedPdf(pdfDoc);
+               if (linearizedDict != null) {
+                       // it is a linearized PDF, check the linearized 
dictionary
+                       checkLinearizedDictionnary(linearizedDict, result);
+
+                       // if the pdf is a linearized pdf. the first trailer 
must be checked
+                       // and it must have the same ID than the last trailer.
+                       // According to the PDF version, trailers are available 
by the trailer key word (pdf <= 1.4)
+                       // or in the dictionary of the XRef stream ( PDF >= 1.5)
+                       String pdfVersion = 
pdfDoc.getDocument().getHeaderString();
+                       if ( pdfVersion != null && 
pdfVersion.matches("%PDF-1\\.[1-4]")) {
+                               checkTrailersForLinearizedPDF14(handler, 
result);
+                       } else {
+                               checkTrailersForLinearizedPDF15(handler, 
result);
+                       }
+
+               } else {
+                       // If the PDF isn't a linearized one, only the last 
trailer must be checked
+                       checkMainTrailer(pdfDoc.getDocument(), 
pdfDoc.getDocument().getTrailer(), result);
+
+               }
+
+               return result;
+       }
+
+       /**
+        * Extracts and compares first and last trailers for PDF version 
between 1.1 and 1.4
+        * @param handler
+        * @param result
+        */
+       protected void checkTrailersForLinearizedPDF14(DocumentHandler handler, 
List<ValidationError> result) {
+               PDDocument pdfDoc = handler.getDocument();
+               List<String> lTrailers = 
handler.getPdfExtractor().getAllTrailers();
+
+               if (lTrailers.isEmpty()) {
+                       result.add(new ValidationResult.ValidationError(
+                                       
ValidationConstants.ERROR_SYNTAX_TRAILER,
+                       "There are no trailer in the PDF file"));
+               } else {
+                       String firstTrailer = lTrailers.get(0);
+                       String lastTrailer = lTrailers.get(lTrailers.size() - 
1);
+
+                       COSDictionary first = null;
+                       COSDictionary last = null;
+                       COSDocument cd = null;
+                       try {
+                               cd = new COSDocument();
+
+                               PdfElementParser parser1 = new 
PdfElementParser(cd, firstTrailer.getBytes());
+                               first = parser1.parseAsDictionary();
+
+                               PdfElementParser parser2 = new 
PdfElementParser(cd, lastTrailer.getBytes());
+                               last = parser2.parseAsDictionary();
+
+                               checkMainTrailer(pdfDoc.getDocument(), first, 
result);
+                               if (!compareIds(first, last, 
pdfDoc.getDocument())) {
+                                       result.add(new 
ValidationResult.ValidationError(
+                                                       
ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY,
+                                       "ID is different in the first and the 
last trailer"));
+                               }
+
+                       } catch (IOException e) {
+                               result.add(new ValidationResult.ValidationError(
+                                               
ValidationConstants.ERROR_SYNTAX_TRAILER,
+                               "Unable to parse trailers of the linearized 
PDF"));
+                       } finally {
+                               COSUtils.closeDocumentQuietly(cd);
+                       }
+               }
+       }
+
+       /**
+        * Accesses and compares First and Last trailers for a PDF version 
higher than 1.4.
+        * 
+        * @param handler
+        * @param result
+        */
+       protected void checkTrailersForLinearizedPDF15(DocumentHandler handler, 
List<ValidationError> result) {
+               PDDocument pdfDoc = handler.getDocument();
+               try {
+                       COSDocument cosDocument = pdfDoc.getDocument();
+                       List<COSObject> xrefs = 
cosDocument.getObjectsByType(COSName.XREF);
+
+                       if (xrefs.isEmpty()) {
+                               // no XRef CosObject, may by this pdf file used 
the PDF 1.4 syntaxe
+                               checkTrailersForLinearizedPDF14(handler, 
result);
+
+                       } else {
+
+                               int min = Integer.MAX_VALUE;
+                               int max = Integer.MIN_VALUE;
+                               COSDictionary firstTrailer = null;
+                               COSDictionary lastTrailer = null;
+
+                               // Search First and Last trailers according to 
offset position.
+                               for(COSObject co : xrefs) {
+                                       int offset = 
cosDocument.getXrefTable().get(new COSObjectKey(co));
+                                       if (offset < min) {
+                                               min = offset;
+                                               firstTrailer = 
(COSDictionary)co.getObject();
+                                       }
+
+                                       if (offset > max) {
+                                               max = offset;
+                                               lastTrailer = 
(COSDictionary)co.getObject();
+                                       }
+
+                               }
+
+                               checkMainTrailer(pdfDoc.getDocument(), 
firstTrailer, result);
+                               if (!compareIds(firstTrailer, lastTrailer, 
pdfDoc.getDocument())) {
+                                       result.add(new 
ValidationResult.ValidationError(
+                                                       
ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY,
+                                       "ID is different in the first and the 
last trailer"));
+                               }
+                       }
+               } catch (IOException e) {
+                       result.add(new ValidationResult.ValidationError(
+                                       
ValidationConstants.ERROR_SYNTAX_TRAILER,
+                                       "Unable to check PDF Trailers due to : 
" + e.getMessage()));
+               }
+       }
+       
   /**
    * Return true if the ID of the first dictionary is the same as the id of the
    * last dictionary Return false otherwise.
@@ -143,12 +200,10 @@ public class TrailerValidationHelper ext
    * @param last
    * @return
    */
-  protected boolean compareIds(COSDictionary first, COSDictionary last,
-      COSDocument doc) {
-    COSBase idFirst = first.getItem(COSName
-        .getPDFName(TRAILER_DICTIONARY_KEY_ID));
-    COSBase idLast = last
-        .getItem(COSName.getPDFName(TRAILER_DICTIONARY_KEY_ID));
+       protected boolean compareIds(COSDictionary first, COSDictionary last, 
COSDocument doc) {
+               COSBase idFirst = 
first.getItem(COSName.getPDFName(TRAILER_DICTIONARY_KEY_ID));
+               COSBase idLast = 
last.getItem(COSName.getPDFName(TRAILER_DICTIONARY_KEY_ID));
+
 
     if (idFirst == null || idLast == null) {
       return false;
@@ -170,9 +225,8 @@ public class TrailerValidationHelper ext
       for (Object ol : al.toList()) {
         // ---- according to PDF Reference 1-4, ID is an array containing two
         // strings
-        if (!oneIsEquals)
-          oneIsEquals = ((COSString) ol).getString().equals(
-              ((COSString) of).getString());
+                               if (!oneIsEquals)
+                                       oneIsEquals = ((COSString) 
ol).getString().equals(((COSString) of).getString());
       }
       isEqual = isEqual && oneIsEquals;
     }

Modified: pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj
URL: http://svn.apache.org/viewvc/pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj?rev=1162326&r1=1162325&r2=1162326&view=diff
==============================================================================
--- pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj (original)
+++ pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj Sat Aug 27 11:54:18 2011
@@ -43,7 +43,9 @@ import static org.apache.padaf.preflight
 public class PDFParser
 {
 
-       public static boolean parse (InputStream is) throws 
IOException,ParseException {
+   public String pdfHeader = "";
+
+   public static boolean parse (InputStream is) throws 
IOException,ParseException {
                PDFParser parser = new PDFParser (is);
                parser.PDF();
                return true;
@@ -127,7 +129,7 @@ TOKEN :
 {
        < PERCENT: "%" > |
        < PDFA_HEADER: "PDF-1."["1"-"6"] > |
-       < BINARY_TAG : (["\u0080"-"\uFFFF"]){4,} > 
+       < BINARY_TAG : (["\u0080"-"\u00FF"]){2,} >  
 }
 
 
@@ -148,7 +150,7 @@ TOKEN :
  <OBJ_BOOLEAN : ("true"|"false") > | 
  <OBJ_NUMERIC : ("+"|"-")? ( ((<DIGITS>)+ ("."(<DIGITS>)*)? ) | 
("."(<DIGITS>)+)) > |
  <OBJ_STRING_HEX : "<"((<DIGITS>|["a"-"f"]|["A"-"F"]){2})+">"> |
- <OBJ_STRING_LIT : "("(~[")","("])*> : WithinLIT |
+ <OBJ_STRING_LIT : "("(~["(",")"])*> : WithinLIT |
  <OBJ_ARRAY_START : "[" > |
  <OBJ_ARRAY_END : "]" > |
  <OBJ_NAME : "/"(~[" " , "(" , ")" , "[" , "]" , "{" , "}" , "/" , "<" , ">" , 
"%" , "\t" , "\n" , "\r"])+ > |
@@ -165,6 +167,19 @@ TOKEN :
   <~["(", ")"]>
 }
 
+<WithinLIT> SKIP :
+{
+  <UNICODE : ["\u0080"-"\uFFFF"]> |
+  <UNBALANCED_LEFT_PARENTHESES : "\\("> |
+  <UNBALANCED_RIGHT_PARENTHESES : "\\)">  
+}
+
+<WithinLIT> TOKEN :
+{
+   < END_LITERAL : ")" > |
+   < INNER_START_LIT : "("(~[")","("])*>
+}
+
 // -- Content of Stream isn't check by the JavaCC Parser
 // -- Will be done by the PDFBox API
 
@@ -207,12 +222,6 @@ TOKEN :
        < END_DICTONNARY : ">>" >
 }
 
-<WithinLIT> TOKEN :
-{
-       < END_LITERAL : ")" > |
-       < INNER_START_LIT : "("(~[")","("])*>
-}
-
 TOKEN :
 {
        < STARTXREF_TAG : "startxref" > : WithinTrailer
@@ -229,7 +238,7 @@ void indirect_object() :
 {
        <START_OBJECT>
                object_content()
-       <END_OBJECT> ( < EOL > )?
+       <END_OBJECT>
 }
 
 void object_content() :
@@ -288,35 +297,43 @@ void start_literal () :
 
 JAVACODE
 void literal() {
-       Token currentToken = null;
-       int nesting =  1;
-       int literalLength = 0;
-    while(true) {
+         Token currentToken = null;
+         int nesting =  1;
+         int literalLength = 0;
+
+         while(true) {
+            Token previous = getToken(0);
             currentToken = getToken(1);
             if (currentToken.kind == 0 ){
-                    throw new ParseException("EOF reach before the end of the 
literal string.");
+               throw new ParseException("EOF reach before the end of the 
literal string.");
             }
             literalLength += currentToken.image.getBytes().length;
             if ( currentToken.kind == OBJ_STRING_LIT ) {
-                    jj_consume_token(OBJ_STRING_LIT);
-                    ++nesting;
+               jj_consume_token(OBJ_STRING_LIT);
+               if (previous != null && 
previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') {
+                  ++nesting;
+               }
             } else if ( currentToken.kind == INNER_START_LIT ) {
-                    jj_consume_token(INNER_START_LIT);
-                    ++nesting;            
+               jj_consume_token(INNER_START_LIT);
+               if (previous != null && 
previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') {
+                  ++nesting;
+               }
             } else if ( currentToken.kind == END_LITERAL ) {
-                    --nesting;
-                    jj_consume_token(END_LITERAL);
-                    if (nesting == 0) {
-                       this.token_source.curLexState = 
PDFParserConstants.DEFAULT;
-                        break;
-                    }
+               if (previous != null && 
previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') {
+                  --nesting;
+               }
+               jj_consume_token(END_LITERAL);
+               if (nesting == 0) {
+                  this.token_source.curLexState = PDFParserConstants.DEFAULT;
+                  break;
+               }
             } else {
-               currentToken = getNextToken();
+               currentToken = getNextToken();
             }
-    }
-    if (literalLength > MAX_STRING_LENGTH) {
-       throw new PdfParseException("Literal String too long", 
ERROR_SYNTAX_LITERAL_TOO_LONG);
-    }
+         }
+         if (literalLength > MAX_STRING_LENGTH) {
+            throw new PdfParseException("Literal String too long", 
ERROR_SYNTAX_LITERAL_TOO_LONG);
+         }
 }
 
 JAVACODE
@@ -329,6 +346,15 @@ void checkNameLength() throws ParseExcep
 }
 
 JAVACODE
+void checkMagicNumberLength() throws ParseException {
+   if (token != null && token.image.getBytes().length < 4) {
+      throw new PdfParseException("Not enough bytes after the Header (at least 
4 bytes should be present with a value bigger than 127) : " + token.image, 
ERROR_SYNTAX_HEADER);
+   } else {
+      // Nothing to do
+   }  
+}
+
+JAVACODE
 void checkStringHexLength() throws ParseException {
        if (token != null && ((token.image.length()-2)/2) > MAX_STRING_LENGTH) {
                throw new PdfParseException("Object String Hexa is toot long", 
ERROR_SYNTAX_HEXA_STRING_TOO_LONG);
@@ -398,8 +424,8 @@ void PDF_header() throws HeaderParseExce
 {}
 {
        try {
-               <PERCENT> <PDFA_HEADER> ( < EOL > )
-               <PERCENT> <BINARY_TAG> ( < EOL > )
+               <PERCENT> <PDFA_HEADER> { pdfHeader = token.image;} ( < EOL > )
+               <PERCENT> <BINARY_TAG>  checkMagicNumberLength() ( < EOL > )
        } catch (ParseException e) {
                throw new HeaderParseException (e);
        } catch (TokenMgrError e) {
@@ -412,9 +438,16 @@ void PDF_body() throws BodyParseExceptio
 {}
 {
        try {
-               ( (<SPACE>|<OTHER_WHITE_SPACE>)+ (<EOL>))?
-               ( indirect_object() ) +
-               ( (<SPACE>|<OTHER_WHITE_SPACE>)+ (<EOL>))?
+               ( 
+         (<SPACE>|<OTHER_WHITE_SPACE>)+ 
+                 (<EOL>)
+               )?
+
+               (  indirect_object() 
+                    (<SPACE>|<OTHER_WHITE_SPACE>)* 
+                    (<EOL>)?
+               ) +
+
        } catch (ParseException e) {
                throw new BodyParseException (e);
        } catch (TokenMgrError e) {
@@ -429,8 +462,8 @@ void PDF_cross_ref_table() throws CrossR
                <XREF_TAG> ( < EOL > )
                (
                        <SUBSECTION_START> 
-                       ( < EOL > ) 
-                       (       <FULL_LINE>  ( <SPACE> ) ? ( < EOL > ) ) +
+                       (( <SPACE> ) *  < EOL > ) 
+                       (       <FULL_LINE>  ( <SPACE> ) * ( < EOL > ) ) +
                )+
        } catch (ParseException e) {
                throw new CrossRefParseException (e);
@@ -439,16 +472,13 @@ void PDF_cross_ref_table() throws CrossR
        }
 }
 
-void PDF_trailer() throws TrailerParseException :
+void PDF_trailer_dictionnary() throws TrailerParseException :
 {}
 {
        try {
                <TRAILER_TAG>
                ( <EOL> )
                dictionary_object() (<SPACE>)*<EOL>
-               <STARTXREF_TAG> ( <EOL> )
-               <OBJ_NUMBER> ( <EOL> )
-               <EOF_TRAILER_TAG> ( <EOL> ) ?
        } catch (ParseException e) {
                throw new TrailerParseException (e);
        } catch (TokenMgrError e) {
@@ -457,16 +487,44 @@ void PDF_trailer() throws TrailerParseEx
 
 }
 
+void PDF_Trailer_XRefOffset()  throws TrailerParseException :
+{}
+{
+
+   try {
+      <STARTXREF_TAG> ( <EOL> )
+      <OBJ_NUMBER> ( <EOL> )
+      <EOF_TRAILER_TAG> ( <EOL> ) ?
+   } catch (ParseException e) {
+      throw new TrailerParseException (e);
+   } catch (TokenMgrError e) {
+      throw new TrailerParseException (e.getMessage());
+   }
+}
+
 void
 PDF_linearized_modified() throws PdfParseException  :
-{}
+{
+int foundXref=0;
+int foundTrailer=0;
+}
 {
        try {
-               ( <EOF> | 
-               (PDF_body()
-               PDF_cross_ref_table()
-               PDF_trailer())+
-               <EOF> )
+
+               ( PDF_body()
+                 (
+                   PDF_cross_ref_table() {foundXref++;}
+                   PDF_trailer_dictionnary() {foundTrailer++;}
+                 )?
+                 PDF_Trailer_XRefOffset()
+      )+
+               <EOF> 
+      {
+         boolean expectedXRefAndTrailer = pdfHeader.matches("PDF-1\\.[1-4]");
+         if (expectedXRefAndTrailer && (foundXref <= 0 || foundTrailer <= 0)) {
+            throw new TrailerParseException ("Missing Xref table or Trailer 
keyword in the given PDF.");
+         }
+      }
        } catch (PdfParseException e) { 
                throw e;
        } catch (ParseException e) {
@@ -484,8 +542,5 @@ PDF() throws PdfParseException :
 {}
 {
        PDF_header()
-       PDF_body()
-       PDF_cross_ref_table()
-       PDF_trailer()
        PDF_linearized_modified()
 } 
\ No newline at end of file

svn commit: r1162326 - in /pdfbox/trunk/preflight/src/main: java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java javacc/pdf_full_grammar.jj

Reply via email to