Author: leleueri
Date: Sat Aug 27 11:54:18 2011
New Revision: 1162326
URL: http://svn.apache.org/viewvc?rev=1162326&view=rev
Log:
[PDFBox-1101] Improve JavaCC Grammar and Preflight to manage Xref stream object
+ Fix String Literal parsing error
Modified:
pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java
pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj
Modified:
pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java?rev=1162326&r1=1162325&r2=1162326&view=diff
==============================================================================
---
pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java
(original)
+++
pdfbox/trunk/preflight/src/main/java/org/apache/padaf/preflight/helpers/TrailerValidationHelper.java
Sat Aug 27 11:54:18 2011
@@ -42,6 +42,7 @@ import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.persistence.util.COSObjectKey;
/**
* @author eric
@@ -54,87 +55,143 @@ public class TrailerValidationHelper ext
super(cfg);
}
- /*
- * (non-Javadoc)
- *
- * @see
- * net.awl.edoc.pdfa.validation.helpers.AbstractValidationHelper#validate(
- * net.awl.edoc.pdfa.validation.DocumentHandler)
- */
- @Override
- public List<ValidationError> innerValidate(DocumentHandler handler)
- throws ValidationException {
-
- List<ValidationError> result = new ArrayList<ValidationError>(0);
- PDDocument pdfDoc = handler.getDocument();
-
- COSDictionary linearizedDict = isLinearizedPdf(pdfDoc);
- if (linearizedDict != null) {
- // it is a linearized PDF, check the linearized dictionary
- checkLinearizedDictionnary(linearizedDict, result);
-
- // if the pdf is a linearized pdf. the first trailer must be checked
- // and it must have the same ID than the last trailer.
- List<String> lTrailers = handler.getPdfExtractor().getAllTrailers();
- String firstTrailer = lTrailers.get(0);
- String lastTrailer = lTrailers.get(lTrailers.size() - 1);
-
- COSDictionary first = null;
- COSDictionary last = null;
- COSDocument cd = null;
- try {
- cd = new COSDocument();
- PdfElementParser parser1 = new PdfElementParser(cd, firstTrailer
- .getBytes());
- first = parser1.parseAsDictionary();
- PdfElementParser parser2 = new PdfElementParser(cd, lastTrailer
- .getBytes());
- last = parser2.parseAsDictionary();
-
- checkMainTrailer(pdfDoc.getDocument(), first, result);
- if (!compareIds(first, last, pdfDoc.getDocument())) {
- result.add(new ValidationResult.ValidationError(
- ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY,
- "ID is different in the first and the last trailer"));
- }
-
- } catch (IOException e) {
- result.add(new ValidationResult.ValidationError(
- ValidationConstants.ERROR_SYNTAX_TRAILER,
- "Unable to parse trailers of the linearized PDF"));
- } finally {
- COSUtils.closeDocumentQuietly(cd);
- }
-
- } else {
- // If the PDF isn't a linearized one, only the last trailer must be
- // checked
- List<String> lTrailers = handler.getPdfExtractor().getAllTrailers();
- String lastTrailer = lTrailers.get(lTrailers.size() - 1);
-
- COSDocument cd = null;
- try {
- cd = new COSDocument();
- PdfElementParser parser = new PdfElementParser(cd, lastTrailer
- .getBytes());
- COSDictionary trailer = parser.parseAsDictionary();
- checkMainTrailer(pdfDoc.getDocument(), trailer, result);
- } catch (IOException e) {
- result.add(new ValidationResult.ValidationError(
- ValidationConstants.ERROR_SYNTAX_TRAILER,
- "The trailer dictionary is missing"));
- } finally {
- try {
- cd.close();
- } catch (IOException e) {
- COSUtils.closeDocumentQuietly(cd);
- }
- }
-
- }
- return result;
- }
-
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ *
net.awl.edoc.pdfa.validation.helpers.AbstractValidationHelper#validate(
+ * net.awl.edoc.pdfa.validation.DocumentHandler)
+ */
+ @Override
+ public List<ValidationError> innerValidate(DocumentHandler handler)
+ throws ValidationException {
+
+ List<ValidationError> result = new
ArrayList<ValidationError>(0);
+ PDDocument pdfDoc = handler.getDocument();
+
+ COSDictionary linearizedDict = isLinearizedPdf(pdfDoc);
+ if (linearizedDict != null) {
+ // it is a linearized PDF, check the linearized
dictionary
+ checkLinearizedDictionnary(linearizedDict, result);
+
+ // If the PDF is linearized, the first trailer
must be checked
+ // and it must have the same ID as the last trailer.
+ // Depending on the PDF version, trailers are introduced
by the trailer keyword (PDF <= 1.4)
+ // or stored in the dictionary of the XRef stream (PDF >= 1.5).
+ String pdfVersion =
pdfDoc.getDocument().getHeaderString();
+ if ( pdfVersion != null &&
pdfVersion.matches("%PDF-1\\.[1-4]")) {
+ checkTrailersForLinearizedPDF14(handler,
result);
+ } else {
+ checkTrailersForLinearizedPDF15(handler,
result);
+ }
+
+ } else {
+ // If the PDF isn't a linearized one, only the last
trailer must be checked
+ checkMainTrailer(pdfDoc.getDocument(),
pdfDoc.getDocument().getTrailer(), result);
+
+ }
+
+ return result;
+ }
+
+ /**
+ * Extracts and compares the first and last trailers for PDF versions
between 1.1 and 1.4.
+ * @param handler
+ * @param result
+ */
+ protected void checkTrailersForLinearizedPDF14(DocumentHandler handler,
List<ValidationError> result) {
+ PDDocument pdfDoc = handler.getDocument();
+ List<String> lTrailers =
handler.getPdfExtractor().getAllTrailers();
+
+ if (lTrailers.isEmpty()) {
+ result.add(new ValidationResult.ValidationError(
+
ValidationConstants.ERROR_SYNTAX_TRAILER,
+ "There are no trailer in the PDF file"));
+ } else {
+ String firstTrailer = lTrailers.get(0);
+ String lastTrailer = lTrailers.get(lTrailers.size() -
1);
+
+ COSDictionary first = null;
+ COSDictionary last = null;
+ COSDocument cd = null;
+ try {
+ cd = new COSDocument();
+
+ PdfElementParser parser1 = new
PdfElementParser(cd, firstTrailer.getBytes());
+ first = parser1.parseAsDictionary();
+
+ PdfElementParser parser2 = new
PdfElementParser(cd, lastTrailer.getBytes());
+ last = parser2.parseAsDictionary();
+
+ checkMainTrailer(pdfDoc.getDocument(), first,
result);
+ if (!compareIds(first, last,
pdfDoc.getDocument())) {
+ result.add(new
ValidationResult.ValidationError(
+
ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY,
+ "ID is different in the first and the
last trailer"));
+ }
+
+ } catch (IOException e) {
+ result.add(new ValidationResult.ValidationError(
+
ValidationConstants.ERROR_SYNTAX_TRAILER,
+ "Unable to parse trailers of the linearized
PDF"));
+ } finally {
+ COSUtils.closeDocumentQuietly(cd);
+ }
+ }
+ }
+
+ /**
+ * Accesses and compares First and Last trailers for a PDF version
higher than 1.4.
+ *
+ * @param handler
+ * @param result
+ */
+ protected void checkTrailersForLinearizedPDF15(DocumentHandler handler,
List<ValidationError> result) {
+ PDDocument pdfDoc = handler.getDocument();
+ try {
+ COSDocument cosDocument = pdfDoc.getDocument();
+ List<COSObject> xrefs =
cosDocument.getObjectsByType(COSName.XREF);
+
+ if (xrefs.isEmpty()) {
+ // No XRef COSObject found; maybe this PDF file uses
the PDF 1.4 syntax.
+ checkTrailersForLinearizedPDF14(handler,
result);
+
+ } else {
+
+ int min = Integer.MAX_VALUE;
+ int max = Integer.MIN_VALUE;
+ COSDictionary firstTrailer = null;
+ COSDictionary lastTrailer = null;
+
+ // Search First and Last trailers according to
offset position.
+ for(COSObject co : xrefs) {
+ int offset =
cosDocument.getXrefTable().get(new COSObjectKey(co));
+ if (offset < min) {
+ min = offset;
+ firstTrailer =
(COSDictionary)co.getObject();
+ }
+
+ if (offset > max) {
+ max = offset;
+ lastTrailer =
(COSDictionary)co.getObject();
+ }
+
+ }
+
+ checkMainTrailer(pdfDoc.getDocument(),
firstTrailer, result);
+ if (!compareIds(firstTrailer, lastTrailer,
pdfDoc.getDocument())) {
+ result.add(new
ValidationResult.ValidationError(
+
ValidationConstants.ERROR_SYNTAX_TRAILER_ID_CONSISTENCY,
+ "ID is different in the first and the
last trailer"));
+ }
+ }
+ } catch (IOException e) {
+ result.add(new ValidationResult.ValidationError(
+
ValidationConstants.ERROR_SYNTAX_TRAILER,
+ "Unable to check PDF Trailers due to :
" + e.getMessage()));
+ }
+ }
+
/**
* Return true if the ID of the first dictionary is the same as the id of the
* last dictionary Return false otherwise.
@@ -143,12 +200,10 @@ public class TrailerValidationHelper ext
* @param last
* @return
*/
- protected boolean compareIds(COSDictionary first, COSDictionary last,
- COSDocument doc) {
- COSBase idFirst = first.getItem(COSName
- .getPDFName(TRAILER_DICTIONARY_KEY_ID));
- COSBase idLast = last
- .getItem(COSName.getPDFName(TRAILER_DICTIONARY_KEY_ID));
+ protected boolean compareIds(COSDictionary first, COSDictionary last,
COSDocument doc) {
+ COSBase idFirst =
first.getItem(COSName.getPDFName(TRAILER_DICTIONARY_KEY_ID));
+ COSBase idLast =
last.getItem(COSName.getPDFName(TRAILER_DICTIONARY_KEY_ID));
+
if (idFirst == null || idLast == null) {
return false;
@@ -170,9 +225,8 @@ public class TrailerValidationHelper ext
for (Object ol : al.toList()) {
// ---- according to PDF Reference 1-4, ID is an array containing two
// strings
- if (!oneIsEquals)
- oneIsEquals = ((COSString) ol).getString().equals(
- ((COSString) of).getString());
+ if (!oneIsEquals)
+ oneIsEquals = ((COSString)
ol).getString().equals(((COSString) of).getString());
}
isEqual = isEqual && oneIsEquals;
}
Modified: pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj?rev=1162326&r1=1162325&r2=1162326&view=diff
==============================================================================
--- pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj (original)
+++ pdfbox/trunk/preflight/src/main/javacc/pdf_full_grammar.jj Sat Aug 27
11:54:18 2011
@@ -43,7 +43,9 @@ import static org.apache.padaf.preflight
public class PDFParser
{
- public static boolean parse (InputStream is) throws
IOException,ParseException {
+ public String pdfHeader = "";
+
+ public static boolean parse (InputStream is) throws
IOException,ParseException {
PDFParser parser = new PDFParser (is);
parser.PDF();
return true;
@@ -127,7 +129,7 @@ TOKEN :
{
< PERCENT: "%" > |
< PDFA_HEADER: "PDF-1."["1"-"6"] > |
- < BINARY_TAG : (["\u0080"-"\uFFFF"]){4,} >
+ < BINARY_TAG : (["\u0080"-"\u00FF"]){2,} >
}
@@ -148,7 +150,7 @@ TOKEN :
<OBJ_BOOLEAN : ("true"|"false") > |
<OBJ_NUMERIC : ("+"|"-")? ( ((<DIGITS>)+ ("."(<DIGITS>)*)? ) |
("."(<DIGITS>)+)) > |
<OBJ_STRING_HEX : "<"((<DIGITS>|["a"-"f"]|["A"-"F"]){2})+">"> |
- <OBJ_STRING_LIT : "("(~[")","("])*> : WithinLIT |
+ <OBJ_STRING_LIT : "("(~["(",")"])*> : WithinLIT |
<OBJ_ARRAY_START : "[" > |
<OBJ_ARRAY_END : "]" > |
<OBJ_NAME : "/"(~[" " , "(" , ")" , "[" , "]" , "{" , "}" , "/" , "<" , ">" ,
"%" , "\t" , "\n" , "\r"])+ > |
@@ -165,6 +167,19 @@ TOKEN :
<~["(", ")"]>
}
+<WithinLIT> SKIP :
+{
+ <UNICODE : ["\u0080"-"\uFFFF"]> |
+ <UNBALANCED_LEFT_PARENTHESES : "\\("> |
+ <UNBALANCED_RIGHT_PARENTHESES : "\\)">
+}
+
+<WithinLIT> TOKEN :
+{
+ < END_LITERAL : ")" > |
+ < INNER_START_LIT : "("(~[")","("])*>
+}
+
// -- Content of Stream isn't check by the JavaCC Parser
// -- Will be done by the PDFBox API
@@ -207,12 +222,6 @@ TOKEN :
< END_DICTONNARY : ">>" >
}
-<WithinLIT> TOKEN :
-{
- < END_LITERAL : ")" > |
- < INNER_START_LIT : "("(~[")","("])*>
-}
-
TOKEN :
{
< STARTXREF_TAG : "startxref" > : WithinTrailer
@@ -229,7 +238,7 @@ void indirect_object() :
{
<START_OBJECT>
object_content()
- <END_OBJECT> ( < EOL > )?
+ <END_OBJECT>
}
void object_content() :
@@ -288,35 +297,43 @@ void start_literal () :
JAVACODE
void literal() {
- Token currentToken = null;
- int nesting = 1;
- int literalLength = 0;
- while(true) {
+ Token currentToken = null;
+ int nesting = 1;
+ int literalLength = 0;
+
+ while(true) {
+ Token previous = getToken(0);
currentToken = getToken(1);
if (currentToken.kind == 0 ){
- throw new ParseException("EOF reach before the end of the
literal string.");
+ throw new ParseException("EOF reach before the end of the
literal string.");
}
literalLength += currentToken.image.getBytes().length;
if ( currentToken.kind == OBJ_STRING_LIT ) {
- jj_consume_token(OBJ_STRING_LIT);
- ++nesting;
+ jj_consume_token(OBJ_STRING_LIT);
+ if (previous != null &&
previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') {
+ ++nesting;
+ }
} else if ( currentToken.kind == INNER_START_LIT ) {
- jj_consume_token(INNER_START_LIT);
- ++nesting;
+ jj_consume_token(INNER_START_LIT);
+ if (previous != null &&
previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') {
+ ++nesting;
+ }
} else if ( currentToken.kind == END_LITERAL ) {
- --nesting;
- jj_consume_token(END_LITERAL);
- if (nesting == 0) {
- this.token_source.curLexState =
PDFParserConstants.DEFAULT;
- break;
- }
+ if (previous != null &&
previous.image.getBytes()[previous.image.getBytes().length-1]!='\\') {
+ --nesting;
+ }
+ jj_consume_token(END_LITERAL);
+ if (nesting == 0) {
+ this.token_source.curLexState = PDFParserConstants.DEFAULT;
+ break;
+ }
} else {
- currentToken = getNextToken();
+ currentToken = getNextToken();
}
- }
- if (literalLength > MAX_STRING_LENGTH) {
- throw new PdfParseException("Literal String too long",
ERROR_SYNTAX_LITERAL_TOO_LONG);
- }
+ }
+ if (literalLength > MAX_STRING_LENGTH) {
+ throw new PdfParseException("Literal String too long",
ERROR_SYNTAX_LITERAL_TOO_LONG);
+ }
}
JAVACODE
@@ -329,6 +346,15 @@ void checkNameLength() throws ParseExcep
}
JAVACODE
+void checkMagicNumberLength() throws ParseException {
+ if (token != null && token.image.getBytes().length < 4) {
+ throw new PdfParseException("Not enough bytes after the Header (at least
4 bytes should be present with a value bigger than 127) : " + token.image,
ERROR_SYNTAX_HEADER);
+ } else {
+ // Nothing to do
+ }
+}
+
+JAVACODE
void checkStringHexLength() throws ParseException {
if (token != null && ((token.image.length()-2)/2) > MAX_STRING_LENGTH) {
throw new PdfParseException("Object String Hexa is toot long",
ERROR_SYNTAX_HEXA_STRING_TOO_LONG);
@@ -398,8 +424,8 @@ void PDF_header() throws HeaderParseExce
{}
{
try {
- <PERCENT> <PDFA_HEADER> ( < EOL > )
- <PERCENT> <BINARY_TAG> ( < EOL > )
+ <PERCENT> <PDFA_HEADER> { pdfHeader = token.image;} ( < EOL > )
+ <PERCENT> <BINARY_TAG> checkMagicNumberLength() ( < EOL > )
} catch (ParseException e) {
throw new HeaderParseException (e);
} catch (TokenMgrError e) {
@@ -412,9 +438,16 @@ void PDF_body() throws BodyParseExceptio
{}
{
try {
- ( (<SPACE>|<OTHER_WHITE_SPACE>)+ (<EOL>))?
- ( indirect_object() ) +
- ( (<SPACE>|<OTHER_WHITE_SPACE>)+ (<EOL>))?
+ (
+ (<SPACE>|<OTHER_WHITE_SPACE>)+
+ (<EOL>)
+ )?
+
+ ( indirect_object()
+ (<SPACE>|<OTHER_WHITE_SPACE>)*
+ (<EOL>)?
+ ) +
+
} catch (ParseException e) {
throw new BodyParseException (e);
} catch (TokenMgrError e) {
@@ -429,8 +462,8 @@ void PDF_cross_ref_table() throws CrossR
<XREF_TAG> ( < EOL > )
(
<SUBSECTION_START>
- ( < EOL > )
- ( <FULL_LINE> ( <SPACE> ) ? ( < EOL > ) ) +
+ (( <SPACE> ) * < EOL > )
+ ( <FULL_LINE> ( <SPACE> ) * ( < EOL > ) ) +
)+
} catch (ParseException e) {
throw new CrossRefParseException (e);
@@ -439,16 +472,13 @@ void PDF_cross_ref_table() throws CrossR
}
}
-void PDF_trailer() throws TrailerParseException :
+void PDF_trailer_dictionnary() throws TrailerParseException :
{}
{
try {
<TRAILER_TAG>
( <EOL> )
dictionary_object() (<SPACE>)*<EOL>
- <STARTXREF_TAG> ( <EOL> )
- <OBJ_NUMBER> ( <EOL> )
- <EOF_TRAILER_TAG> ( <EOL> ) ?
} catch (ParseException e) {
throw new TrailerParseException (e);
} catch (TokenMgrError e) {
@@ -457,16 +487,44 @@ void PDF_trailer() throws TrailerParseEx
}
+void PDF_Trailer_XRefOffset() throws TrailerParseException :
+{}
+{
+
+ try {
+ <STARTXREF_TAG> ( <EOL> )
+ <OBJ_NUMBER> ( <EOL> )
+ <EOF_TRAILER_TAG> ( <EOL> ) ?
+ } catch (ParseException e) {
+ throw new TrailerParseException (e);
+ } catch (TokenMgrError e) {
+ throw new TrailerParseException (e.getMessage());
+ }
+}
+
void
PDF_linearized_modified() throws PdfParseException :
-{}
+{
+int foundXref=0;
+int foundTrailer=0;
+}
{
try {
- ( <EOF> |
- (PDF_body()
- PDF_cross_ref_table()
- PDF_trailer())+
- <EOF> )
+
+ ( PDF_body()
+ (
+ PDF_cross_ref_table() {foundXref++;}
+ PDF_trailer_dictionnary() {foundTrailer++;}
+ )?
+ PDF_Trailer_XRefOffset()
+ )+
+ <EOF>
+ {
+ boolean expectedXRefAndTrailer = pdfHeader.matches("PDF-1\\.[1-4]");
+ if (expectedXRefAndTrailer && (foundXref <= 0 || foundTrailer <= 0)) {
+ throw new TrailerParseException ("Missing Xref table or Trailer
keyword in the given PDF.");
+ }
+ }
} catch (PdfParseException e) {
throw e;
} catch (ParseException e) {
@@ -484,8 +542,5 @@ PDF() throws PdfParseException :
{}
{
PDF_header()
- PDF_body()
- PDF_cross_ref_table()
- PDF_trailer()
PDF_linearized_modified()
}
\ No newline at end of file