Author: lehmi Date: Thu Jan 22 19:18:27 2015 New Revision: 1654017 URL: http://svn.apache.org/r1654017 Log: PDFBOX-2610: readLine now treats CR+LF as one EOL
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1654017&r1=1654016&r2=1654017&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Thu Jan 22 19:18:27 2015 @@ -1431,8 +1431,8 @@ public abstract class BaseParser impleme /** * This will read bytes until the first end of line marker occurs. - * Note: if you later unread the results of this function, you'll - * need to add a newline character to the end of the string. + * NOTE: The EOL marker may consist of 1 (CR or LF) or 2 (CR and LF) bytes + * which is an important detail if one wants to unread the line. * * @return The characters between the current position and the end of the line. * @@ -1450,12 +1450,18 @@ public abstract class BaseParser impleme int c; while ((c = pdfSource.read()) != -1) { + // CR and LF are valid EOLs if (isEOL(c)) { break; } buffer.append( (char)c ); } + // CR+LF is also a valid EOL + if (isCR(c) && isLF(pdfSource.peek())) + { + pdfSource.read(); + } return buffer.toString(); } @@ -1479,9 +1485,19 @@ public abstract class BaseParser impleme */ protected boolean isEOL(int c) { - return ASCII_LF == c || ASCII_CR == c; + return isLF(c) || isCR(c); } + private boolean isLF(int c) + { + return ASCII_LF == c; + } + + private boolean isCR(int c) + { + return ASCII_CR == c; + } + /** * This will tell if the next byte is whitespace or not. 
* Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1654017&r1=1654016&r2=1654017&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Thu Jan 22 19:18:27 2015 @@ -2161,6 +2161,7 @@ public class NonSequentialPDFParser exte return false; } //read "trailer" + long currentOffset = pdfSource.getOffset(); String nextLine = readLine(); if( !nextLine.trim().equals( "trailer" ) ) { @@ -2170,10 +2171,10 @@ public class NonSequentialPDFParser exte // Acrobat reader can also deal with this. if (nextLine.startsWith("trailer")) { - byte[] b = nextLine.getBytes(ISO_8859_1); + // we can't just unread a portion of the read data as we don't know if the EOL consists of 1 or 2 bytes int len = "trailer".length(); - pdfSource.unread('\n'); - pdfSource.unread(b, len, b.length-len); + // jump back right after "trailer" + pdfSource.seek(currentOffset + len); } else {