Author: lehmi Date: Sun Dec 8 12:58:49 2013 New Revision: 1549025 URL: http://svn.apache.org/r1549025 Log: PDFBOX-1769: identify corrupt stream length and use workaround if necessary
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1549025&r1=1549024&r2=1549025&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun Dec 8 12:58:49 2013 @@ -110,6 +110,7 @@ public class NonSequentialPDFParser exte protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' }; private final File pdfFile; + private long fileLen; private final RandomAccessBufferedFileInputStream raStream; /** @@ -558,7 +559,7 @@ public class NonSequentialPDFParser exte long skipBytes; // ---- read trailing bytes into buffer - final long fileLen = pdfFile.length(); + fileLen = pdfFile.length(); FileInputStream fIn = null; try @@ -1518,31 +1519,34 @@ public class NonSequentialPDFParser exte throw new IOException("Missing length for stream."); } + boolean useReadUntilEnd = false; // ---- get output stream to copy data to out = stream.createFilteredStream(streamLengthObj); - - long remainBytes = streamLengthObj.longValue(); - int bytesRead = 0; - boolean unexpectedEndOfStream = false; - while (remainBytes > 0) - { - final int readBytes = pdfSource.read(streamCopyBuf, 0, - (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes); - if (readBytes <= 0) - { - // throw new IOException( - // "No more bytes from stream but expected: " + remainBytes - // ); - unexpectedEndOfStream = true; - break; - } - out.write(streamCopyBuf, 0, readBytes); - remainBytes -= readBytes; - bytesRead += readBytes; + if (validateStreamLength(streamLengthObj.longValue())) + { + long remainBytes = streamLengthObj.longValue(); + int bytesRead = 0; + while (remainBytes > 0) + { + final int readBytes = pdfSource.read(streamCopyBuf, 0, + (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes); + if (readBytes <= 0) + { + useReadUntilEnd = true; + pdfSource.unread(bytesRead); + break; + } + out.write(streamCopyBuf, 0, readBytes); + remainBytes -= readBytes; + bytesRead += readBytes; + } + } + else + { + useReadUntilEnd = true; } - if (unexpectedEndOfStream) + if (useReadUntilEnd) { - pdfSource.unread(bytesRead); out = stream.createFilteredStream(streamLengthObj); readUntilEndStream(out); } @@ -1563,6 +1567,28 @@ public class NonSequentialPDFParser exte return stream; } + private boolean validateStreamLength(long streamLength) throws IOException + { + boolean streamLengthIsValid = true; + long originOffset = pdfSource.getOffset(); + long expectedEndOfStream = originOffset + streamLength; + if (expectedEndOfStream > fileLen) + { + streamLengthIsValid = false; + LOG.error("The end of the stream is out of range, using workaround to read the stream"); + } + else + { + pdfSource.seek(expectedEndOfStream); + if (!checkBytesAtOffset("endstream".getBytes("ISO-8859-1"))) + { + streamLengthIsValid = false; + LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream"); + } + pdfSource.seek(originOffset); + } + return streamLengthIsValid; + } private void readUntilEndStream(final OutputStream out) throws IOException { @@ -1742,6 +1768,11 @@ public class NonSequentialPDFParser exte */ private long calculateFixingOffset(long objectOffset, byte[] string) throws IOException { + if (objectOffset < 0) + { + LOG.error("Invalid object offset " + objectOffset + " for object " + new String(string)); + return 0; + } long originOffset = pdfSource.getOffset(); pdfSource.seek(objectOffset); // most likely the object can be found at the given offset