Author: tilman
Date: Fri Mar 7 22:02:25 2014
New Revision: 1575426

URL: http://svn.apache.org/r1575426
Log:
PDFBOX-1164: add heuristic by Timo Boehme to detect wrongly assumed end of inline image
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1575426&r1=1575425&r2=1575426&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java Fri Mar 7 22:02:25 2014
@@ -19,6 +19,7 @@ package org.apache.pdfbox.pdfparser;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.PushbackInputStream;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -46,6 +47,8 @@ public class PDFStreamParser extends Bas
 {
     private List<Object> streamObjects = new ArrayList<Object>( 100 );
     private RandomAccess file;
+    private final int maxBinCharTestLength = 5;
+    private final byte[] binCharTestArr = new byte[maxBinCharTestLength];
 
     /**
      * Constructor that takes a stream to parse.
@@ -391,10 +394,11 @@ public class PDFStreamParser extends Bas
                 // PDF spec is kinda unclear about this. Should a whitespace
                 // always appear before EI? Not sure, so that we just read
                 // until EI<whitespace>.
-                // Be aware not all kind of whitespaces are allowed here. see PDFBOX1561
+                // Be aware not all kind of whitespaces are allowed here. see PDFBOX-1561
                 while( !(lastByte == 'E' &&
                          currentByte == 'I' &&
-                         isSpaceOrReturn()) &&
+                         isSpaceOrReturn() &&
+                         hasNoFollowingBinData( pdfSource )) &&
                        !pdfSource.isEOF() )
                 {
                     imageData.write( lastByte );
@@ -436,6 +440,37 @@ public class PDFStreamParser extends Bas
     }
 
     /**
+     * Looks up next 5 bytes if they contain only ASCII characters (no control
+     * sequences etc.).
+     *
+     * @return <code>true</code> if next 5 bytes are printable ASCII characters,
+     * otherwise <code>false</code>
+     */
+    private boolean hasNoFollowingBinData(final PushbackInputStream pdfSource)
+            throws IOException
+    {
+        // as suggested in PDFBOX-1164
+        final int readBytes = pdfSource.read(binCharTestArr, 0, maxBinCharTestLength);
+        boolean noBinData = true;
+
+        if (readBytes > 0)
+        {
+            for (int bIdx = 0; bIdx < readBytes; bIdx++)
+            {
+                final byte b = binCharTestArr[bIdx];
+                if ((b < 0x09) || ((b > 0x0a) && (b < 0x20) && (b != 0x0d)))
+                {
+                    // control character or > 0x7f -> we have binary data
+                    noBinData = false;
+                    break;
+                }
+            }
+            pdfSource.unread(binCharTestArr, 0, readBytes);
+        }
+        return noBinData;
+    }
+
+    /**
      * This will read an operator from the stream.
      *
      * @return The operator that was read from the stream.