Author: tilman
Date: Fri Mar  7 22:02:25 2014
New Revision: 1575426

URL: http://svn.apache.org/r1575426
Log:
PDFBOX-1164: add heuristic by Timo Boehme to detect wrongly assumed end of 
inline image 

Modified:
    
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java

Modified: 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java?rev=1575426&r1=1575425&r2=1575426&view=diff
==============================================================================
--- 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
 (original)
+++ 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFStreamParser.java
 Fri Mar  7 22:02:25 2014
@@ -19,6 +19,7 @@ package org.apache.pdfbox.pdfparser;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.PushbackInputStream;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -46,6 +47,8 @@ public class PDFStreamParser extends Bas
 {
     private List<Object> streamObjects = new ArrayList<Object>( 100 );
     private RandomAccess file;
+    private final int    maxBinCharTestLength = 5;
+    private final byte[] binCharTestArr = new byte[maxBinCharTestLength];
 
     /**
      * Constructor that takes a stream to parse.
@@ -391,10 +394,11 @@ public class PDFStreamParser extends Bas
                 // PDF spec is kinda unclear about this. Should a whitespace
                 // always appear before EI? Not sure, so that we just read
                 // until EI<whitespace>.
-                // Be aware not all kind of whitespaces are allowed here. see 
PDFBOX1561
+                // Be aware not all kind of whitespaces are allowed here. see 
PDFBOX-1561
                 while( !(lastByte == 'E' &&
                          currentByte == 'I' &&
-                         isSpaceOrReturn()) &&
+                         isSpaceOrReturn() &&
+                         hasNoFollowingBinData( pdfSource )) &&
                        !pdfSource.isEOF() )
                 {
                     imageData.write( lastByte );
@@ -436,6 +440,37 @@ public class PDFStreamParser extends Bas
     }
 
     /**
+     * Peeks at the next bytes of the given stream (at most
+     * <code>maxBinCharTestLength</code>) and checks whether any of them looks
+     * like binary data. Tab (0x09), line feed (0x0a), carriage return (0x0d)
+     * and the values 0x20 to 0x7f are accepted; all other control characters
+     * and every value above 0x7f count as binary data. The bytes that were
+     * read are pushed back afterwards, so the stream position is unchanged.
+     *
+     * @param pdfSource the stream to inspect.
+     * @return <code>true</code> if none of the inspected bytes is binary data
+     * or if no bytes could be read, otherwise <code>false</code>
+     * @throws IOException if reading from or unreading to the stream fails.
+     */
+    private boolean hasNoFollowingBinData(final PushbackInputStream pdfSource) 
+            throws IOException
+    {
+        // as suggested in PDFBOX-1164
+        final int readBytes = pdfSource.read(binCharTestArr, 0, 
maxBinCharTestLength);
+        boolean noBinData = true;
+        
+        if (readBytes > 0)
+        {
+            for (int bIdx = 0; bIdx < readBytes; bIdx++)
+            {
+                final byte b = binCharTestArr[bIdx];
+                if ((b < 0x09) || ((b > 0x0a) && (b < 0x20) && (b != 0x0d)))
+                {
+                    // control char other than tab/LF/CR, or > 0x7f (negative as signed byte) -> binary data
+                    noBinData = false;
+                    break;
+                }
+            }
+            pdfSource.unread(binCharTestArr, 0, readBytes);
+        }
+        return noBinData;
+    }
+
+    /**
      * This will read an operator from the stream.
      *
      * @return The operator that was read from the stream.


Reply via email to