NonSequentialPDFParser.java

lehmi Sun, 08 Dec 2013 04:59:34 -0800

Author: lehmi
Date: Sun Dec  8 12:58:49 2013
New Revision: 1549025

URL: http://svn.apache.org/r1549025
Log:
PDFBOX-1769: identify corrupt stream length and use workaround if necessary


Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1549025&r1=1549024&r2=1549025&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sun Dec  8 12:58:49 2013
@@ -110,6 +110,7 @@ public class NonSequentialPDFParser exte
     protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' };
 
     private final File pdfFile;
+    private long fileLen;
     private final RandomAccessBufferedFileInputStream raStream;
 
     /**
@@ -558,7 +559,7 @@ public class NonSequentialPDFParser exte
         long skipBytes;
 
         // ---- read trailing bytes into buffer
-        final long fileLen = pdfFile.length();
+        fileLen = pdfFile.length();
 
         FileInputStream fIn = null;
         try
@@ -1518,31 +1519,34 @@ public class NonSequentialPDFParser exte
                 throw new IOException("Missing length for stream.");
             }
 
+            boolean useReadUntilEnd = false;
             // ---- get output stream to copy data to
             out = stream.createFilteredStream(streamLengthObj);
-
-            long remainBytes = streamLengthObj.longValue();
-            int bytesRead = 0;
-            boolean unexpectedEndOfStream = false;
-            while (remainBytes > 0)
-            {
-                final int readBytes = pdfSource.read(streamCopyBuf, 0,
-                        (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes);
-                if (readBytes <= 0)
-                {
-                    // throw new IOException(
-                    // "No more bytes from stream but expected: " + remainBytes
-                    // );
-                    unexpectedEndOfStream = true;
-                    break;
-                }
-                out.write(streamCopyBuf, 0, readBytes);
-                remainBytes -= readBytes;
-                bytesRead += readBytes;
+            if (validateStreamLength(streamLengthObj.longValue()))
+            {
+                   long remainBytes = streamLengthObj.longValue();
+                   int bytesRead = 0;
+                   while (remainBytes > 0)
+                   {
+                       final int readBytes = pdfSource.read(streamCopyBuf, 0,
+                               (remainBytes > streamCopyBufLen) ? streamCopyBufLen : (int) remainBytes);
+                       if (readBytes <= 0)
+                       {
+                           useReadUntilEnd = true;
+                           pdfSource.unread(bytesRead);
+                           break;
+                       }
+                       out.write(streamCopyBuf, 0, readBytes);
+                       remainBytes -= readBytes;
+                       bytesRead += readBytes;
+                   }
+            }
+            else
+            {
+                useReadUntilEnd = true;
             }
-            if (unexpectedEndOfStream)
+            if (useReadUntilEnd)
             {
-                pdfSource.unread(bytesRead);
                 out = stream.createFilteredStream(streamLengthObj);
                 readUntilEndStream(out);
             }
@@ -1563,6 +1567,28 @@ public class NonSequentialPDFParser exte
         return stream;
     }
 
+    private boolean validateStreamLength(long streamLength) throws IOException
+    {
+       boolean streamLengthIsValid = true;
+       long originOffset = pdfSource.getOffset();
+       long expectedEndOfStream = originOffset + streamLength;
+       if (expectedEndOfStream > fileLen)
+       {
+               streamLengthIsValid = false;
+               LOG.error("The end of the stream is out of range, using workaround to read the stream");
+       }
+       else
+       {
+                       pdfSource.seek(expectedEndOfStream);
+               if (!checkBytesAtOffset("endstream".getBytes("ISO-8859-1")))
+               {
+                       streamLengthIsValid = false;
+                       LOG.error("The end of the stream doesn't point to the correct offset, using workaround to read the stream");
+               }
+               pdfSource.seek(originOffset);
+       }
+       return streamLengthIsValid;
+    }
     private void readUntilEndStream(final OutputStream out) throws IOException
     {
 
@@ -1742,6 +1768,11 @@ public class NonSequentialPDFParser exte
      */
     private long calculateFixingOffset(long objectOffset, byte[] string) throws IOException
     {
+       if (objectOffset < 0)
+       {
+               LOG.error("Invalid object offset " + objectOffset + " for object " + new String(string));
+               return 0;
+       }
        long originOffset = pdfSource.getOffset();
        pdfSource.seek(objectOffset);
        // most likely the object can be found at the given offset

svn commit: r1549025 - /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java

Reply via email to