Author: lehmi
Date: Sun Feb 23 14:07:49 2020
New Revision: 1874427

URL: http://svn.apache.org/viewvc?rev=1874427&view=rev
Log:
PDFBOX-3888: reduce number of seek operations, inspired by 
https://github.com/TomRoush/PdfBox-Android/pull/204

Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1874427&r1=1874426&r2=1874427&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Feb 23 14:07:49 2020
@@ -23,7 +23,6 @@ import java.nio.charset.StandardCharsets
 import java.security.GeneralSecurityException;
 import java.security.KeyStore;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -82,7 +81,7 @@ public class COSParser extends BaseParse
     private static final int X = 'x';
 
     private static final int STRMBUFLEN = 2048;
-    private final byte[] strmBuf    = new byte[ STRMBUFLEN ];
+    private final byte[] strmBuf = new byte[ STRMBUFLEN ];
 
     protected final RandomAccessRead source;
 
@@ -1056,7 +1055,7 @@ public class COSParser extends BaseParse
         }
         source.seek(startXRefOffset);
         skipSpaces();
-        if (source.peek() == X && isString(XREF_TABLE))
+        if (isString(XREF_TABLE))
         {
             return startXRefOffset;
         }
@@ -1433,49 +1432,46 @@ public class COSParser extends BaseParse
         Map<String, COSDictionary> trailerDicts = new HashMap<>();
         long originOffset = source.getPosition();
         source.seek(MINIMUM_SEARCH_OFFSET);
-        while (!source.isEOF())
+        // search for trailer marker
+        long trailerOffset = findString(TRAILER_MARKER);
+        while (trailerOffset != -1)
         {
-            // search for trailer marker
-            if (isString(TRAILER_MARKER))
+            try
             {
-                source.seek(source.getPosition() + TRAILER_MARKER.length);
-                try
+                boolean rootFound = false;
+                boolean infoFound = false;
+                skipSpaces();
+                COSDictionary trailerDict = parseCOSDictionary();
+                StringBuilder trailerKeys = new StringBuilder();
+                COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
+                if (rootObj != null)
+                {
+                    long objNumber = rootObj.getObjectNumber();
+                    int genNumber = rootObj.getGenerationNumber();
+                    trailerKeys.append(objNumber).append(" ");
+                    trailerKeys.append(genNumber).append(" ");
+                    rootFound = true;
+                }
+                COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
+                if (infoObj != null)
                 {
-                    boolean rootFound = false;
-                    boolean infoFound = false;
-                    skipSpaces();
-                    COSDictionary trailerDict = parseCOSDictionary();
-                    StringBuilder trailerKeys = new StringBuilder();
-                    COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
-                    if (rootObj != null)
-                    {
-                        long objNumber = rootObj.getObjectNumber();
-                        int genNumber = rootObj.getGenerationNumber();
-                        trailerKeys.append(objNumber).append(" ");
-                        trailerKeys.append(genNumber).append(" ");
-                        rootFound = true;
-                    }
-                    COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
-                    if (infoObj != null)
-                    {
-                        long objNumber = infoObj.getObjectNumber();
-                        int genNumber = infoObj.getGenerationNumber();
-                        trailerKeys.append(objNumber).append(" ");
-                        trailerKeys.append(genNumber).append(" ");
-                        infoFound = true;
-                    }
-                    if (rootFound && infoFound)
-                    {
-                        trailerDicts.put(trailerKeys.toString(), trailerDict);
-                    }
+                    long objNumber = infoObj.getObjectNumber();
+                    int genNumber = infoObj.getGenerationNumber();
+                    trailerKeys.append(objNumber).append(" ");
+                    trailerKeys.append(genNumber).append(" ");
+                    infoFound = true;
                 }
-                catch (IOException exception)
+                if (rootFound && infoFound)
                 {
-                    LOG.debug("An exception occurred during brute force search for trailer - ignoring", exception);
-                    continue;
+                    trailerDicts.put(trailerKeys.toString(), trailerDict);
                 }
             }
-            source.read();
+            catch (IOException exception)
+            {
+                LOG.debug("An exception occurred during brute force search for trailer - ignoring",
+                        exception);
+            }
+            trailerOffset = findString(TRAILER_MARKER);
         }
         source.seek(originOffset);
         // eliminate double entries
@@ -1562,34 +1558,29 @@ public class COSParser extends BaseParse
         long lastEOFMarker = -1;
         long originOffset = source.getPosition();
         source.seek(MINIMUM_SEARCH_OFFSET);
-        while (!source.isEOF())
+        long tempMarker = findString(EOF_MARKER);
+        while (tempMarker != -1)
         {
-            // search for EOF marker
-            if (isString(EOF_MARKER))
+            try
             {
-                long tempMarker = source.getPosition();
-                source.seek(tempMarker + 5);
-                try
+                // check if the following data is some valid pdf content
+                // which most likely indicates that the pdf is linearized,
+                // updated or just cut off somewhere in the middle
+                skipSpaces();
+                if (!isString(XREF_TABLE))
                 {
-                    // check if the following data is some valid pdf content
-                    // which most likely indicates that the pdf is linearized,
-                    // updated or just cut off somewhere in the middle
-                    skipSpaces();
-                    if (!isString(XREF_TABLE))
-                    {
-                        readObjectNumber();
-                        readGenerationNumber();
-                    }
-                }
-                catch (IOException exception)
-                {
-                    // save the EOF marker as the following data is most likely some garbage
-                    LOG.debug("An exception occurred during brute force for last EOF - ignoring",
-                            exception);
-                    lastEOFMarker = tempMarker;
+                    readObjectNumber();
+                    readGenerationNumber();
                 }
             }
-            source.read();
+            catch (IOException exception)
+            {
+                // save the EOF marker as the following data is most likely some garbage
+                LOG.debug("An exception occurred during brute force for last EOF - ignoring",
+                        exception);
+                lastEOFMarker = tempMarker;
+            }
+            tempMarker = findString(EOF_MARKER);
         }
         source.seek(originOffset);
         // no EOF marker found
@@ -1610,14 +1601,15 @@ public class COSParser extends BaseParse
         // save origin offset
         long originOffset = source.getPosition();
 
+        Map<Long, COSObjectKey> bfSearchForObjStreamOffsets = bfSearchForObjStreamOffsets();
         // log warning about skipped stream
-        bfSearchForObjStreamOffsets().entrySet().stream() //
+        bfSearchForObjStreamOffsets.entrySet().stream() //
                 .filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) == null) //
                 .forEach(o -> LOG.warn(
                         "Skipped incomplete object stream:" + o.getValue() + " at " + o.getKey()));
 
         // collect all stream offsets
-        List<Long> objStreamOffsets = bfSearchForObjStreamOffsets().entrySet().stream() //
+        List<Long> objStreamOffsets = bfSearchForObjStreamOffsets.entrySet().stream() //
                 .filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) != null) //
                 .filter(o -> o.getKey().equals(bfSearchCOSObjectKeyOffsets.get(o.getValue()))) //
                 .map(Map.Entry::getKey) //
@@ -1686,70 +1678,66 @@ public class COSParser extends BaseParse
         HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
         source.seek(MINIMUM_SEARCH_OFFSET);
         char[] string = " obj".toCharArray();
-        while (!source.isEOF())
-        {
-            // search for object stream marker
-            if (isString(OBJ_STREAM))
+        // search for object stream marker
+        long positionObjStream = findString(OBJ_STREAM);
+        while (positionObjStream != -1)
+        {
+            // search backwards for the beginning of the object
+            long newOffset = -1;
+            boolean objFound = false;
+            for (int i = 1; i < 40 && !objFound; i++)
             {
-                long currentPosition = source.getPosition();
-                // search backwards for the beginning of the object
-                long newOffset = -1;
-                boolean objFound = false;
-                for (int i = 1; i < 40 && !objFound; i++)
+                long currentOffset = positionObjStream - (i * 10);
+                if (currentOffset > 0)
                 {
-                    long currentOffset = currentPosition - (i * 10);
-                    if (currentOffset > 0)
+                    source.seek(currentOffset);
+                    for (int j = 0; j < 10; j++)
                     {
-                        source.seek(currentOffset);
-                        for (int j = 0; j < 10; j++)
+                        if (isString(string))
                         {
-                            if (isString(string))
+                            long tempOffset = currentOffset - 1;
+                            source.seek(tempOffset);
+                            int genID = source.peek();
+                            // is the next char a digit?
+                            if (isDigit(genID))
                             {
-                                long tempOffset = currentOffset - 1;
+                                tempOffset--;
                                 source.seek(tempOffset);
-                                int genID = source.peek();
-                                // is the next char a digit?
-                                if (isDigit(genID))
+                                if (isSpace())
                                 {
-                                    tempOffset--;
-                                    source.seek(tempOffset);
-                                    if (isSpace())
+                                    int length = 0;
+                                    source.seek(--tempOffset);
+                                    while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
                                     {
-                                        int length = 0;
                                         source.seek(--tempOffset);
-                                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
-                                        {
-                                            source.seek(--tempOffset);
-                                            length++;
-                                        }
-                                        if (length > 0)
-                                        {
-                                            source.read();
-                                            newOffset = source.getPosition();
-                                            long objNumber = readObjectNumber();
-                                            int genNumber = readGenerationNumber();
-                                            COSObjectKey streamObjectKey = new COSObjectKey(
-                                                    objNumber, genNumber);
-                                            bfSearchObjStreamsOffsets.put(newOffset,
-                                                    streamObjectKey);
-                                        }
+                                        length++;
+                                    }
+                                    if (length > 0)
+                                    {
+                                        source.read();
+                                        newOffset = source.getPosition();
+                                        long objNumber = readObjectNumber();
+                                        int genNumber = readGenerationNumber();
+                                        COSObjectKey streamObjectKey = new COSObjectKey(objNumber,
+                                                genNumber);
+                                        bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
                                     }
                                 }
-                                LOG.debug("Dictionary start for object stream -> " + newOffset);
-                                objFound = true;
-                                break;
-                            }
-                            else
-                            {
-                                currentOffset++;
-                                source.read();
                             }
+                            LOG.debug("Dictionary start for object stream -> " + newOffset);
+                            objFound = true;
+                            break;
+                        }
+                        else
+                        {
+                            currentOffset++;
+                            source.read();
                         }
                     }
                 }
-                source.seek(currentPosition + OBJ_STREAM.length);
             }
-            source.read();
+            source.seek(positionObjStream + OBJ_STREAM.length);
+            positionObjStream = findString(OBJ_STREAM);
         }
         return bfSearchObjStreamsOffsets;
     }
@@ -1764,25 +1752,20 @@ public class COSParser extends BaseParse
         {
             // a pdf may contain more than one xref entry
             bfSearchXRefTablesOffsets = new ArrayList<>();
-            long originOffset = source.getPosition();
             source.seek(MINIMUM_SEARCH_OFFSET);
             // search for xref tables
-            while (!source.isEOF())
+            long newOffset = findString(XREF_TABLE);
+            while (newOffset != -1)
             {
-                if (isString(XREF_TABLE))
+                source.seek(newOffset - 1);
+                // ensure that we don't read "startxref" instead of "xref"
+                if (isWhitespace())
                 {
-                    long newOffset = source.getPosition();
-                    source.seek(newOffset - 1);
-                    // ensure that we don't read "startxref" instead of "xref"
-                    if (isWhitespace())
-                    {
-                        bfSearchXRefTablesOffsets.add(newOffset);
-                    }
-                    source.seek(newOffset + 4);
+                    bfSearchXRefTablesOffsets.add(newOffset);
                 }
-                source.read();
+                source.seek(newOffset + 4);
+                newOffset = findString(XREF_TABLE);
             }
-            source.seek(originOffset);
         }
     }
 
@@ -1797,75 +1780,70 @@ public class COSParser extends BaseParse
         {
             // a pdf may contain more than one /XRef entry
             bfSearchXRefStreamsOffsets = new ArrayList<>();
-            long originOffset = source.getPosition();
             source.seek(MINIMUM_SEARCH_OFFSET);
             // search for XRef streams
             String objString = " obj";
             char[] string = objString.toCharArray();
-            while (!source.isEOF())
+            long xrefOffset = findString(XREF_STREAM);
+            while (xrefOffset != -1)
             {
-                if (isString(XREF_STREAM))
+                // search backwards for the beginning of the stream
+                long newOffset = -1;
+                boolean objFound = false;
+                for (int i = 1; i < 40 && !objFound; i++)
                 {
-                    // search backwards for the beginning of the stream
-                    long newOffset = -1;
-                    long xrefOffset = source.getPosition();
-                    boolean objFound = false;
-                    for (int i = 1; i < 40 && !objFound; i++)
+                    long currentOffset = xrefOffset - (i * 10);
+                    if (currentOffset > 0)
                     {
-                        long currentOffset = xrefOffset - (i * 10);
-                        if (currentOffset > 0)
+                        source.seek(currentOffset);
+                        for (int j = 0; j < 10; j++)
                         {
-                            source.seek(currentOffset);
-                            for (int j = 0; j < 10; j++)
+                            if (isString(string))
                             {
-                                if (isString(string))
+                                long tempOffset = currentOffset - 1;
+                                source.seek(tempOffset);
+                                int genID = source.peek();
+                                // is the next char a digit?
+                                if (isDigit(genID))
                                 {
-                                    long tempOffset = currentOffset - 1;
+                                    tempOffset--;
                                     source.seek(tempOffset);
-                                    int genID = source.peek();
-                                    // is the next char a digit?
-                                    if (isDigit(genID))
+                                    if (isSpace())
                                     {
-                                        tempOffset--;
-                                        source.seek(tempOffset);
-                                        if (isSpace())
+                                        int length = 0;
+                                        source.seek(--tempOffset);
+                                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
                                         {
-                                            int length = 0;
                                             source.seek(--tempOffset);
-                                            while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
-                                            {
-                                                source.seek(--tempOffset);
-                                                length++;
-                                            }
-                                            if (length > 0)
-                                            {
-                                                source.read();
-                                                newOffset = source.getPosition();
-                                            }
+                                            length++;
+                                        }
+                                        if (length > 0)
+                                        {
+                                            source.read();
+                                            newOffset = source.getPosition();
                                         }
                                     }
-                                    LOG.debug("Fixed reference for xref stream " + xrefOffset
-                                            + " -> " + newOffset);
-                                    objFound = true;
-                                    break;
-                                }
-                                else
-                                {
-                                    currentOffset++;
-                                    source.read();
                                 }
+                                LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> "
+                                        + newOffset);
+                                objFound = true;
+                                break;
+                            }
+                            else
+                            {
+                                currentOffset++;
+                                source.read();
                             }
                         }
                     }
-                    if (newOffset > -1)
-                    {
-                        bfSearchXRefStreamsOffsets.add(newOffset);
-                    }
-                    source.seek(xrefOffset + 5);
                 }
-                source.read();
+                if (newOffset > -1)
+                {
+                    bfSearchXRefStreamsOffsets.add(newOffset);
+                }
+                source.seek(xrefOffset + 5);
+                xrefOffset = findString(XREF_STREAM);
             }
-            source.seek(originOffset);
         }
     }
     
@@ -2084,24 +2062,17 @@ public class COSParser extends BaseParse
      */
     private boolean isString(byte[] string) throws IOException
     {
-        boolean bytesMatching = false;
-        if (source.peek() == string[0])
+        boolean bytesMatching = true;
+        long originOffset = source.getPosition();
+        for (byte c : string)
         {
-            int length = string.length;
-            byte[] bytesRead = new byte[length];
-            int numberOfBytes = source.read(bytesRead, 0, length);
-            while (numberOfBytes < length)
+            if (source.read() != c)
             {
-                int readMore = source.read(bytesRead, numberOfBytes, length - numberOfBytes);
-                if (readMore < 0)
-                {
-                    break;
-                }
-                numberOfBytes += readMore;
+                bytesMatching = false;
+                break;
             }
-            bytesMatching = Arrays.equals(string, bytesRead);
-            source.rewind(numberOfBytes);
         }
+        source.seek(originOffset);
         return bytesMatching;
     }
 
@@ -2129,6 +2100,45 @@ public class COSParser extends BaseParse
     }
 
     /**
+     * Search for the given string. The search starts at the current position and returns the start position if the
+     * string was found. -1 is returned if there isn't any further occurrence of the given string. After returning the
+     * current position is either the end of the string or the end of the input.
+     * 
+     * @param string the string to be searched
+     * @return the start position of the found string
+     * @throws IOException if something went wrong
+     */
+    private long findString(char[] string) throws IOException
+    {
+        long position = -1L;
+        int stringLength = string.length;
+        int counter = 0;
+        int readChar = source.read();
+        while (readChar != -1)
+        {
+            if (readChar == string[counter])
+            {
+                if (counter == 0)
+                {
+                    position = source.getPosition();
+                }
+                counter++;
+                if (counter == stringLength)
+                {
+                    return position;
+                }
+            }
+            else if (counter > 0)
+            {
+                counter = 0;
+                position = -1L;
+                continue;
+            }
+            readChar = source.read();
+        }
+        return position;
+    }
+    /**
      * This will parse the trailer from the stream and add it to the state.
      *
      * @return false on parsing error


Reply via email to