Author: lehmi
Date: Sun Feb 23 14:07:49 2020
New Revision: 1874427
URL: http://svn.apache.org/viewvc?rev=1874427&view=rev
Log:
PDFBOX-3888: reduce number of seek operations, inspired by
https://github.com/TomRoush/PdfBox-Android/pull/204
Modified:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1874427&r1=1874426&r2=1874427&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Feb 23 14:07:49 2020
@@ -23,7 +23,6 @@ import java.nio.charset.StandardCharsets
import java.security.GeneralSecurityException;
import java.security.KeyStore;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -82,7 +81,7 @@ public class COSParser extends BaseParse
private static final int X = 'x';
private static final int STRMBUFLEN = 2048;
- private final byte[] strmBuf = new byte[ STRMBUFLEN ];
+ private final byte[] strmBuf = new byte[ STRMBUFLEN ];
protected final RandomAccessRead source;
@@ -1056,7 +1055,7 @@ public class COSParser extends BaseParse
}
source.seek(startXRefOffset);
skipSpaces();
- if (source.peek() == X && isString(XREF_TABLE))
+ if (isString(XREF_TABLE))
{
return startXRefOffset;
}
@@ -1433,49 +1432,46 @@ public class COSParser extends BaseParse
Map<String, COSDictionary> trailerDicts = new HashMap<>();
long originOffset = source.getPosition();
source.seek(MINIMUM_SEARCH_OFFSET);
- while (!source.isEOF())
+ // search for trailer marker
+ long trailerOffset = findString(TRAILER_MARKER);
+ while (trailerOffset != -1)
{
- // search for trailer marker
- if (isString(TRAILER_MARKER))
+ try
{
- source.seek(source.getPosition() + TRAILER_MARKER.length);
- try
+ boolean rootFound = false;
+ boolean infoFound = false;
+ skipSpaces();
+ COSDictionary trailerDict = parseCOSDictionary();
+ StringBuilder trailerKeys = new StringBuilder();
+ COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
+ if (rootObj != null)
+ {
+ long objNumber = rootObj.getObjectNumber();
+ int genNumber = rootObj.getGenerationNumber();
+ trailerKeys.append(objNumber).append(" ");
+ trailerKeys.append(genNumber).append(" ");
+ rootFound = true;
+ }
+ COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
+ if (infoObj != null)
{
- boolean rootFound = false;
- boolean infoFound = false;
- skipSpaces();
- COSDictionary trailerDict = parseCOSDictionary();
- StringBuilder trailerKeys = new StringBuilder();
- COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
- if (rootObj != null)
- {
- long objNumber = rootObj.getObjectNumber();
- int genNumber = rootObj.getGenerationNumber();
- trailerKeys.append(objNumber).append(" ");
- trailerKeys.append(genNumber).append(" ");
- rootFound = true;
- }
- COSObject infoObj = trailerDict.getCOSObject(COSName.INFO);
- if (infoObj != null)
- {
- long objNumber = infoObj.getObjectNumber();
- int genNumber = infoObj.getGenerationNumber();
- trailerKeys.append(objNumber).append(" ");
- trailerKeys.append(genNumber).append(" ");
- infoFound = true;
- }
- if (rootFound && infoFound)
- {
- trailerDicts.put(trailerKeys.toString(), trailerDict);
- }
+ long objNumber = infoObj.getObjectNumber();
+ int genNumber = infoObj.getGenerationNumber();
+ trailerKeys.append(objNumber).append(" ");
+ trailerKeys.append(genNumber).append(" ");
+ infoFound = true;
}
- catch (IOException exception)
+ if (rootFound && infoFound)
{
- LOG.debug("An exception occurred during brute force search for trailer - ignoring", exception);
- continue;
+ trailerDicts.put(trailerKeys.toString(), trailerDict);
}
}
- source.read();
+ catch (IOException exception)
+ {
+ LOG.debug("An exception occurred during brute force search for trailer - ignoring",
+ exception);
+ }
+ trailerOffset = findString(TRAILER_MARKER);
}
source.seek(originOffset);
// eliminate double entries
@@ -1562,34 +1558,29 @@ public class COSParser extends BaseParse
long lastEOFMarker = -1;
long originOffset = source.getPosition();
source.seek(MINIMUM_SEARCH_OFFSET);
- while (!source.isEOF())
+ long tempMarker = findString(EOF_MARKER);
+ while (tempMarker != -1)
{
- // search for EOF marker
- if (isString(EOF_MARKER))
+ try
{
- long tempMarker = source.getPosition();
- source.seek(tempMarker + 5);
- try
+ // check if the following data is some valid pdf content
+ // which most likely indicates that the pdf is linearized,
+ // updated or just cut off somewhere in the middle
+ skipSpaces();
+ if (!isString(XREF_TABLE))
{
- // check if the following data is some valid pdf content
- // which most likely indicates that the pdf is linearized,
- // updated or just cut off somewhere in the middle
- skipSpaces();
- if (!isString(XREF_TABLE))
- {
- readObjectNumber();
- readGenerationNumber();
- }
- }
- catch (IOException exception)
- {
- // save the EOF marker as the following data is most likely some garbage
- LOG.debug("An exception occurred during brute force for last EOF - ignoring",
- exception);
- lastEOFMarker = tempMarker;
+ readObjectNumber();
+ readGenerationNumber();
}
}
- source.read();
+ catch (IOException exception)
+ {
+ // save the EOF marker as the following data is most likely some garbage
+ LOG.debug("An exception occurred during brute force for last EOF - ignoring",
+ exception);
+ lastEOFMarker = tempMarker;
+ }
+ tempMarker = findString(EOF_MARKER);
}
source.seek(originOffset);
// no EOF marker found
@@ -1610,14 +1601,15 @@ public class COSParser extends BaseParse
// save origin offset
long originOffset = source.getPosition();
+ Map<Long, COSObjectKey> bfSearchForObjStreamOffsets = bfSearchForObjStreamOffsets();
// log warning about skipped stream
- bfSearchForObjStreamOffsets().entrySet().stream() //
+ bfSearchForObjStreamOffsets.entrySet().stream() //
.filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) == null) //
.forEach(o -> LOG.warn(
"Skipped incomplete object stream:" + o.getValue() + " at " + o.getKey()));
// collect all stream offsets
- List<Long> objStreamOffsets = bfSearchForObjStreamOffsets().entrySet().stream() //
+ List<Long> objStreamOffsets = bfSearchForObjStreamOffsets.entrySet().stream() //
.filter(o -> bfSearchCOSObjectKeyOffsets.get(o.getValue()) != null) //
.filter(o -> o.getKey().equals(bfSearchCOSObjectKeyOffsets.get(o.getValue()))) //
.map(Map.Entry::getKey) //
@@ -1686,70 +1678,66 @@ public class COSParser extends BaseParse
HashMap<Long, COSObjectKey> bfSearchObjStreamsOffsets = new HashMap<>();
source.seek(MINIMUM_SEARCH_OFFSET);
char[] string = " obj".toCharArray();
- while (!source.isEOF())
- {
- // search for object stream marker
- if (isString(OBJ_STREAM))
+ // search for object stream marker
+ long positionObjStream = findString(OBJ_STREAM);
+ while (positionObjStream != -1)
+ {
+ // search backwards for the beginning of the object
+ long newOffset = -1;
+ boolean objFound = false;
+ for (int i = 1; i < 40 && !objFound; i++)
{
- long currentPosition = source.getPosition();
- // search backwards for the beginning of the object
- long newOffset = -1;
- boolean objFound = false;
- for (int i = 1; i < 40 && !objFound; i++)
+ long currentOffset = positionObjStream - (i * 10);
+ if (currentOffset > 0)
{
- long currentOffset = currentPosition - (i * 10);
- if (currentOffset > 0)
+ source.seek(currentOffset);
+ for (int j = 0; j < 10; j++)
{
- source.seek(currentOffset);
- for (int j = 0; j < 10; j++)
+ if (isString(string))
{
- if (isString(string))
+ long tempOffset = currentOffset - 1;
+ source.seek(tempOffset);
+ int genID = source.peek();
+ // is the next char a digit?
+ if (isDigit(genID))
{
- long tempOffset = currentOffset - 1;
+ tempOffset--;
source.seek(tempOffset);
- int genID = source.peek();
- // is the next char a digit?
- if (isDigit(genID))
+ if (isSpace())
{
- tempOffset--;
- source.seek(tempOffset);
- if (isSpace())
+ int length = 0;
+ source.seek(--tempOffset);
+ while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
{
- int length = 0;
source.seek(--tempOffset);
- while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
- {
- source.seek(--tempOffset);
- length++;
- }
- if (length > 0)
- {
- source.read();
- newOffset = source.getPosition();
- long objNumber = readObjectNumber();
- int genNumber = readGenerationNumber();
- COSObjectKey streamObjectKey = new COSObjectKey(
- objNumber, genNumber);
- bfSearchObjStreamsOffsets.put(newOffset,
- streamObjectKey);
- }
+ length++;
+ }
+ if (length > 0)
+ {
+ source.read();
+ newOffset = source.getPosition();
+ long objNumber = readObjectNumber();
+ int genNumber = readGenerationNumber();
+ COSObjectKey streamObjectKey = new COSObjectKey(objNumber,
+ genNumber);
+ bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
}
}
- LOG.debug("Dictionary start for object stream -> " + newOffset);
- objFound = true;
- break;
- }
- else
- {
- currentOffset++;
- source.read();
}
+ LOG.debug("Dictionary start for object stream -> "
+ newOffset);
+ objFound = true;
+ break;
+ }
+ else
+ {
+ currentOffset++;
+ source.read();
}
}
}
- source.seek(currentPosition + OBJ_STREAM.length);
}
- source.read();
+ source.seek(positionObjStream + OBJ_STREAM.length);
+ positionObjStream = findString(OBJ_STREAM);
}
return bfSearchObjStreamsOffsets;
}
@@ -1764,25 +1752,20 @@ public class COSParser extends BaseParse
{
// a pdf may contain more than one xref entry
bfSearchXRefTablesOffsets = new ArrayList<>();
- long originOffset = source.getPosition();
source.seek(MINIMUM_SEARCH_OFFSET);
// search for xref tables
- while (!source.isEOF())
+ long newOffset = findString(XREF_TABLE);
+ while (newOffset != -1)
{
- if (isString(XREF_TABLE))
+ source.seek(newOffset - 1);
+ // ensure that we don't read "startxref" instead of "xref"
+ if (isWhitespace())
{
- long newOffset = source.getPosition();
- source.seek(newOffset - 1);
- // ensure that we don't read "startxref" instead of "xref"
- if (isWhitespace())
- {
- bfSearchXRefTablesOffsets.add(newOffset);
- }
- source.seek(newOffset + 4);
+ bfSearchXRefTablesOffsets.add(newOffset);
}
- source.read();
+ source.seek(newOffset + 4);
+ newOffset = findString(XREF_TABLE);
}
- source.seek(originOffset);
}
}
@@ -1797,75 +1780,70 @@ public class COSParser extends BaseParse
{
// a pdf may contain more than one /XRef entry
bfSearchXRefStreamsOffsets = new ArrayList<>();
- long originOffset = source.getPosition();
source.seek(MINIMUM_SEARCH_OFFSET);
// search for XRef streams
String objString = " obj";
char[] string = objString.toCharArray();
- while (!source.isEOF())
+ long xrefOffset = findString(XREF_STREAM);
+ while (xrefOffset != -1)
{
- if (isString(XREF_STREAM))
+ // search backwards for the beginning of the stream
+ long newOffset = -1;
+ boolean objFound = false;
+ for (int i = 1; i < 40 && !objFound; i++)
{
- // search backwards for the beginning of the stream
- long newOffset = -1;
- long xrefOffset = source.getPosition();
- boolean objFound = false;
- for (int i = 1; i < 40 && !objFound; i++)
+ long currentOffset = xrefOffset - (i * 10);
+ if (currentOffset > 0)
{
- long currentOffset = xrefOffset - (i * 10);
- if (currentOffset > 0)
+ source.seek(currentOffset);
+ for (int j = 0; j < 10; j++)
{
- source.seek(currentOffset);
- for (int j = 0; j < 10; j++)
+ if (isString(string))
{
- if (isString(string))
+ long tempOffset = currentOffset - 1;
+ source.seek(tempOffset);
+ int genID = source.peek();
+ // is the next char a digit?
+ if (isDigit(genID))
{
- long tempOffset = currentOffset - 1;
+ tempOffset--;
source.seek(tempOffset);
- int genID = source.peek();
- // is the next char a digit?
- if (isDigit(genID))
+ if (isSpace())
{
- tempOffset--;
- source.seek(tempOffset);
- if (isSpace())
+ int length = 0;
+ source.seek(--tempOffset);
+ while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
{
- int length = 0;
source.seek(--tempOffset);
- while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
- {
- source.seek(--tempOffset);
- length++;
- }
- if (length > 0)
- {
- source.read();
- newOffset = source.getPosition();
- }
+ length++;
+ }
+ if (length > 0)
+ {
+ source.read();
+ newOffset = source.getPosition();
}
}
- LOG.debug("Fixed reference for xref stream " + xrefOffset
- + " -> " + newOffset);
- objFound = true;
- break;
- }
- else
- {
- currentOffset++;
- source.read();
}
+ LOG.debug("Fixed reference for xref stream " + xrefOffset + " -> "
+ + newOffset);
+ objFound = true;
+ break;
+ }
+ else
+ {
+ currentOffset++;
+ source.read();
}
}
}
- if (newOffset > -1)
- {
- bfSearchXRefStreamsOffsets.add(newOffset);
- }
- source.seek(xrefOffset + 5);
}
- source.read();
+ if (newOffset > -1)
+ {
+ bfSearchXRefStreamsOffsets.add(newOffset);
+ }
+ source.seek(xrefOffset + 5);
+ xrefOffset = findString(XREF_STREAM);
}
- source.seek(originOffset);
}
}
@@ -2084,24 +2062,17 @@ public class COSParser extends BaseParse
*/
private boolean isString(byte[] string) throws IOException
{
- boolean bytesMatching = false;
- if (source.peek() == string[0])
+ boolean bytesMatching = true;
+ long originOffset = source.getPosition();
+ for (byte c : string)
{
- int length = string.length;
- byte[] bytesRead = new byte[length];
- int numberOfBytes = source.read(bytesRead, 0, length);
- while (numberOfBytes < length)
+ if (source.read() != c)
{
- int readMore = source.read(bytesRead, numberOfBytes, length - numberOfBytes);
- if (readMore < 0)
- {
- break;
- }
- numberOfBytes += readMore;
+ bytesMatching = false;
+ break;
}
- bytesMatching = Arrays.equals(string, bytesRead);
- source.rewind(numberOfBytes);
}
+ source.seek(originOffset);
return bytesMatching;
}
@@ -2129,6 +2100,45 @@ public class COSParser extends BaseParse
}
/**
+ * Search for the given string. The search starts at the current position and returns the start position if the
+ * string was found. -1 is returned if there isn't any further occurrence of the given string. After returning the
+ * current position is either the end of the string or the end of the input.
+ *
+ * @param string the string to be searched
+ * @return the start position of the found string
+ * @throws IOException if something went wrong
+ */
+ private long findString(char[] string) throws IOException
+ {
+ long position = -1L;
+ int stringLength = string.length;
+ int counter = 0;
+ int readChar = source.read();
+ while (readChar != -1)
+ {
+ if (readChar == string[counter])
+ {
+ if (counter == 0)
+ {
+ position = source.getPosition();
+ }
+ counter++;
+ if (counter == stringLength)
+ {
+ return position;
+ }
+ }
+ else if (counter > 0)
+ {
+ counter = 0;
+ position = -1L;
+ continue;
+ }
+ readChar = source.read();
+ }
+ return position;
+ }
+ /**
* This will parse the trailer from the stream and add it to the state.
*
* @return false on parsing error