pdfparser: BaseParser.java BruteForceParser.java COSParser.java XrefParser.java

lehmi Sun, 07 Apr 2024 22:55:37 -0700

Author: lehmi
Date: Mon Apr  8 05:54:20 2024
New Revision: 1916856

URL: http://svn.apache.org/viewvc?rev=1916856&view=rev
Log:
PDFBOX-5032: implement a separate parser for the xref information. 
BruteForceParser no longer extends COSParser


Added:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java   (with props)
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1916856&r1=1916855&r2=1916856&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Mon Apr  8 05:54:20 2024
@@ -62,7 +62,7 @@ public abstract class BaseParser
 
     private static final long GENERATION_NUMBER_THRESHOLD = 65535;
 
-    static final int MAX_LENGTH_LONG = Long.toString(Long.MAX_VALUE).length();
+    private static final int MAX_LENGTH_LONG = 
Long.toString(Long.MAX_VALUE).length();
 
     private static final Charset ALTERNATIVE_CHARSET;
 
@@ -133,13 +133,25 @@ public abstract class BaseParser
     private static final char[] NULL = { 'n', 'u', 'l', 'l' };
 
     /**
+     * ASCII code for Null.
+     */
+    private static final byte ASCII_NULL = 0;
+    /**
+     * ASCII code for horizontal tab.
+     */
+    private static final byte ASCII_TAB = 9;
+    /**
      * ASCII code for line feed.
      */
-    protected static final byte ASCII_LF = 10;
+    private static final byte ASCII_LF = 10;
+    /**
+     * ASCII code for form feed.
+     */
+    private static final byte ASCII_FF = 12;
     /**
      * ASCII code for carriage return.
      */
-    protected static final byte ASCII_CR = 13;
+    private static final byte ASCII_CR = 13;
     private static final byte ASCII_ZERO = 48;
     private static final byte ASCII_NINE = 57;
     private static final byte ASCII_SPACE = 32;
@@ -251,8 +263,7 @@ public abstract class BaseParser
     {
         if (document == null)
         {
-            throw new IOException("object reference " + key + " at offset " + 
source
-                    .getPosition()
+            throw new IOException("object reference " + key + " at offset " + 
source.getPosition()
                     + " in content stream");
         }
         return document.getObjectFromPool(key);
@@ -389,32 +400,35 @@ public abstract class BaseParser
         return true;
     }
 
+    /**
+     * Skip the upcoming CRLF or LF which are supposed to follow a stream.
+     * 
+     * @throws IOException
+     */
     protected void skipWhiteSpaces() throws IOException
     {
         //PDF Ref 3.2.7 A stream must be followed by either
         //a CRLF or LF but nothing else.
-
         int whitespace = source.read();
-
         //see brother_scan_cover.pdf, it adds whitespaces
         //after the stream but before the start of the
         //data, so just read those first
-        while (ASCII_SPACE == whitespace)
+        while (isSpace(whitespace))
         {
             whitespace = source.read();
         }
 
-        if (ASCII_CR == whitespace)
+        if (isCR(whitespace))
         {
             whitespace = source.read();
-            if (ASCII_LF != whitespace)
+            if (!isLF(whitespace))
             {
                 source.rewind(1);
                 //The spec says this is invalid but it happens in the real
                 //world so we must support it.
             }
         }
-        else if (ASCII_LF != whitespace)
+        else if (!isLF(whitespace))
         {
             //we are in an error.
             //but again we will do a lenient parsing and just assume that 
everything
@@ -460,10 +474,10 @@ public abstract class BaseParser
         // 4. COSDictionary ends in the next line: LF + '>'
         // 5. Next line contains another COSObject: CR + '/'
         // 6. COSDictionary ends in the next line: CR + '>'
-        if (((nextThreeBytes[0] == ASCII_CR || nextThreeBytes[0] == ASCII_LF)
+        if (((isCR(nextThreeBytes[0]) || isLF(nextThreeBytes[0]))
                 && (nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')) //
                 || //
-                (nextThreeBytes[0] == ASCII_CR && nextThreeBytes[1] == ASCII_LF
+                (isCR(nextThreeBytes[0]) && isLF(nextThreeBytes[1])
                         && (nextThreeBytes[2] == '/' || nextThreeBytes[2] == 
'>')) //
         )
         {
@@ -781,14 +795,14 @@ public abstract class BaseParser
      * @param ch The character
      * @return true if the character terminates a PDF name, otherwise false.
      */
-    protected boolean isEndOfName(int ch)
+    protected static boolean isEndOfName(int ch)
     {
         switch (ch)
         {
         case ASCII_SPACE:
         case ASCII_CR:
         case ASCII_LF:
-        case 9:
+        case ASCII_TAB:
         case '>':
         case '<':
         case '[':
@@ -796,7 +810,7 @@ public abstract class BaseParser
         case ']':
         case ')':
         case '(':
-        case 0:
+        case ASCII_NULL:
         case '\f':
         case '%':
         case -1:
@@ -872,8 +886,8 @@ public abstract class BaseParser
     }
 
     /**
-     * Tries to decode the buffer cotent to an UTF-8 String.
-     * If that fails, tries the alternative Encoding.
+     * Tries to decode the buffer content to an UTF-8 String. If that fails, 
tries the alternative Encoding.
+     * 
      * @param buffer the {@link ByteArrayOutputStream} containing the bytes to 
decode
      * @return the decoded String
      */
@@ -935,7 +949,7 @@ public abstract class BaseParser
         case (char)-1:
             return null;
         default:
-            if( Character.isDigit(c) || c == '-' || c == '+' || c == '.')
+            if (isDigit(c) || c == '-' || c == '+' || c == '.')
             {
                 return parseCOSNumber();
             }
@@ -1086,22 +1100,10 @@ public abstract class BaseParser
     /**
      * This will tell if the next character is a closing brace( close of PDF 
array ).
      *
-     * @return true if the next byte is ']', false otherwise.
-     *
-     * @throws IOException If an IO error occurs.
-     */
-    protected boolean isClosing() throws IOException
-    {
-        return isClosing(source.peek());
-    }
-
-    /**
-     * This will tell if the next character is a closing brace( close of PDF 
array ).
-     *
      * @param c The character to check against end of line
      * @return true if the next byte is ']', false otherwise.
      */
-    protected boolean isClosing(int c)
+    protected static boolean isClosing(int c)
     {
         return c == ']';
     }
@@ -1144,18 +1146,6 @@ public abstract class BaseParser
     }
 
     /**
-     * This will tell if the next byte to be read is an end of line byte.
-     *
-     * @return true if the next byte is 0x0A or 0x0D.
-     *
-     * @throws IOException If there is an error reading from the stream.
-     */
-    protected boolean isEOL() throws IOException
-    {
-        return isEOL(source.peek());
-    }
-
-    /**
      * This will tell if the end of the data is reached.
      * 
      * @return true if the end of the data is reached.
@@ -1172,17 +1162,29 @@ public abstract class BaseParser
      * @param c The character to check against end of line
      * @return true if the next byte is 0x0A or 0x0D.
      */
-    protected boolean isEOL(int c)
+    protected static boolean isEOL(int c)
     {
         return isLF(c) || isCR(c);
     }
 
-    private boolean isLF(int c)
+    /**
+     * This will tell if the next byte to be read is a line feed.
+     *
+     * @param c The character to check against line feed
+     * @return true if the next byte is 0x0A.
+     */
+    private static boolean isLF(int c)
     {
         return ASCII_LF == c;
     }
 
-    private boolean isCR(int c)
+    /**
+     * This will tell if the next byte to be read is a carriage return.
+     *
+     * @param c The character to check against carriage return
+     * @return true if the next byte is 0x0D.
+     */
+    private static boolean isCR(int c)
     {
         return ASCII_CR == c;
     }
@@ -1209,9 +1211,9 @@ public abstract class BaseParser
     {
         switch (c)
         {
-        case 0:
-        case 9:
-        case 12:
+        case ASCII_NULL:
+        case ASCII_TAB:
+        case ASCII_FF:
         case ASCII_LF:
         case ASCII_CR:
         case ASCII_SPACE:
@@ -1239,7 +1241,7 @@ public abstract class BaseParser
      * @param c The character to check against space
      * @return true if the next byte in the stream is a space character.
      */
-    protected boolean isSpace(int c)
+    private static boolean isSpace(int c)
     {
         return ASCII_SPACE == c;
     }
@@ -1327,7 +1329,8 @@ public abstract class BaseParser
         int retval = readInt();
         if(retval < 0 || retval > GENERATION_NUMBER_THRESHOLD)
         {
-            throw new IOException("Generation Number '" + retval + "' has more 
than 5 digits");
+            throw new IOException(
+                    "Generation Number '" + retval + "' has more than 5 digits 
or is negative");
         }
         return retval;
     }
@@ -1399,7 +1402,7 @@ public abstract class BaseParser
     {
         int lastByte;
         StringBuilder buffer = new StringBuilder();
-        while ((lastByte = source.read()) >= '0' && lastByte <= '9')
+        while (isDigit(lastByte = source.read()))
         {
             buffer.append( (char)lastByte );
             if (buffer.length() > MAX_LENGTH_LONG)

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java?rev=1916856&r1=1916855&r2=1916856&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java Mon Apr  8 05:54:20 2024
@@ -42,7 +42,7 @@ import org.apache.pdfbox.pdmodel.encrypt
 /**
  * Brute force parser to be used as last resort if a malformed pdf can't be 
read.
  */
-public class BruteForceParser extends COSParser
+public class BruteForceParser
 {
     private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' };
     private static final char[] XREF_STREAM = { '/', 'X', 'R', 'e', 'f' };
@@ -77,18 +77,23 @@ public class BruteForceParser extends CO
 
     private boolean bfSearchTriggered = false;
 
+    private final COSParser parser;
+    private final COSDocument document;
+    private final RandomAccessRead source;
+
     /**
      * Constructor. Triggers a brute force search for all objects of the 
document.
      *
-     * @param source input representing the pdf.
-     * @param document the corresponding COS document
+     * @param cosDocument the corresponding COS document
+     * @param cosParser the COSParser to be used for reading the pdf
      * 
      * @throws IOException if the source data could not be read
      */
-    public BruteForceParser(RandomAccessRead source, COSDocument document) 
throws IOException
+    public BruteForceParser(COSDocument cosDocument, COSParser cosParser) 
throws IOException
     {
-        super(source);
-        this.document = document;
+        document = cosDocument;
+        parser = cosParser;
+        source = parser.source;
     }
 
     /**
@@ -140,25 +145,25 @@ public class BruteForceParser extends CO
             source.seek(currentOffset);
             int nextChar = source.read();
             currentOffset++;
-            if (isWhitespace(nextChar) && isString(OBJ_MARKER))
+            if (COSParser.isWhitespace(nextChar) && 
parser.isString(OBJ_MARKER))
             {
                 long tempOffset = currentOffset - 2;
                 source.seek(tempOffset);
                 int genID = source.peek();
                 // is the next char a digit?
-                if (isDigit(genID))
+                if (COSParser.isDigit(genID))
                 {
                     genID -= 48;
                     tempOffset--;
                     source.seek(tempOffset);
-                    if (isWhitespace())
+                    if (parser.isWhitespace())
                     {
-                        while (tempOffset > MINIMUM_SEARCH_OFFSET && 
isWhitespace())
+                        while (tempOffset > MINIMUM_SEARCH_OFFSET && 
parser.isWhitespace())
                         {
                             source.seek(--tempOffset);
                         }
                         boolean objectIDFound = false;
-                        while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit())
+                        while (tempOffset > MINIMUM_SEARCH_OFFSET && 
parser.isDigit())
                         {
                             source.seek(--tempOffset);
                             objectIDFound = true;
@@ -166,7 +171,7 @@ public class BruteForceParser extends CO
                         if (objectIDFound)
                         {
                             source.read();
-                            long objectId = readObjectNumber();
+                            long objectId = parser.readObjectNumber();
                             if (lastObjOffset > 0)
                             {
                                 // add the former object ID only if there was 
a subsequent object ID
@@ -185,21 +190,21 @@ public class BruteForceParser extends CO
             // check for "endo" as abbreviation for "endobj", as the pdf may 
be cut off
             // in the middle of the keyword, see PDFBOX-3936.
             // We could possibly implement a more intelligent algorithm if 
necessary
-            else if (nextChar == 'e' && isString(endobjString))
+            else if (nextChar == 'e' && parser.isString(endobjString))
             {
                 currentOffset += endobjString.length;
                 source.seek(currentOffset);
-                if (source.isEOF())
+                if (parser.isEOF())
                 {
                     endOfObjFound = true;
                 }
-                else if (isString(endobjRemainingString))
+                else if (parser.isString(endobjRemainingString))
                 {
                     currentOffset += endobjRemainingString.length;
                     endOfObjFound = true;
                 }
             }
-        } while (currentOffset < lastEOFMarker && !source.isEOF());
+        } while (currentOffset < lastEOFMarker && !parser.isEOF());
         if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset 
> 0)
         {
             // if the pdf wasn't cut off in the middle or if the last object 
ends with a "endobj" marker
@@ -291,15 +296,12 @@ public class BruteForceParser extends CO
     /**
      * Brute force search for all objects streams of a pdf.
      * 
-     * @param trailerResolver the trailer resolver of the document
-     * @param securityHandler security handler to be used to decrypt encrypted 
documents
+     * @param xrefTable the cross reference table of the document
+     * 
      * @throws IOException if something went wrong
      */
-    protected void bfSearchForObjStreams(XrefTrailerResolver trailerResolver,
-            SecurityHandler<? extends ProtectionPolicy> securityHandler) 
throws IOException
+    protected void bfSearchForObjStreams(Map<COSObjectKey, Long> xrefTable) 
throws IOException
     {
-        // update security handler
-        this.securityHandler = securityHandler;
         // save origin offset
         long originOffset = source.getPosition();
 
@@ -318,24 +320,24 @@ public class BruteForceParser extends CO
                 .map(Map.Entry::getKey) //
                 .collect(Collectors.toList());
         // add all found compressed objects to the brute force search result
+        SecurityHandler<? extends ProtectionPolicy> securityHandler = 
parser.getSecurityHandler();
         for (Long offset : objStreamOffsets)
         {
             source.seek(offset);
-            long stmObjNumber = readObjectNumber();
-            int stmGenNumber = readGenerationNumber();
-            readExpectedString(OBJ_MARKER, true);
+            long stmObjNumber = parser.readObjectNumber();
+            int stmGenNumber = parser.readGenerationNumber();
+            parser.readExpectedString(OBJ_MARKER, true);
             COSStream stream = null;
             try
             {
-                COSDictionary dict = parseCOSDictionary(false);
-                stream = parseCOSStream(dict);
+                COSDictionary dict = parser.parseCOSDictionary(false);
+                stream = parser.parseCOSStream(dict);
                 if (securityHandler != null)
                 {
                     securityHandler.decryptStream(stream, stmObjNumber, 
stmGenNumber);
                 }
                 PDFObjectStreamParser objStreamParser = new 
PDFObjectStreamParser(stream, document);
                 Map<Long, Integer> objectNumbers = 
objStreamParser.readObjectNumbers();
-                Map<COSObjectKey, Long> xrefOffset = 
trailerResolver.getXrefTable();
                 for (Long objNumber : objectNumbers.keySet())
                 {
                     COSObjectKey objKey = new COSObjectKey(objNumber, 0);
@@ -349,7 +351,7 @@ public class BruteForceParser extends CO
                     if (existingOffset == null || offset > existingOffset)
                     {
                         bfCOSObjectOffsets.put(objKey, -stmObjNumber);
-                        xrefOffset.put(objKey, -stmObjNumber);
+                        xrefTable.put(objKey, -stmObjNumber);
                     }
                 }
             }
@@ -389,8 +391,8 @@ public class BruteForceParser extends CO
             {
                 boolean rootFound = false;
                 boolean infoFound = false;
-                skipSpaces();
-                COSDictionary trailerDict = parseCOSDictionary(true);
+                parser.skipSpaces();
+                COSDictionary trailerDict = parser.parseCOSDictionary(true);
                 COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT);
                 if (rootObj != null)
                 {
@@ -532,11 +534,11 @@ public class BruteForceParser extends CO
                 // check if the following data is some valid pdf content
                 // which most likely indicates that the pdf is linearized,
                 // updated or just cut off somewhere in the middle
-                skipSpaces();
-                if (!isString(XREF_TABLE))
+                parser.skipSpaces();
+                if (!parser.isString(XREF_TABLE))
                 {
-                    readObjectNumber();
-                    readGenerationNumber();
+                    parser.readObjectNumber();
+                    parser.readGenerationNumber();
                 }
             }
             catch (IOException exception)
@@ -583,21 +585,20 @@ public class BruteForceParser extends CO
                     source.seek(currentOffset);
                     for (int j = 0; j < 10; j++)
                     {
-                        if (isString(string))
+                        if (parser.isString(string))
                         {
                             long tempOffset = currentOffset - 1;
                             source.seek(tempOffset);
-                            int genID = source.peek();
                             // is the next char a digit?
-                            if (isDigit(genID))
+                            if (parser.isDigit())
                             {
                                 tempOffset--;
                                 source.seek(tempOffset);
-                                if (isSpace())
+                                if (parser.isSpace())
                                 {
                                     int length = 0;
                                     source.seek(--tempOffset);
-                                    while (tempOffset > MINIMUM_SEARCH_OFFSET 
&& isDigit())
+                                    while (tempOffset > MINIMUM_SEARCH_OFFSET 
&& parser.isDigit())
                                     {
                                         source.seek(--tempOffset);
                                         length++;
@@ -606,8 +607,8 @@ public class BruteForceParser extends CO
                                     {
                                         source.read();
                                         newOffset = source.getPosition();
-                                        long objNumber = readObjectNumber();
-                                        int genNumber = readGenerationNumber();
+                                        long objNumber = 
parser.readObjectNumber();
+                                        int genNumber = 
parser.readGenerationNumber();
                                         COSObjectKey streamObjectKey = new 
COSObjectKey(objNumber,
                                                 genNumber);
                                         
bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey);
@@ -648,7 +649,7 @@ public class BruteForceParser extends CO
         {
             source.seek(newOffset - 1);
             // ensure that we don't read "startxref" instead of "xref"
-            if (isWhitespace())
+            if (parser.isWhitespace())
             {
                 bfSearchXRefTablesOffsets.add(newOffset);
             }
@@ -685,21 +686,20 @@ public class BruteForceParser extends CO
                     source.seek(currentOffset);
                     for (int j = 0; j < 10; j++)
                     {
-                        if (isString(string))
+                        if (parser.isString(string))
                         {
                             long tempOffset = currentOffset - 1;
                             source.seek(tempOffset);
-                            int genID = source.peek();
                             // is the next char a digit?
-                            if (isDigit(genID))
+                            if (parser.isDigit())
                             {
                                 tempOffset--;
                                 source.seek(tempOffset);
-                                if (isSpace())
+                                if (parser.isSpace())
                                 {
                                     int length = 0;
                                     source.seek(--tempOffset);
-                                    while (tempOffset > MINIMUM_SEARCH_OFFSET 
&& isDigit())
+                                    while (tempOffset > MINIMUM_SEARCH_OFFSET 
&& parser.isDigit())
                                     {
                                         source.seek(--tempOffset);
                                         length++;
@@ -759,7 +759,7 @@ public class BruteForceParser extends CO
     /**
      * Tell if the dictionary is a PDF or FDF catalog.
      * 
-     * @param dictionary
+     * @param dictionary the dictionary to be tested
      * @return true if the given dictionary is a root dictionary
      */
     private boolean isCatalog(COSDictionary dictionary)
@@ -810,20 +810,17 @@ public class BruteForceParser extends CO
 
     /**
      * Rebuild the trailer dictionary if startxref can't be found.
+     *
+     * @param xrefTable the cross reference table of the pdf
      * 
-     * @param trailerResolver the trailer resolver of the document
-     * @param securityHandler security handler to be used to decrypt encrypted 
documents
      * @return the rebuild trailer dictionary
      * 
      * @throws IOException if something went wrong
      */
-    protected COSDictionary rebuildTrailer(XrefTrailerResolver trailerResolver,
-            SecurityHandler<? extends ProtectionPolicy> securityHandler) 
throws IOException
+    protected COSDictionary rebuildTrailer(Map<COSObjectKey, Long> xrefTable) 
throws IOException
     {
-        // update security handler
-        this.securityHandler = securityHandler;
-        // reset trailer resolver
-        trailerResolver.reset();
+        // use a new trailer resolver
+        XrefTrailerResolver trailerResolver = new XrefTrailerResolver();
         // use the found objects to rebuild the trailer resolver
         trailerResolver.nextXrefObj(0, XRefType.TABLE);
         getBFCOSObjectOffsets().forEach(trailerResolver::setXRef);
@@ -840,22 +837,23 @@ public class BruteForceParser extends CO
 
         COSDictionary trailer = trailerResolver.getTrailer();
         document.setTrailer(trailer);
+        xrefTable.putAll(trailerResolver.getXrefTable());
         boolean searchForObjStreamsDone = false;
         if (!bfSearchForTrailer(trailer) && !searchForTrailerItems(trailer))
         {
             // root entry wasn't found, maybe it is part of an object stream
             // brute force search for all object streams.
-            bfSearchForObjStreams(trailerResolver, securityHandler);
+            bfSearchForObjStreams(xrefTable);
             searchForObjStreamsDone = true;
             // search again for the root entry
             searchForTrailerItems(trailer);
         }
         // prepare decryption if necessary
-        prepareDecryption();
+        parser.prepareDecryption();
         if (!searchForObjStreamsDone)
         {
             // brute force search for all object streams.
-            bfSearchForObjStreams(trailerResolver, securityHandler);
+            bfSearchForObjStreams(xrefTable);
         }
         return trailer;
     }

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1916856&r1=1916855&r2=1916856&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Mon Apr  8 05:54:20 2024
@@ -25,8 +25,6 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Optional;
-import java.util.Map.Entry;
 import java.util.Set;
 
 import org.apache.logging.log4j.Logger;
@@ -46,7 +44,6 @@ import org.apache.pdfbox.io.IOUtils;
 import org.apache.pdfbox.io.RandomAccessRead;
 import org.apache.pdfbox.io.RandomAccessReadView;
 import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction;
-import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
 import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
@@ -55,7 +52,6 @@ import org.apache.pdfbox.pdmodel.encrypt
 import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
 import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
-import org.apache.pdfbox.util.StringUtil;
 
 /**
  * COS-Parser which first reads startxref and xref tables in order to know 
valid objects and parse only these objects.
@@ -71,17 +67,14 @@ public class COSParser extends BaseParse
     private static final String PDF_DEFAULT_VERSION = "1.4";
     private static final String FDF_DEFAULT_VERSION = "1.0";
 
-    private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' };
     private static final char[] STARTXREF = { 
's','t','a','r','t','x','r','e','f' };
 
     private static final byte[] ENDSTREAM = { E, N, D, S, T, R, E, A, M };
 
     private static final byte[] ENDOBJ = { E, N, D, O, B, J };
 
-    private static final long MINIMUM_SEARCH_OFFSET = 6;
+    protected static final long MINIMUM_SEARCH_OFFSET = 6;
     
-    private static final int X = 'x';
-
     private static final int STRMBUFLEN = 2048;
     private final byte[] strmBuf = new byte[ STRMBUFLEN ];
 
@@ -105,16 +98,16 @@ public class COSParser extends BaseParse
     /**
      * EOF-marker.
      */
-    protected static final char[] EOF_MARKER = { '%', '%', 'E', 'O', 'F' };
+    private static final char[] EOF_MARKER = { '%', '%', 'E', 'O', 'F' };
     /**
      * obj-marker.
      */
-    protected static final char[] OBJ_MARKER = { 'o', 'b', 'j' };
+    private static final char[] OBJ_MARKER = { 'o', 'b', 'j' };
 
     /**
      * file length.
      */
-    protected long fileLen;
+    private final long fileLen;
 
     /**
      * is parser using auto healing capacity ?
@@ -127,6 +120,7 @@ public class COSParser extends BaseParse
     
     private BruteForceParser bruteForceParser = null;
     private PDEncryption encryption = null;
+    private final Map<COSObjectKey, Long> xrefTable = new HashMap<>();
     
     /**
      * Intermediate cache. Contains all objects of already read compressed 
object streams. Objects are removed after
@@ -137,7 +131,7 @@ public class COSParser extends BaseParse
     /**
      * The security handler.
      */
-    protected SecurityHandler<? extends ProtectionPolicy> securityHandler = 
null;
+    private SecurityHandler<? extends ProtectionPolicy> securityHandler = null;
 
     /**
      *  how many trailing bytes to read for EOF marker.
@@ -146,12 +140,6 @@ public class COSParser extends BaseParse
 
     private static final Logger LOG = LogManager.getLogger(COSParser.class);
 
-    /** 
-     * Collects all Xref/trailer objects and resolves them into single
-     * object using startxref reference. 
-     */
-    protected XrefTrailerResolver xrefTrailerResolver = new 
XrefTrailerResolver();
-
     /**
      * Default constructor.
      *
@@ -261,7 +249,9 @@ public class COSParser extends BaseParse
             long startXRefOffset = getStartxrefOffset();
             if (startXRefOffset > -1)
             {
-                trailer = parseXref(startXRefOffset);
+                XrefParser xrefParser = new XrefParser(document, this);
+                trailer = xrefParser.parseXref(startXRefOffset);
+                xrefTable.putAll(xrefParser.getXrefTable());
             }
             else
             {
@@ -286,15 +276,10 @@ public class COSParser extends BaseParse
         }
         if (rebuildTrailer)
         {
-            trailer = 
getBruteForceParser().rebuildTrailer(xrefTrailerResolver, null);
+            // reset cross reference table
+            xrefTable.clear();
+            trailer = getBruteForceParser().rebuildTrailer(xrefTable);
             trailerWasRebuild = true;
-            // transfer encryption information from BruteForceParser
-            encryption = getBruteForceParser().getEncryption();
-            if (encryption != null)
-            {
-                securityHandler = encryption.getSecurityHandler();
-                accessPermission = 
securityHandler.getCurrentAccessPermission();
-            }
         }
         else
         {
@@ -303,194 +288,13 @@ public class COSParser extends BaseParse
             // don't use the getter as it creates an instance of 
BruteForceParser
             if (bruteForceParser != null && 
bruteForceParser.bfSearchTriggered())
             {
-                
getBruteForceParser().bfSearchForObjStreams(xrefTrailerResolver, 
securityHandler);
+                getBruteForceParser().bfSearchForObjStreams(xrefTable);
             }
         }
-        if (resetTrailerResolver())
-        {
-            xrefTrailerResolver.reset();
-            xrefTrailerResolver = null;
-        }
         return trailer;
     }
 
     /**
-     * Indicates whether the xref trailer resolver should be reset or not. 
Should be overwritten if the xref trailer
-     * resolver is needed after the initial parsing.
-     * 
-     * @return true if the xref trailer resolver should be reset
-     */
-    protected boolean resetTrailerResolver()
-    {
-        return true;
-    }
-
-    /**
-     * Parses cross reference tables.
-     * 
-     * @param startXRefOffset start offset of the first table
-     * @return the trailer dictionary
-     * @throws IOException if something went wrong
-     */
-    private COSDictionary parseXref(long startXRefOffset) throws IOException
-    {
-        source.seek(startXRefOffset);
-        long startXrefOffset = Math.max(0, parseStartXref());
-        // check the startxref offset
-        long fixedOffset = checkXRefOffset(startXrefOffset);
-        if (fixedOffset > -1)
-        {
-            startXrefOffset = fixedOffset;
-        }
-        document.setStartXref(startXrefOffset);
-        long prev = startXrefOffset;
-        // ---- parse whole chain of xref tables/object streams using PREV 
reference
-        Set<Long> prevSet = new HashSet<>();
-        COSDictionary trailer = null;
-        while (prev > 0)
-        {
-            // save expected position for loop detection
-            prevSet.add(prev);
-            // seek to xref table
-            source.seek(prev);
-            // skip white spaces
-            skipSpaces();
-            // save current position as well due to skipped spaces
-            prevSet.add(source.getPosition());
-            // -- parse xref
-            if (source.peek() == X)
-            {
-                // xref table and trailer
-                // use existing parser to parse xref table
-                if (!parseXrefTable(prev) || !parseTrailer())
-                {
-                    throw new IOException("Expected trailer object at offset "
-                            + source.getPosition());
-                }
-                trailer = xrefTrailerResolver.getCurrentTrailer();
-                // check for a XRef stream, it may contain some object ids of 
compressed objects 
-                if(trailer.containsKey(COSName.XREF_STM))
-                {
-                    int streamOffset = trailer.getInt(COSName.XREF_STM);
-                    // check the xref stream reference
-                    fixedOffset = checkXRefOffset(streamOffset);
-                    if (fixedOffset > -1 && fixedOffset != streamOffset)
-                    {
-                        LOG.warn("/XRefStm offset {} is incorrect, corrected 
to {}", streamOffset,
-                                fixedOffset);
-                        streamOffset = (int)fixedOffset;
-                        trailer.setInt(COSName.XREF_STM, streamOffset);
-                    }
-                    if (streamOffset > 0)
-                    {
-                        source.seek(streamOffset);
-                        skipSpaces();
-                        try
-                        {
-                            parseXrefObjStream(prev, false);
-                            document.setHasHybridXRef();
-                        }
-                        catch (IOException ex)
-                        {
-                            if (isLenient)
-                            {
-                                LOG.error("Failed to parse /XRefStm at offset 
{}", streamOffset,
-                                        ex);
-                            }
-                            else
-                            {
-                                throw ex;
-                            }
-                        }
-                    }
-                    else
-                    {
-                        if(isLenient)
-                        {
-                            LOG.error("Skipped XRef stream due to a corrupt 
offset:{}",
-                                    streamOffset);
-                        }
-                        else
-                        {
-                            throw new IOException("Skipped XRef stream due to 
a corrupt offset:"+streamOffset);
-                        }
-                    }
-                }
-                prev = trailer.getLong(COSName.PREV);
-            }
-            else
-            {
-                // parse xref stream
-                prev = parseXrefObjStream(prev, true);
-                trailer = xrefTrailerResolver.getCurrentTrailer();
-            }
-            if (prev > 0)
-            {
-                // check the xref table reference
-                fixedOffset = checkXRefOffset(prev);
-                if (fixedOffset > -1 && fixedOffset != prev)
-                {
-                    prev = fixedOffset;
-                    trailer.setLong(COSName.PREV, prev);
-                }
-            }
-            if (prevSet.contains(prev))
-            {
-                throw new IOException("/Prev loop at offset " + prev);
-            }
-        }
-        // ---- build valid xrefs out of the xref chain
-        xrefTrailerResolver.setStartxref(startXrefOffset);
-        trailer = xrefTrailerResolver.getTrailer();
-        document.setTrailer(trailer);
-        document.setIsXRefStream(XRefType.STREAM == 
xrefTrailerResolver.getXrefType());
-        // check the offsets of all referenced objects
-        if (isLenient)
-        {
-            checkXrefOffsets();
-        }
-        // copy xref table
-        document.addXRefTable(xrefTrailerResolver.getXrefTable());
-
-        // remember the highest XRef object number to avoid it being reused in 
incremental saving
-        Optional<Long> maxValue = document.getXrefTable().keySet().stream() //
-                .map(COSObjectKey::getNumber) //
-                .reduce(Long::max);
-        document.setHighestXRefObjectNumber(maxValue.isPresent() ? 
maxValue.get() : 0);
-
-        return trailer;
-    }
-
-    /**
-     * Parses an xref object stream starting with indirect object id.
-     * 
-     * @return value of PREV item in dictionary or <code>-1</code> if no such 
item exists
-     */
-    private long parseXrefObjStream(long objByteOffset, boolean isStandalone) 
throws IOException
-    {
-        // ---- parse indirect object head
-        readObjectNumber();
-        readGenerationNumber();
-        readExpectedString(OBJ_MARKER, true);
-
-        COSDictionary dict = parseCOSDictionary(false);
-        try (COSStream xrefStream = parseCOSStream(dict))
-        {
-            // the cross reference stream of a hybrid xref table will be added 
to the existing one
-            // and we must not override the offset and the trailer
-            if ( isStandalone )
-            {
-                xrefTrailerResolver.nextXrefObj( objByteOffset, 
XRefType.STREAM );
-                xrefTrailerResolver.setTrailer(xrefStream);
-            }
-            PDFXrefStreamParser parser = new PDFXrefStreamParser(xrefStream, 
document);
-            parser.parse(xrefTrailerResolver);
-        }
-
-        return dict.getLong(COSName.PREV);
-    }
-    
-    /**
      * Looks for and parses startxref. We first look for last '%%EOF' marker 
(within last
      * {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via {@link 
#setEOFLookupRange(int)}) and go back to find
      * <code>startxref</code>.
@@ -564,7 +368,7 @@ public class COSParser extends BaseParse
      * 
      * @return start offset of pattern within buffer or <code>-1</code> if 
pattern could not be found
      */
-    protected int lastIndexOf(final char[] pattern, final byte[] buf, final 
int endOff)
+    private int lastIndexOf(final char[] pattern, final byte[] buf, final int 
endOff)
     {
         final int lastPatternChOff = pattern.length - 1;
 
@@ -658,7 +462,7 @@ public class COSParser extends BaseParse
      * 
      * @throws IOException If an IO error occurs.
      */
-    protected synchronized COSBase parseObjectDynamically(COSObjectKey objKey,
+    private synchronized COSBase parseObjectDynamically(COSObjectKey objKey,
             boolean requireExistingNotCompressedObj) throws IOException
     {
         COSObject pdfObject = document.getObjectFromPool(objKey);
@@ -727,7 +531,7 @@ public class COSParser extends BaseParse
         // an indirect object starts with the object number/generation number
         final long readObjNr = readObjectNumber();
         final int readObjGen = readGenerationNumber();
-        readExpectedString(OBJ_MARKER, true);
+        readObjectMarker();
 
         // consistency check
         if (readObjNr != objKey.getNumber() || readObjGen != 
objKey.getGeneration())
@@ -876,7 +680,7 @@ public class COSParser extends BaseParse
             }
             if (COSNull.NULL == length)
             {
-                LOG.warn("Length object ({} {}) not found", 
lengthObj.getKey());
+                LOG.warn("Length object ({}) not found", lengthObj.getKey());
                 return null;
             }
             if (length instanceof COSNumber)
@@ -961,7 +765,7 @@ public class COSParser extends BaseParse
         {
             throw new IOException(
                     "Error reading stream, expected='endstream' actual='"
-                    + endStream + "' at offset " + source.getPosition());
+                            + endStream + "' at offset " + 
source.getPosition());
         }
         return document.createCOSStream(dic, streamStartPosition, 
streamLength);
     }
@@ -1102,287 +906,11 @@ public class COSParser extends BaseParse
         return streamLengthIsValid;
     }
 
-    /**
-     * Check if the cross reference table/stream can be found at the current 
offset.
-     * 
-     * @param startXRefOffset
-     * @return the revised offset
-     * @throws IOException
-     */
-    private long checkXRefOffset(long startXRefOffset) throws IOException
-    {
-        // repair mode isn't available in non-lenient mode
-        if (!isLenient)
-        {
-            return startXRefOffset;
-        }
-        source.seek(startXRefOffset);
-        skipSpaces();
-        if (isString(XREF_TABLE))
-        {
-            return startXRefOffset;
-        }
-        if (startXRefOffset > 0)
-        {
-            if (checkXRefStreamOffset(startXRefOffset))
-            {
-                return startXRefOffset;
-            }
-            else
-            {
-                return calculateXRefFixedOffset(startXRefOffset);
-            }
-        }
-        // can't find a valid offset
-        return -1;
-    }
-
-    /**
-     * Check if the cross reference stream can be found at the current offset.
-     * 
-     * @param startXRefOffset the expected start offset of the XRef stream
-     * @return the revised offset
-     * @throws IOException if something went wrong
-     */
-    private boolean checkXRefStreamOffset(long startXRefOffset) throws 
IOException
-    {
-        // repair mode isn't available in non-lenient mode
-        if (!isLenient || startXRefOffset == 0)
-        {
-            return true;
-        }
-        // seek to offset-1 
-        source.seek(startXRefOffset-1);
-        int nextValue = source.read();
-        // the first character has to be a whitespace, and then a digit
-        if (isWhitespace(nextValue))
-        {
-            skipSpaces();
-            if (isDigit())
-            {
-                try
-                {
-                    // it's a XRef stream
-                    readObjectNumber();
-                    readGenerationNumber();
-                    readExpectedString(OBJ_MARKER, true);
-                    // check the dictionary to avoid false positives
-                    COSDictionary dict = parseCOSDictionary(false);
-                    source.seek(startXRefOffset);
-                    if ("XRef".equals(dict.getNameAsString(COSName.TYPE)))
-                    {
-                        return true;
-                    }
-                }
-                catch (IOException exception)
-                {
-                    // there wasn't an object of a xref stream
-                    LOG.debug("No Xref stream at given location {}", 
startXRefOffset, exception);
-                    source.seek(startXRefOffset);
-                }
-            }
-        }
-        return false;
-    }
-    
-    /**
-     * Try to find a fixed offset for the given xref table/stream.
-     * 
-     * @param objectOffset the given offset where to look at
-     * @return the fixed offset
-     * 
-     * @throws IOException if something went wrong
-     */
-    private long calculateXRefFixedOffset(long objectOffset) throws IOException
-    {
-        if (objectOffset < 0)
-        {
-            LOG.error("Invalid object offset {} when searching for a xref 
table/stream",
-                    objectOffset);
-            return 0;
-        }
-        // search for the offset of the given xref table/stream among those 
found by a brute force search.
-        long newOffset = getBruteForceParser().bfSearchForXRef(objectOffset);
-        if (newOffset > -1)
-        {
-            LOG.debug("Fixed reference for xref table/stream {} -> {}", 
objectOffset, newOffset);
-            return newOffset;
-        }
-        LOG.error("Can't find the object xref table/stream at offset {}", 
objectOffset);
-        return 0;
-    }
-
-    private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) 
throws IOException
-    {
-        if (xrefOffset == null)
-        {
-            return true;
-        }
-        Map<COSObjectKey, COSObjectKey> correctedKeys = new HashMap<>();
-        HashSet<COSObjectKey> validKeys = new HashSet<>();
-        for (Entry<COSObjectKey, Long> objectEntry : xrefOffset.entrySet())
-        {
-            COSObjectKey objectKey = objectEntry.getKey();
-            Long objectOffset = objectEntry.getValue();
-            // a negative offset number represents an object number itself
-            // see type 2 entry in xref stream
-            if (objectOffset != null && objectOffset >= 0)
-            {
-                COSObjectKey foundObjectKey = findObjectKey(objectKey, 
objectOffset, xrefOffset);
-                if (foundObjectKey == null)
-                {
-                    LOG.debug(
-                            "Stop checking xref offsets as at least one ({}) 
couldn't be dereferenced",
-                            objectKey);
-                    return false;
-                }
-                else if (foundObjectKey != objectKey)
-                {
-                    // Generation was fixed - need to update map later, after 
iteration
-                    correctedKeys.put(objectKey, foundObjectKey);
-                }
-                else
-                {
-                    validKeys.add(objectKey);
-                }
-            }
-        }
-        Map<COSObjectKey, Long> correctedPointers = new HashMap<>();
-        for (Entry<COSObjectKey, COSObjectKey> correctedKeyEntry : 
correctedKeys.entrySet())
-        {
-            if (!validKeys.contains(correctedKeyEntry.getValue()))
-            {
-                // Only replace entries, if the original entry does not point 
to a valid object
-                correctedPointers.put(correctedKeyEntry.getValue(),
-                        xrefOffset.get(correctedKeyEntry.getKey()));
-            }
-        }
-        // remove old invalid, as some might not be replaced
-        correctedKeys.forEach((key, value) -> xrefOffset.remove(key));
-        xrefOffset.putAll(correctedPointers);
-        return true;
-    }
-
-    /**
-     * Check the XRef table by dereferencing all objects and fixing the offset 
if necessary.
-     * 
-     * @throws IOException if something went wrong.
-     */
-    private void checkXrefOffsets() throws IOException
-    {
-        Map<COSObjectKey, Long> xrefOffset = 
xrefTrailerResolver.getXrefTable();
-        if (!validateXrefOffsets(xrefOffset))
-        {
-            Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = 
getBruteForceParser()
-                    .getBFCOSObjectOffsets();
-            if (!bfCOSObjectKeyOffsets.isEmpty())
-            {
-                LOG.debug("Replaced read xref table with the results of a 
brute force search");
-                xrefOffset.clear();
-                xrefOffset.putAll(bfCOSObjectKeyOffsets);
-            }
-        }
-    }
-
-    /**
-     * Check if the given object can be found at the given offset. Returns the 
provided object key if everything is ok.
-     * If the generation number differs it will be fixed and a new object key 
is returned.
-     * 
-     * @param objectKey the key of object we are looking for
-     * @param offset the offset where to look
-     * @param xrefOffset a map with with all known xref entries
-     * @return returns the found/fixed object key
-     * 
-     * @throws IOException if something went wrong
-     */
-    private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset,
-            Map<COSObjectKey, Long> xrefOffset) throws IOException
-    {
-        // there can't be any object at the very beginning of a pdf
-        if (offset < MINIMUM_SEARCH_OFFSET)
-        {
-            return null;
-        }
-        try 
-        {
-            source.seek(offset);
-            skipWhiteSpaces();
-            if (source.getPosition() == offset)
-            {
-                // ensure that at least one whitespace is skipped in front of 
the object number
-                source.seek(offset - 1);
-                if (source.getPosition() < offset)
-                   {
-                       if (!isDigit())
-                       {
-                           // anything else but a digit may be some garbage of 
the previous object -> just ignore it
-                           source.read();
-                       }
-                       else
-                       {
-                           long current = source.getPosition();
-                           source.seek(--current);
-                           while (isDigit())
-                               source.seek(--current);
-                           long newObjNr = readObjectNumber();
-                           int newGenNr = readGenerationNumber();
-                           COSObjectKey newObjKey = new COSObjectKey(newObjNr, 
newGenNr);
-                           Long existingOffset = xrefOffset.get(newObjKey);
-                           // the found object number belongs to another 
uncompressed object at the same or nearby offset
-                           // something has to be wrong
-                           if (existingOffset != null && existingOffset > 0
-                                   && Math.abs(offset - existingOffset) < 10)
-                           {
-                            LOG.debug("Found the object {} instead of {} at 
offset {} - ignoring",
-                                    newObjKey, objectKey, offset);
-                               return null;
-                           }
-                           // something seems to be wrong but it's hard to 
determine what exactly -> simply continue
-                           source.seek(offset);
-                       }
-                   }
-            }
-            // try to read the given object/generation number
-            long foundObjectNumber = readObjectNumber();
-            if (objectKey.getNumber() != foundObjectNumber)
-            {
-                LOG.warn("found wrong object number. expected [{}] found [{}]",
-                        objectKey.getNumber(), foundObjectNumber);
-                if (!isLenient)
-                {
-                    return null;
-                }
-                else
-                {
-                    objectKey = new COSObjectKey(foundObjectNumber, 
objectKey.getGeneration());
-                }
-            }
-
-            int genNumber = readGenerationNumber();
-            // finally try to read the object marker
-            readExpectedString(OBJ_MARKER, true);
-            if (genNumber == objectKey.getGeneration())
-            {
-                return objectKey;
-            }
-            else if (isLenient && genNumber > objectKey.getGeneration())
-            {
-                return new COSObjectKey(objectKey.getNumber(), genNumber);
-            }
-        }
-        catch (IOException exception)
-        {
-            // Swallow the exception, obviously there isn't any valid object 
number
-            LOG.debug("No valid object at given location {} - ignoring", 
offset, exception);
-        }
-        return null;
-    }
-
-    private BruteForceParser getBruteForceParser() throws IOException
+    protected BruteForceParser getBruteForceParser() throws IOException
     {
        if (bruteForceParser == null)
        {
-               bruteForceParser = new BruteForceParser(source, document);
+            bruteForceParser = new BruteForceParser(document, this);
         }
        return bruteForceParser;
     }
@@ -1457,25 +985,6 @@ public class COSParser extends BaseParse
     }
 
     /**
-     * This will parse the startxref section from the stream. The startxref 
value is ignored.
-     *
-     * @return the startxref value or -1 on parsing error
-     * @throws IOException If an IO error occurs.
-     */
-    private long parseStartXref() throws IOException
-    {
-        long startXref = -1;
-        if (isString(STARTXREF))
-        {
-            readString();
-            skipSpaces();
-            // This integer is the byte offset of the first object referenced 
by the xref or xref stream
-            startXref = readLong();
-        }
-        return startXref;
-    }
-    
-    /**
      * Checks if the given string can be found at the current offset.
      * 
      * @param string the bytes of the string to look for
@@ -1521,67 +1030,9 @@ public class COSParser extends BaseParse
         return bytesMatching;
     }
 
-    /**
-     * This will parse the trailer from the stream and add it to the state.
-     *
-     * @return false on parsing error
-     * @throws IOException If an IO error occurs.
-     */
-    private boolean parseTrailer() throws IOException
+    protected void readObjectMarker() throws IOException
     {
-        // parse the last trailer.
-        long trailerOffset = source.getPosition();
-        // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
-        if (isLenient)
-        {
-            int nextCharacter = source.peek();
-            while (nextCharacter != 't' && isDigit(nextCharacter))
-            {
-                if (source.getPosition() == trailerOffset)
-                {
-                    // warn only the first time
-                    LOG.warn("Expected trailer object at offset {}, keep 
trying", trailerOffset);
-                }
-                readLine();
-                nextCharacter = source.peek();
-            }
-        }
-        if(source.peek() != 't')
-        {
-            return false;
-        }
-        //read "trailer"
-        long currentOffset = source.getPosition();
-        String nextLine = readLine();
-        if( !nextLine.trim().equals( "trailer" ) )
-        {
-            // in some cases the EOL is missing and the trailer immediately
-            // continues with "<<" or with a blank character
-            // even if this does not comply with PDF reference we want to 
support as many PDFs as possible
-            // Acrobat reader can also deal with this.
-            if (nextLine.startsWith("trailer"))
-            {
-                // we can't just unread a portion of the read data as we don't 
know if the EOL consist of 1 or 2 bytes
-                int len = "trailer".length();
-                // jump back right after "trailer"
-                source.seek(currentOffset + len);
-            }
-            else
-            {
-                return false;
-            }
-        }
-    
-        // in some cases the EOL is missing and the trailer continues with " 
<<"
-        // even if this does not comply with PDF reference we want to support 
as many PDFs as possible
-        // Acrobat reader can also deal with this.
-        skipSpaces();
-    
-        COSDictionary parsedTrailer = parseCOSDictionary(true);
-        xrefTrailerResolver.setTrailer( parsedTrailer );
-    
-        skipSpaces();
-        return true;
+        readExpectedString(OBJ_MARKER, true);
     }
 
     /**
@@ -1690,128 +1141,6 @@ public class COSParser extends BaseParse
     }
 
     /**
-     * This will parse the xref table from the stream and add it to the state
-     * The XrefTable contents are ignored.
-     * @param startByteOffset the offset to start at
-     * @return false on parsing error
-     * @throws IOException If an IO error occurs.
-     */
-    protected boolean parseXrefTable(long startByteOffset) throws IOException
-    {
-        if(source.peek() != 'x')
-        {
-            return false;
-        }
-        String xref = readString();
-        if( !xref.trim().equals( "xref" ) )
-        {
-            return false;
-        }
-        
-        // check for trailer after xref
-        String str = readString();
-        byte[] b = str.getBytes(StandardCharsets.ISO_8859_1);
-        source.rewind(b.length);
-        
-        // signal start of new XRef
-        xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE );
-    
-        if (str.startsWith("trailer"))
-        {
-            LOG.warn("skipping empty xref table");
-            return false;
-        }
-        
-        // Xref tables can have multiple sections. Each starts with a starting 
object id and a count.
-        while(true)
-        {
-            String currentLine = readLine();
-            String[] splitString = StringUtil.splitOnSpace(currentLine);
-            if (splitString.length != 2)
-            {
-                LOG.warn("Unexpected XRefTable Entry: {}", currentLine);
-                return false;
-            }
-            // first obj id
-            long currObjID;
-            try
-            {
-                currObjID = Long.parseLong(splitString[0]);
-            }
-            catch (NumberFormatException exception)
-            {
-                LOG.warn("XRefTable: invalid ID for the first object: {}", 
currentLine);
-                return false;
-            }
-
-            // the number of objects in the xref table
-            int count = 0;
-            try
-            {
-                count = Integer.parseInt(splitString[1]);
-            }
-            catch (NumberFormatException exception)
-            {
-                LOG.warn("XRefTable: invalid number of objects: {}", 
currentLine);
-                return false;
-            }
-            
-            skipSpaces();
-            for(int i = 0; i < count; i++)
-            {
-                if (source.isEOF() || isEndOfName(source.peek()))
-                {
-                    break;
-                }
-                if(source.peek() == 't')
-                {
-                    break;
-                }
-                //Ignore table contents
-                currentLine = readLine();
-                splitString = StringUtil.splitOnSpace(currentLine);
-                if (splitString.length < 3)
-                {
-                    LOG.warn("invalid xref line: {}", currentLine);
-                    break;
-                }
-                /* This supports the corrupt table as reported in
-                 * PDFBOX-474 (XXXX XXX XX n) */
-                if(splitString[splitString.length-1].equals("n"))
-                {
-                    try
-                    {
-                        long currOffset = Long.parseLong(splitString[0]);
-                        // skip 0 offsets
-                        if (currOffset > 0)
-                        {
-                            int currGenID = Integer.parseInt(splitString[1]);
-                            COSObjectKey objKey = new COSObjectKey(currObjID, 
currGenID);
-                            xrefTrailerResolver.setXRef(objKey, currOffset);
-                        }
-                    }
-                    catch (IllegalArgumentException e)
-                    {
-                        throw new IOException(e);
-                    }
-                }
-                else if(!splitString[2].equals("f"))
-                {
-                    throw new IOException("Corrupt XRefTable Entry - ObjID:" + 
currObjID);
-                }
-                currObjID++;
-                skipSpaces();
-            }
-            skipSpaces();
-            if (!isDigit())
-            {
-                break;
-            }
-        }
-        return true;
-    }
-
-    /**
      * This will get the encryption dictionary. The document must be parsed 
before this is called.
      *
      * @return The encryption dictionary of the document that was parsed.
@@ -1901,4 +1230,15 @@ public class COSParser extends BaseParse
         }
     }
 
+    /**
+     * This will get the security handler. The document must be parsed before 
this is called.
+     *
+     * @return The security handler of the document that was parsed, or
+     *         <code>null</code> if none has been assigned (the field is
+     *         initialised to <code>null</code>).
+     */
+    protected SecurityHandler<? extends ProtectionPolicy> getSecurityHandler()
+    {
+        return securityHandler;
+    }
 }

Added: 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java
URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java?rev=1916856&view=auto
==============================================================================
--- 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java 
(added)
+++ 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java 
Mon Apr  8 05:54:20 2024
@@ -0,0 +1,694 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdfparser;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObjectKey;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
+
+/**
+ * Parser to be used to read the cross reference information of a pdf. The cross reference
+ * information is either a simple table or a stream.
+ */
+public class XrefParser
+{
+    private static final Logger LOG = LogManager.getLogger(XrefParser.class);
+
+    /** First character of the keyword "xref". */
+    private static final int X = 'x';
+    /** The keyword "xref". */
+    private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' };
+    /** The keyword "startxref". */
+    private static final char[] STARTXREF = { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' };
+
+    /**
+     * Collects all Xref/trailer objects and resolves them into single
+     * object using startxref reference.
+     */
+    private final XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver();
+
+    /** The parser used for all low level read operations. */
+    private final COSParser parser;
+    /** The document receiving the parsed cross reference information. */
+    private final COSDocument document;
+    /** The source of the given parser. */
+    private final RandomAccessRead source;
+
+    /**
+     * Creates a parser for the cross reference information of the given document.
+     *
+     * @param cosDocument the corresponding COS document of the pdf.
+     * @param cosParser the parser to be used to read the pdf, its source is used as well.
+     */
+    public XrefParser(COSDocument cosDocument, COSParser cosParser)
+    {
+        this.document = cosDocument;
+        this.parser = cosParser;
+        this.source = cosParser.source;
+    }
+
+    /**
+     * Returns the resulting cross reference table.
+     * 
+     * @return the cross reference table as provided by the xref trailer resolver
+     */
+    public Map<COSObjectKey, Long> getXrefTable()
+    {
+        return xrefTrailerResolver.getXrefTable();
+    }
+
+    /**
+     * Parses the whole chain of cross reference tables/streams, starting with the one referenced
+     * by the startxref entry, and transfers the result to the document.
+     * 
+     * @param startXRefOffset the offset where the startxref keyword is expected
+     * @return the trailer dictionary
+     * @throws IOException if something went wrong, e.g. a missing trailer or a loop within the
+     * chain of /Prev references
+     */
+    public COSDictionary parseXref(long startXRefOffset) throws IOException
+    {
+        source.seek(startXRefOffset);
+        long xrefOffset = Math.max(0, parseStartXref());
+        // check the startxref offset
+        long fixedOffset = checkXRefOffset(xrefOffset);
+        if (fixedOffset > -1)
+        {
+            xrefOffset = fixedOffset;
+        }
+        document.setStartXref(xrefOffset);
+        long prev = xrefOffset;
+        // ---- parse whole chain of xref tables/object streams using PREV reference
+        Set<Long> prevSet = new HashSet<>();
+        COSDictionary trailer = null;
+        while (prev > 0)
+        {
+            // save expected position for loop detection
+            prevSet.add(prev);
+            // seek to xref table
+            source.seek(prev);
+            // skip white spaces
+            parser.skipSpaces();
+            // save current position as well due to skipped spaces
+            prevSet.add(source.getPosition());
+            // -- parse xref
+            if (source.peek() == X)
+            {
+                // xref table and trailer
+                if (!parseXrefTable(prev) || !parseTrailer())
+                {
+                    throw new IOException("Expected trailer object at offset "
+                            + source.getPosition());
+                }
+                trailer = xrefTrailerResolver.getCurrentTrailer();
+                // check for a XRef stream, it may contain some object ids of compressed objects
+                if (trailer.containsKey(COSName.XREF_STM))
+                {
+                    parseHybridXrefStream(trailer, prev);
+                }
+                prev = trailer.getLong(COSName.PREV);
+            }
+            else
+            {
+                // parse xref stream
+                prev = parseXrefObjStream(prev, true);
+                trailer = xrefTrailerResolver.getCurrentTrailer();
+            }
+            if (prev > 0)
+            {
+                // check the xref table reference
+                fixedOffset = checkXRefOffset(prev);
+                if (fixedOffset > -1 && fixedOffset != prev)
+                {
+                    prev = fixedOffset;
+                    trailer.setLong(COSName.PREV, prev);
+                }
+            }
+            if (prevSet.contains(prev))
+            {
+                throw new IOException("/Prev loop at offset " + prev);
+            }
+        }
+        // ---- build valid xrefs out of the xref chain
+        xrefTrailerResolver.setStartxref(xrefOffset);
+        trailer = xrefTrailerResolver.getTrailer();
+        document.setTrailer(trailer);
+        document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType());
+        // check the offsets of all referenced objects
+        checkXrefOffsets();
+        // copy xref table
+        document.addXRefTable(xrefTrailerResolver.getXrefTable());
+
+        // remember the highest XRef object number to avoid it being reused in incremental saving
+        Optional<Long> maxValue = document.getXrefTable().keySet().stream() //
+                .map(COSObjectKey::getNumber) //
+                .reduce(Long::max);
+        document.setHighestXRefObjectNumber(maxValue.orElse(0L));
+
+        return trailer;
+    }
+
+    /**
+     * Parses the cross reference stream referenced by the /XRefStm entry of the given trailer and
+     * adds its content to the current xref table. A failure to parse the stream is logged only.
+     * 
+     * @param trailer the trailer dictionary containing the /XRefStm entry
+     * @param tableOffset the offset of the xref table the trailer belongs to
+     * @throws IOException if something went wrong when checking the offset of the stream
+     */
+    private void parseHybridXrefStream(COSDictionary trailer, long tableOffset) throws IOException
+    {
+        // the offset is handled as long, an int can't hold offsets beyond 2GB
+        long streamOffset = trailer.getLong(COSName.XREF_STM);
+        // check the xref stream reference
+        long fixedOffset = checkXRefOffset(streamOffset);
+        if (fixedOffset > -1 && fixedOffset != streamOffset)
+        {
+            LOG.warn("/XRefStm offset {} is incorrect, corrected to {}", streamOffset,
+                    fixedOffset);
+            streamOffset = fixedOffset;
+            trailer.setLong(COSName.XREF_STM, streamOffset);
+        }
+        if (streamOffset > 0)
+        {
+            source.seek(streamOffset);
+            parser.skipSpaces();
+            try
+            {
+                parseXrefObjStream(tableOffset, false);
+                document.setHasHybridXRef();
+            }
+            catch (IOException ex)
+            {
+                LOG.error("Failed to parse /XRefStm at offset {}", streamOffset, ex);
+            }
+        }
+        else
+        {
+            LOG.error("Skipped XRef stream due to a corrupt offset: {}", streamOffset);
+        }
+    }
+
+    /**
+     * This will parse the trailer from the stream and add it to the state.
+     *
+     * @return false on parsing error
+     * @throws IOException If an IO error occurs.
+     */
+    private boolean parseTrailer() throws IOException
+    {
+        // parse the last trailer.
+        long trailerOffset = source.getPosition();
+        // PDFBOX-1739 skip extra xref entries in RegisSTAR documents
+        int nextCharacter = source.peek();
+        while (nextCharacter != 't' && COSParser.isDigit(nextCharacter))
+        {
+            if (source.getPosition() == trailerOffset)
+            {
+                // warn only the first time
+                LOG.warn("Expected trailer object at offset {}, keep trying", 
trailerOffset);
+            }
+            parser.readLine();
+            nextCharacter = source.peek();
+        }
+        if (source.peek() != 't')
+        {
+            return false;
+        }
+        // read "trailer"
+        long currentOffset = source.getPosition();
+        String nextLine = parser.readLine();
+        if (!nextLine.trim().equals("trailer"))
+        {
+            // in some cases the EOL is missing and the trailer immediately
+            // continues with "<<" or with a blank character
+            // even if this does not comply with PDF reference we want to 
support as many PDFs as possible
+            // Acrobat reader can also deal with this.
+            if (nextLine.startsWith("trailer"))
+            {
+                // we can't just unread a portion of the read data as we don't 
know if the EOL consist of 1 or 2 bytes
+                int len = "trailer".length();
+                // jump back right after "trailer"
+                source.seek(currentOffset + len);
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        // in some cases the EOL is missing and the trailer continues with " 
<<"
+        // even if this does not comply with PDF reference we want to support 
as many PDFs as possible
+        // Acrobat reader can also deal with this.
+        parser.skipSpaces();
+
+        COSDictionary parsedTrailer = parser.parseCOSDictionary(true);
+        xrefTrailerResolver.setTrailer(parsedTrailer);
+
+        parser.skipSpaces();
+        return true;
+    }
+
+    /**
+     * Parses a cross reference stream. The current position has to be the start of the indirect
+     * object, i.e. its object number.
+     * 
+     * @param objByteOffset the offset to be registered for a standalone cross reference stream
+     * @param isStandalone false if the stream is part of a hybrid xref table
+     * @return value of PREV item in dictionary or <code>-1</code> if no such item exists
+     * @throws IOException If an IO error occurs.
+     */
+    private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException
+    {
+        // ---- consume object number, generation number and object marker
+        parser.readObjectNumber();
+        parser.readGenerationNumber();
+        parser.readObjectMarker();
+
+        COSDictionary streamDictionary = parser.parseCOSDictionary(false);
+        try (COSStream xrefStream = parser.parseCOSStream(streamDictionary))
+        {
+            // the cross reference stream of a hybrid xref table will be added to the existing
+            // one and we must not override the offset and the trailer
+            if (isStandalone)
+            {
+                xrefTrailerResolver.nextXrefObj(objByteOffset, XRefType.STREAM);
+                xrefTrailerResolver.setTrailer(xrefStream);
+            }
+            PDFXrefStreamParser streamParser = new PDFXrefStreamParser(xrefStream, document);
+            streamParser.parse(xrefTrailerResolver);
+        }
+
+        return streamDictionary.getLong(COSName.PREV);
+    }
+    
+    /**
+     * Check if a cross reference table/stream can be found at the given offset.
+     * 
+     * @param startXRefOffset the expected offset of the cross reference table/stream
+     * @return the given offset if it points to a cross reference table/stream, the result of
+     * the search for a fixed offset if it doesn't, or -1 if the given offset isn't positive
+     * and doesn't point to a cross reference table
+     * @throws IOException if something went wrong
+     */
+    private long checkXRefOffset(long startXRefOffset) throws IOException
+    {
+        source.seek(startXRefOffset);
+        parser.skipSpaces();
+        if (parser.isString(XREF_TABLE))
+        {
+            return startXRefOffset;
+        }
+        if (startXRefOffset <= 0)
+        {
+            // can't find a valid offset
+            return -1;
+        }
+        return checkXRefStreamOffset(startXRefOffset) ? startXRefOffset
+                : calculateXRefFixedOffset(startXRefOffset);
+    }
+
+    /**
+     * Try to find a fixed offset for the given xref table/stream using the brute force parser.
+     * 
+     * @param objectOffset the given offset where to look at
+     * @return the fixed offset or 0 if the given offset is negative or nothing was found
+     * 
+     * @throws IOException if something went wrong
+     */
+    private long calculateXRefFixedOffset(long objectOffset) throws IOException
+    {
+        if (objectOffset < 0)
+        {
+            LOG.error("Invalid object offset {} when searching for a xref table/stream",
+                    objectOffset);
+            return 0;
+        }
+        // look for the given xref table/stream among those found by a brute force search
+        long bruteForceOffset = parser.getBruteForceParser().bfSearchForXRef(objectOffset);
+        if (bruteForceOffset <= -1)
+        {
+            LOG.error("Can't find the object xref table/stream at offset {}", objectOffset);
+            return 0;
+        }
+        LOG.debug("Fixed reference for xref table/stream {} -> {}", objectOffset,
+                bruteForceOffset);
+        return bruteForceOffset;
+    }
+
+    /**
+     * Check if a cross reference stream can be found at the given offset.
+     * 
+     * @param startXRefOffset the expected start offset of the XRef stream
+     * @return true if the given offset is 0 or if an object with a dictionary of the type XRef
+     * follows a whitespace right in front of the given offset
+     * @throws IOException if something went wrong
+     */
+    private boolean checkXRefStreamOffset(long startXRefOffset) throws IOException
+    {
+        if (startXRefOffset == 0)
+        {
+            return true;
+        }
+        // the character in front of the offset has to be a whitespace
+        source.seek(startXRefOffset - 1);
+        if (!COSParser.isWhitespace(source.read()))
+        {
+            return false;
+        }
+        // and it has to be followed by a digit
+        parser.skipSpaces();
+        if (!parser.isDigit())
+        {
+            return false;
+        }
+        try
+        {
+            // it may be a XRef stream
+            parser.readObjectNumber();
+            parser.readGenerationNumber();
+            parser.readObjectMarker();
+            // check the dictionary to avoid false positives
+            COSDictionary candidate = parser.parseCOSDictionary(false);
+            source.seek(startXRefOffset);
+            return "XRef".equals(candidate.getNameAsString(COSName.TYPE));
+        }
+        catch (IOException exception)
+        {
+            // there wasn't an object of a xref stream
+            LOG.debug("No Xref stream at given location {}", startXRefOffset, exception);
+            source.seek(startXRefOffset);
+            return false;
+        }
+    }
+    
+    /**
+     * Checks all entries of the given map with a non negative offset by looking up the object
+     * at that offset. Keys which had to be fixed are removed from the map and their offsets are
+     * added using the fixed key, unless the fixed key belongs to an entry which was found to be
+     * valid.
+     * 
+     * @param xrefOffset the xref entries to be checked, the map is modified in place
+     * @return false if at least one object couldn't be found, true otherwise
+     * @throws IOException if something went wrong
+     */
+    private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) throws IOException
+    {
+        if (xrefOffset == null)
+        {
+            return true;
+        }
+        Map<COSObjectKey, COSObjectKey> replacedKeys = new HashMap<>();
+        Set<COSObjectKey> confirmedKeys = new HashSet<>();
+        for (Entry<COSObjectKey, Long> entry : xrefOffset.entrySet())
+        {
+            Long offset = entry.getValue();
+            // a negative offset number represents an object number itself
+            // see type 2 entry in xref stream
+            if (offset == null || offset < 0)
+            {
+                continue;
+            }
+            COSObjectKey expectedKey = entry.getKey();
+            COSObjectKey actualKey = findObjectKey(expectedKey, offset, xrefOffset);
+            if (actualKey == null)
+            {
+                LOG.debug(
+                        "Stop checking xref offsets as at least one ({}) couldn't be dereferenced",
+                        expectedKey);
+                return false;
+            }
+            if (actualKey == expectedKey)
+            {
+                confirmedKeys.add(expectedKey);
+            }
+            else
+            {
+                // the key was fixed - need to update map later, after iteration
+                replacedKeys.put(expectedKey, actualKey);
+            }
+        }
+        Map<COSObjectKey, Long> relocatedOffsets = new HashMap<>();
+        for (Entry<COSObjectKey, COSObjectKey> replacement : replacedKeys.entrySet())
+        {
+            COSObjectKey fixedKey = replacement.getValue();
+            // Only replace entries, if the original entry does not point to a valid object
+            if (!confirmedKeys.contains(fixedKey))
+            {
+                relocatedOffsets.put(fixedKey, xrefOffset.get(replacement.getKey()));
+            }
+        }
+        // remove old invalid, as some might not be replaced
+        for (COSObjectKey staleKey : replacedKeys.keySet())
+        {
+            xrefOffset.remove(staleKey);
+        }
+        xrefOffset.putAll(relocatedOffsets);
+        return true;
+    }
+
+    /**
+     * Check the XRef table by dereferencing all objects. If at least one object can't be
+     * dereferenced, the table is replaced by the non empty result of a brute force search.
+     * 
+     * @throws IOException if something went wrong.
+     */
+    private void checkXrefOffsets() throws IOException
+    {
+        Map<COSObjectKey, Long> xrefTable = xrefTrailerResolver.getXrefTable();
+        if (validateXrefOffsets(xrefTable))
+        {
+            return;
+        }
+        Map<COSObjectKey, Long> bruteForceOffsets = parser.getBruteForceParser()
+                .getBFCOSObjectOffsets();
+        if (bruteForceOffsets.isEmpty())
+        {
+            return;
+        }
+        LOG.debug("Replaced read xref table with the results of a brute force search");
+        xrefTable.clear();
+        xrefTable.putAll(bruteForceOffsets);
+    }
+
+    /**
+     * Check if the given object can be found at the given offset. Returns the provided object
+     * key if everything is ok. If the object number or the generation number differs, a fixed
+     * object key is returned.
+     * 
+     * @param objectKey the key of object we are looking for
+     * @param offset the offset where to look
+     * @param xrefOffset a map with all known xref entries
+     * @return returns the found/fixed object key or null if there isn't any suitable object
+     * 
+     * @throws IOException if something went wrong
+     */
+    private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset,
+            Map<COSObjectKey, Long> xrefOffset) throws IOException
+    {
+        // there can't be any object at the very beginning of a pdf
+        if (offset < COSParser.MINIMUM_SEARCH_OFFSET)
+        {
+            return null;
+        }
+        try
+        {
+            source.seek(offset);
+            parser.skipWhiteSpaces();
+            if (source.getPosition() == offset)
+            {
+                // ensure that at least one whitespace is skipped in front of the object number
+                source.seek(offset - 1);
+                if (source.getPosition() < offset)
+                {
+                    if (parser.isDigit())
+                    {
+                        // go back until the first non digit is reached
+                        long position = source.getPosition();
+                        source.seek(--position);
+                        while (parser.isDigit())
+                        {
+                            source.seek(--position);
+                        }
+                        long precedingNumber = parser.readObjectNumber();
+                        int precedingGeneration = parser.readGenerationNumber();
+                        COSObjectKey precedingKey = new COSObjectKey(precedingNumber,
+                                precedingGeneration);
+                        Long knownOffset = xrefOffset.get(precedingKey);
+                        // the found object number belongs to another uncompressed object at
+                        // the same or nearby offset, something has to be wrong
+                        if (knownOffset != null && knownOffset > 0
+                                && Math.abs(offset - knownOffset) < 10)
+                        {
+                            LOG.debug("Found the object {} instead of {} at offset {} - ignoring",
+                                    precedingKey, objectKey, offset);
+                            return null;
+                        }
+                        // something seems to be wrong but it's hard to determine what exactly
+                        // -> simply continue
+                        source.seek(offset);
+                    }
+                    else
+                    {
+                        // anything else but a digit may be some garbage of the previous object
+                        // -> just ignore it
+                        source.read();
+                    }
+                }
+            }
+            // try to read the given object/generation number
+            COSObjectKey candidateKey = objectKey;
+            long foundObjectNumber = parser.readObjectNumber();
+            if (candidateKey.getNumber() != foundObjectNumber)
+            {
+                LOG.warn("found wrong object number. expected [{}] found [{}]",
+                        candidateKey.getNumber(), foundObjectNumber);
+                candidateKey = new COSObjectKey(foundObjectNumber, candidateKey.getGeneration());
+            }
+
+            int foundGeneration = parser.readGenerationNumber();
+            // finally try to read the object marker
+            parser.readObjectMarker();
+            if (foundGeneration == candidateKey.getGeneration())
+            {
+                return candidateKey;
+            }
+            if (foundGeneration > candidateKey.getGeneration())
+            {
+                return new COSObjectKey(candidateKey.getNumber(), foundGeneration);
+            }
+        }
+        catch (IOException exception)
+        {
+            // Swallow the exception, obviously there isn't any valid object number
+            LOG.debug("No valid object at given location {} - ignoring", offset, exception);
+        }
+        return null;
+    }
+
+    /**
+     * This will parse the startxref section at the current position.
+     *
+     * @return the startxref value or -1 if the startxref keyword wasn't found
+     * @throws IOException If an IO error occurs.
+     */
+    private long parseStartXref() throws IOException
+    {
+        if (!parser.isString(STARTXREF))
+        {
+            return -1;
+        }
+        parser.readString();
+        parser.skipSpaces();
+        // This integer is the byte offset of the first object referenced by the xref or xref
+        // stream
+        return parser.readLong();
+    }
+    
+    /**
+     * This will parse the xref table from the stream and add it to the state.
+     * Entries ending with "n" and having an offset greater than 0 are handed over to the
+     * xref trailer resolver, entries marked with "f" are skipped.
+     * 
+     * @param startByteOffset the offset of the xref table, it is handed over to the
+     * xref trailer resolver
+     * @return false on parsing error
+     * @throws IOException If an IO error occurs or if an entry is marked neither with "n"
+     * nor with "f" or contains an invalid number.
+     */
+    private boolean parseXrefTable(long startByteOffset) throws IOException
+    {
+        if (source.peek() != 'x')
+        {
+            return false;
+        }
+        String xref = parser.readString();
+        if( !xref.trim().equals( "xref" ) )
+        {
+            return false;
+        }
+        
+        // read the token following xref and jump back to its start afterwards
+        String str = parser.readString();
+        byte[] b = str.getBytes(StandardCharsets.ISO_8859_1);
+        source.seek(source.getPosition() - b.length);
+        
+        // signal start of new XRef
+        xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE );
+    
+        if (str.startsWith("trailer"))
+        {
+            LOG.warn("skipping empty xref table");
+            return false;
+        }
+        
+        // Xref tables can have multiple sections. Each starts with a starting 
object id and a count.
+        while(true)
+        {
+            String currentLine = parser.readLine();
+            String[] splitString = currentLine.split("\\s");
+            if (splitString.length != 2)
+            {
+                LOG.warn("Unexpected XRefTable Entry: {}", currentLine);
+                return false;
+            }
+            // first obj id
+            long currObjID;
+            try
+            {
+                currObjID = Long.parseLong(splitString[0]);
+            }
+            catch (NumberFormatException exception)
+            {
+                LOG.warn("XRefTable: invalid ID for the first object: {}", 
currentLine);
+                return false;
+            }
+
+            // the number of objects in the xref table
+            int count = 0;
+            try
+            {
+                count = Integer.parseInt(splitString[1]);
+            }
+            catch (NumberFormatException exception)
+            {
+                LOG.warn("XRefTable: invalid number of objects: {}", 
currentLine);
+                return false;
+            }
+            
+            parser.skipSpaces();
+            for(int i = 0; i < count; i++)
+            {
+                if (parser.isEOF() )
+                {
+                    break;
+                }
+                int nextChar = source.peek();
+                if (nextChar == 't' || COSParser.isEndOfName(nextChar))
+                {
+                    break;
+                }
+                // read the next entry of the current section
+                currentLine = parser.readLine();
+                splitString = currentLine.split("\\s");
+                if (splitString.length < 3)
+                {
+                    LOG.warn("invalid xref line: {}", currentLine);
+                    break;
+                }
+                /* This supports the corrupt table as reported in
+                 * PDFBOX-474 (XXXX XXX XX n) */
+                if(splitString[splitString.length-1].equals("n"))
+                {
+                    try
+                    {
+                        long currOffset = Long.parseLong(splitString[0]);
+                        // skip 0 offsets
+                        if (currOffset > 0)
+                        {
+                            int currGenID = Integer.parseInt(splitString[1]);
+                            COSObjectKey objKey = new COSObjectKey(currObjID, 
currGenID);
+                            xrefTrailerResolver.setXRef(objKey, currOffset);
+                        }
+                    }
+                    catch (IllegalArgumentException e)
+                    {
+                        throw new IOException(e);
+                    }
+                }
+                else if(!splitString[2].equals("f"))
+                {
+                    throw new IOException("Corrupt XRefTable Entry - ObjID:" + 
currObjID);
+                }
+                currObjID++;
+                parser.skipSpaces();
+            }
+            parser.skipSpaces();
+            if (!parser.isDigit())
+            {
+                break;
+            }
+        }
+        return true;
+    }
+
+}

Propchange: 
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r1916856 - in /pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser: BaseParser.java BruteForceParser.java COSParser.java XrefParser.java

Reply via email to