Author: lehmi Date: Mon Apr 8 05:54:20 2024 New Revision: 1916856 URL: http://svn.apache.org/viewvc?rev=1916856&view=rev Log: PDFBOX-5032: implement a separate parser for the xref information. BruteForceParser no longer extends COSParser
Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java (with props) Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java?rev=1916856&r1=1916855&r2=1916856&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BaseParser.java Mon Apr 8 05:54:20 2024 @@ -62,7 +62,7 @@ public abstract class BaseParser private static final long GENERATION_NUMBER_THRESHOLD = 65535; - static final int MAX_LENGTH_LONG = Long.toString(Long.MAX_VALUE).length(); + private static final int MAX_LENGTH_LONG = Long.toString(Long.MAX_VALUE).length(); private static final Charset ALTERNATIVE_CHARSET; @@ -133,13 +133,25 @@ public abstract class BaseParser private static final char[] NULL = { 'n', 'u', 'l', 'l' }; /** + * ASCII code for Null. + */ + private static final byte ASCII_NULL = 0; + /** + * ASCII code for horizontal tab. + */ + private static final byte ASCII_TAB = 9; + /** * ASCII code for line feed. */ - protected static final byte ASCII_LF = 10; + private static final byte ASCII_LF = 10; + /** + * ASCII code for form feed. + */ + private static final byte ASCII_FF = 12; /** * ASCII code for carriage return. */ - protected static final byte ASCII_CR = 13; + private static final byte ASCII_CR = 13; private static final byte ASCII_ZERO = 48; private static final byte ASCII_NINE = 57; private static final byte ASCII_SPACE = 32; @@ -251,8 +263,7 @@ public abstract class BaseParser { if (document == null) { - throw new IOException("object reference " + key + " at offset " + source - .getPosition() + throw new IOException("object reference " + key + " at offset " + source.getPosition() + " in content stream"); } return document.getObjectFromPool(key); @@ -389,32 +400,35 @@ public abstract class BaseParser return true; } + /** + * Skip the upcoming CRLF or LF which are supposed to follow a stream. + * + * @throws IOException + */ protected void skipWhiteSpaces() throws IOException { //PDF Ref 3.2.7 A stream must be followed by either //a CRLF or LF but nothing else. - int whitespace = source.read(); - //see brother_scan_cover.pdf, it adds whitespaces //after the stream but before the start of the //data, so just read those first - while (ASCII_SPACE == whitespace) + while (isSpace(whitespace)) { whitespace = source.read(); } - if (ASCII_CR == whitespace) + if (isCR(whitespace)) { whitespace = source.read(); - if (ASCII_LF != whitespace) + if (!isLF(whitespace)) { source.rewind(1); //The spec says this is invalid but it happens in the real //world so we must support it. } } - else if (ASCII_LF != whitespace) + else if (!isLF(whitespace)) { //we are in an error. //but again we will do a lenient parsing and just assume that everything @@ -460,10 +474,10 @@ public abstract class BaseParser // 4. COSDictionary ends in the next line: LF + '>' // 5. Next line contains another COSObject: CR + '/' // 6. COSDictionary ends in the next line: CR + '>' - if (((nextThreeBytes[0] == ASCII_CR || nextThreeBytes[0] == ASCII_LF) + if (((isCR(nextThreeBytes[0]) || isLF(nextThreeBytes[0])) && (nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>')) // || // - (nextThreeBytes[0] == ASCII_CR && nextThreeBytes[1] == ASCII_LF + (isCR(nextThreeBytes[0]) && isLF(nextThreeBytes[1]) && (nextThreeBytes[2] == '/' || nextThreeBytes[2] == '>')) // ) { @@ -781,14 +795,14 @@ public abstract class BaseParser * @param ch The character * @return true if the character terminates a PDF name, otherwise false. */ - protected boolean isEndOfName(int ch) + protected static boolean isEndOfName(int ch) { switch (ch) { case ASCII_SPACE: case ASCII_CR: case ASCII_LF: - case 9: + case ASCII_TAB: case '>': case '<': case '[': @@ -796,7 +810,7 @@ public abstract class BaseParser case ']': case ')': case '(': - case 0: + case ASCII_NULL: case '\f': case '%': case -1: @@ -872,8 +886,8 @@ public abstract class BaseParser } /** - * Tries to decode the buffer cotent to an UTF-8 String. - * If that fails, tries the alternative Encoding. + * Tries to decode the buffer content to an UTF-8 String. If that fails, tries the alternative Encoding. + * * @param buffer the {@link ByteArrayOutputStream} containing the bytes to decode * @return the decoded String */ @@ -935,7 +949,7 @@ public abstract class BaseParser case (char)-1: return null; default: - if( Character.isDigit(c) || c == '-' || c == '+' || c == '.') + if (isDigit(c) || c == '-' || c == '+' || c == '.') { return parseCOSNumber(); } @@ -1086,22 +1100,10 @@ public abstract class BaseParser /** * This will tell if the next character is a closing brace( close of PDF array ). * - * @return true if the next byte is ']', false otherwise. - * - * @throws IOException If an IO error occurs. - */ - protected boolean isClosing() throws IOException - { - return isClosing(source.peek()); - } - - /** - * This will tell if the next character is a closing brace( close of PDF array ). - * * @param c The character to check against end of line * @return true if the next byte is ']', false otherwise. */ - protected boolean isClosing(int c) + protected static boolean isClosing(int c) { return c == ']'; } @@ -1144,18 +1146,6 @@ public abstract class BaseParser } /** - * This will tell if the next byte to be read is an end of line byte. - * - * @return true if the next byte is 0x0A or 0x0D. - * - * @throws IOException If there is an error reading from the stream. - */ - protected boolean isEOL() throws IOException - { - return isEOL(source.peek()); - } - - /** * This will tell if the end of the data is reached. * * @return true if the end of the data is reached. @@ -1172,17 +1162,29 @@ public abstract class BaseParser * @param c The character to check against end of line * @return true if the next byte is 0x0A or 0x0D. */ - protected boolean isEOL(int c) + protected static boolean isEOL(int c) { return isLF(c) || isCR(c); } - private boolean isLF(int c) + /** + * This will tell if the next byte to be read is a line feed. + * + * @param c The character to check against line feed + * @return true if the next byte is 0x0A. + */ + private static boolean isLF(int c) { return ASCII_LF == c; } - private boolean isCR(int c) + /** + * This will tell if the next byte to be read is a carriage return. + * + * @param c The character to check against carriage return + * @return true if the next byte is 0x0D. + */ + private static boolean isCR(int c) { return ASCII_CR == c; } @@ -1209,9 +1211,9 @@ public abstract class BaseParser { switch (c) { - case 0: - case 9: - case 12: + case ASCII_NULL: + case ASCII_TAB: + case ASCII_FF: case ASCII_LF: case ASCII_CR: case ASCII_SPACE: @@ -1239,7 +1241,7 @@ public abstract class BaseParser * @param c The character to check against space * @return true if the next byte in the stream is a space character. */ - protected boolean isSpace(int c) + private static boolean isSpace(int c) { return ASCII_SPACE == c; } @@ -1327,7 +1329,8 @@ public abstract class BaseParser int retval = readInt(); if(retval < 0 || retval > GENERATION_NUMBER_THRESHOLD) { - throw new IOException("Generation Number '" + retval + "' has more than 5 digits"); + throw new IOException( + "Generation Number '" + retval + "' has more than 5 digits or is negative"); } return retval; } @@ -1399,7 +1402,7 @@ public abstract class BaseParser { int lastByte; StringBuilder buffer = new StringBuilder(); - while ((lastByte = source.read()) >= '0' && lastByte <= '9') + while (isDigit(lastByte = source.read())) { buffer.append( (char)lastByte ); if (buffer.length() > MAX_LENGTH_LONG) Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java?rev=1916856&r1=1916855&r2=1916856&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/BruteForceParser.java Mon Apr 8 05:54:20 2024 @@ -42,7 +42,7 @@ import org.apache.pdfbox.pdmodel.encrypt /** * Brute force parser to be used as last resort if a malformed pdf can't be read. */ -public class BruteForceParser extends COSParser +public class BruteForceParser { private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' }; private static final char[] XREF_STREAM = { '/', 'X', 'R', 'e', 'f' }; @@ -77,18 +77,23 @@ public class BruteForceParser extends CO private boolean bfSearchTriggered = false; + private final COSParser parser; + private final COSDocument document; + private final RandomAccessRead source; + /** * Constructor. Triggers a brute force search for all objects of the document. * - * @param source input representing the pdf. - * @param document the corresponding COS document + * @param cosDocument the corresponding COS document + * @param cosParser the COSParser to be used for reading the pdf * * @throws IOException if the source data could not be read */ - public BruteForceParser(RandomAccessRead source, COSDocument document) throws IOException + public BruteForceParser(COSDocument cosDocument, COSParser cosParser) throws IOException { - super(source); - this.document = document; + document = cosDocument; + parser = cosParser; + source = parser.source; } /** @@ -140,25 +145,25 @@ public class BruteForceParser extends CO source.seek(currentOffset); int nextChar = source.read(); currentOffset++; - if (isWhitespace(nextChar) && isString(OBJ_MARKER)) + if (COSParser.isWhitespace(nextChar) && parser.isString(OBJ_MARKER)) { long tempOffset = currentOffset - 2; source.seek(tempOffset); int genID = source.peek(); // is the next char a digit? - if (isDigit(genID)) + if (COSParser.isDigit(genID)) { genID -= 48; tempOffset--; source.seek(tempOffset); - if (isWhitespace()) + if (parser.isWhitespace()) { - while (tempOffset > MINIMUM_SEARCH_OFFSET && isWhitespace()) + while (tempOffset > MINIMUM_SEARCH_OFFSET && parser.isWhitespace()) { source.seek(--tempOffset); } boolean objectIDFound = false; - while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) + while (tempOffset > MINIMUM_SEARCH_OFFSET && parser.isDigit()) { source.seek(--tempOffset); objectIDFound = true; @@ -166,7 +171,7 @@ public class BruteForceParser extends CO if (objectIDFound) { source.read(); - long objectId = readObjectNumber(); + long objectId = parser.readObjectNumber(); if (lastObjOffset > 0) { // add the former object ID only if there was a subsequent object ID @@ -185,21 +190,21 @@ public class BruteForceParser extends CO // check for "endo" as abbreviation for "endobj", as the pdf may be cut off // in the middle of the keyword, see PDFBOX-3936. // We could possibly implement a more intelligent algorithm if necessary - else if (nextChar == 'e' && isString(endobjString)) + else if (nextChar == 'e' && parser.isString(endobjString)) { currentOffset += endobjString.length; source.seek(currentOffset); - if (source.isEOF()) + if (parser.isEOF()) { endOfObjFound = true; } - else if (isString(endobjRemainingString)) + else if (parser.isString(endobjRemainingString)) { currentOffset += endobjRemainingString.length; endOfObjFound = true; } } - } while (currentOffset < lastEOFMarker && !source.isEOF()); + } while (currentOffset < lastEOFMarker && !parser.isEOF()); if ((lastEOFMarker < Long.MAX_VALUE || endOfObjFound) && lastObjOffset > 0) { // if the pdf wasn't cut off in the middle or if the last object ends with a "endobj" marker @@ -291,15 +296,12 @@ public class BruteForceParser extends CO /** * Brute force search for all objects streams of a pdf. * - * @param trailerResolver the trailer resolver of the document - * @param securityHandler security handler to be used to decrypt encrypted documents + * @param xrefTable the cross reference table of the document + * * @throws IOException if something went wrong */ - protected void bfSearchForObjStreams(XrefTrailerResolver trailerResolver, - SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException + protected void bfSearchForObjStreams(Map<COSObjectKey, Long> xrefTable) throws IOException { - // update security handler - this.securityHandler = securityHandler; // save origin offset long originOffset = source.getPosition(); @@ -318,24 +320,24 @@ public class BruteForceParser extends CO .map(Map.Entry::getKey) // .collect(Collectors.toList()); // add all found compressed objects to the brute force search result + SecurityHandler<? extends ProtectionPolicy> securityHandler = parser.getSecurityHandler(); for (Long offset : objStreamOffsets) { source.seek(offset); - long stmObjNumber = readObjectNumber(); - int stmGenNumber = readGenerationNumber(); - readExpectedString(OBJ_MARKER, true); + long stmObjNumber = parser.readObjectNumber(); + int stmGenNumber = parser.readGenerationNumber(); + parser.readExpectedString(OBJ_MARKER, true); COSStream stream = null; try { - COSDictionary dict = parseCOSDictionary(false); - stream = parseCOSStream(dict); + COSDictionary dict = parser.parseCOSDictionary(false); + stream = parser.parseCOSStream(dict); if (securityHandler != null) { securityHandler.decryptStream(stream, stmObjNumber, stmGenNumber); } PDFObjectStreamParser objStreamParser = new PDFObjectStreamParser(stream, document); Map<Long, Integer> objectNumbers = objStreamParser.readObjectNumbers(); - Map<COSObjectKey, Long> xrefOffset = trailerResolver.getXrefTable(); for (Long objNumber : objectNumbers.keySet()) { COSObjectKey objKey = new COSObjectKey(objNumber, 0); @@ -349,7 +351,7 @@ public class BruteForceParser extends CO if (existingOffset == null || offset > existingOffset) { bfCOSObjectOffsets.put(objKey, -stmObjNumber); - xrefOffset.put(objKey, -stmObjNumber); + xrefTable.put(objKey, -stmObjNumber); } } } @@ -389,8 +391,8 @@ public class BruteForceParser extends CO { boolean rootFound = false; boolean infoFound = false; - skipSpaces(); - COSDictionary trailerDict = parseCOSDictionary(true); + parser.skipSpaces(); + COSDictionary trailerDict = parser.parseCOSDictionary(true); COSObject rootObj = trailerDict.getCOSObject(COSName.ROOT); if (rootObj != null) { @@ -532,11 +534,11 @@ public class BruteForceParser extends CO // check if the following data is some valid pdf content // which most likely indicates that the pdf is linearized, // updated or just cut off somewhere in the middle - skipSpaces(); - if (!isString(XREF_TABLE)) + parser.skipSpaces(); + if (!parser.isString(XREF_TABLE)) { - readObjectNumber(); - readGenerationNumber(); + parser.readObjectNumber(); + parser.readGenerationNumber(); } } catch (IOException exception) @@ -583,21 +585,20 @@ public class BruteForceParser extends CO source.seek(currentOffset); for (int j = 0; j < 10; j++) { - if (isString(string)) + if (parser.isString(string)) { long tempOffset = currentOffset - 1; source.seek(tempOffset); - int genID = source.peek(); // is the next char a digit? - if (isDigit(genID)) + if (parser.isDigit()) { tempOffset--; source.seek(tempOffset); - if (isSpace()) + if (parser.isSpace()) { int length = 0; source.seek(--tempOffset); - while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) + while (tempOffset > MINIMUM_SEARCH_OFFSET && parser.isDigit()) { source.seek(--tempOffset); length++; @@ -606,8 +607,8 @@ public class BruteForceParser extends CO { source.read(); newOffset = source.getPosition(); - long objNumber = readObjectNumber(); - int genNumber = readGenerationNumber(); + long objNumber = parser.readObjectNumber(); + int genNumber = parser.readGenerationNumber(); COSObjectKey streamObjectKey = new COSObjectKey(objNumber, genNumber); bfSearchObjStreamsOffsets.put(newOffset, streamObjectKey); @@ -648,7 +649,7 @@ public class BruteForceParser extends CO { source.seek(newOffset - 1); // ensure that we don't read "startxref" instead of "xref" - if (isWhitespace()) + if (parser.isWhitespace()) { bfSearchXRefTablesOffsets.add(newOffset); } @@ -685,21 +686,20 @@ public class BruteForceParser extends CO source.seek(currentOffset); for (int j = 0; j < 10; j++) { - if (isString(string)) + if (parser.isString(string)) { long tempOffset = currentOffset - 1; source.seek(tempOffset); - int genID = source.peek(); // is the next char a digit? - if (isDigit(genID)) + if (parser.isDigit()) { tempOffset--; source.seek(tempOffset); - if (isSpace()) + if (parser.isSpace()) { int length = 0; source.seek(--tempOffset); - while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) + while (tempOffset > MINIMUM_SEARCH_OFFSET && parser.isDigit()) { source.seek(--tempOffset); length++; @@ -759,7 +759,7 @@ public class BruteForceParser extends CO /** * Tell if the dictionary is a PDF or FDF catalog. * - * @param dictionary + * @param dictionary the dictionary to be tested * @return true if the given dictionary is a root dictionary */ private boolean isCatalog(COSDictionary dictionary) @@ -810,20 +810,17 @@ public class BruteForceParser extends CO /** * Rebuild the trailer dictionary if startxref can't be found. + * + * @param xrefTable the cross reference table of the pdf * - * @param trailerResolver the trailer resolver of the document - * @param securityHandler security handler to be used to decrypt encrypted documents * @return the rebuild trailer dictionary * * @throws IOException if something went wrong */ - protected COSDictionary rebuildTrailer(XrefTrailerResolver trailerResolver, - SecurityHandler<? extends ProtectionPolicy> securityHandler) throws IOException + protected COSDictionary rebuildTrailer(Map<COSObjectKey, Long> xrefTable) throws IOException { - // update security handler - this.securityHandler = securityHandler; - // reset trailer resolver - trailerResolver.reset(); + // use a new trailer resolver + XrefTrailerResolver trailerResolver = new XrefTrailerResolver(); // use the found objects to rebuild the trailer resolver trailerResolver.nextXrefObj(0, XRefType.TABLE); getBFCOSObjectOffsets().forEach(trailerResolver::setXRef); @@ -840,22 +837,23 @@ public class BruteForceParser extends CO COSDictionary trailer = trailerResolver.getTrailer(); document.setTrailer(trailer); + xrefTable.putAll(trailerResolver.getXrefTable()); boolean searchForObjStreamsDone = false; if (!bfSearchForTrailer(trailer) && !searchForTrailerItems(trailer)) { // root entry wasn't found, maybe it is part of an object stream // brute force search for all object streams. - bfSearchForObjStreams(trailerResolver, securityHandler); + bfSearchForObjStreams(xrefTable); searchForObjStreamsDone = true; // search again for the root entry searchForTrailerItems(trailer); } // prepare decryption if necessary - prepareDecryption(); + parser.prepareDecryption(); if (!searchForObjStreamsDone) { // brute force search for all object streams. - bfSearchForObjStreams(trailerResolver, securityHandler); + bfSearchForObjStreams(xrefTable); } return trailer; } Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1916856&r1=1916855&r2=1916856&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Mon Apr 8 05:54:20 2024 @@ -25,8 +25,6 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Optional; -import java.util.Map.Entry; import java.util.Set; import org.apache.logging.log4j.Logger; @@ -46,7 +44,6 @@ import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.io.RandomAccessRead; import org.apache.pdfbox.io.RandomAccessReadView; import org.apache.pdfbox.io.RandomAccessStreamCache.StreamCacheCreateFunction; -import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; @@ -55,7 +52,6 @@ import org.apache.pdfbox.pdmodel.encrypt import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial; import org.apache.pdfbox.pdmodel.encryption.SecurityHandler; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; -import org.apache.pdfbox.util.StringUtil; /** * COS-Parser which first reads startxref and xref tables in order to know valid objects and parse only these objects. @@ -71,17 +67,14 @@ public class COSParser extends BaseParse private static final String PDF_DEFAULT_VERSION = "1.4"; private static final String FDF_DEFAULT_VERSION = "1.0"; - private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' }; private static final char[] STARTXREF = { 's','t','a','r','t','x','r','e','f' }; private static final byte[] ENDSTREAM = { E, N, D, S, T, R, E, A, M }; private static final byte[] ENDOBJ = { E, N, D, O, B, J }; - private static final long MINIMUM_SEARCH_OFFSET = 6; + protected static final long MINIMUM_SEARCH_OFFSET = 6; - private static final int X = 'x'; - private static final int STRMBUFLEN = 2048; private final byte[] strmBuf = new byte[ STRMBUFLEN ]; @@ -105,16 +98,16 @@ public class COSParser extends BaseParse /** * EOF-marker. */ - protected static final char[] EOF_MARKER = { '%', '%', 'E', 'O', 'F' }; + private static final char[] EOF_MARKER = { '%', '%', 'E', 'O', 'F' }; /** * obj-marker. */ - protected static final char[] OBJ_MARKER = { 'o', 'b', 'j' }; + private static final char[] OBJ_MARKER = { 'o', 'b', 'j' }; /** * file length. */ - protected long fileLen; + private final long fileLen; /** * is parser using auto healing capacity ? @@ -127,6 +120,7 @@ public class COSParser extends BaseParse private BruteForceParser bruteForceParser = null; private PDEncryption encryption = null; + private final Map<COSObjectKey, Long> xrefTable = new HashMap<>(); /** * Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after @@ -137,7 +131,7 @@ public class COSParser extends BaseParse /** * The security handler. */ - protected SecurityHandler<? extends ProtectionPolicy> securityHandler = null; + private SecurityHandler<? extends ProtectionPolicy> securityHandler = null; /** * how many trailing bytes to read for EOF marker. @@ -146,12 +140,6 @@ public class COSParser extends BaseParse private static final Logger LOG = LogManager.getLogger(COSParser.class); - /** - * Collects all Xref/trailer objects and resolves them into single - * object using startxref reference. - */ - protected XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver(); - /** * Default constructor. * @@ -261,7 +249,9 @@ public class COSParser extends BaseParse long startXRefOffset = getStartxrefOffset(); if (startXRefOffset > -1) { - trailer = parseXref(startXRefOffset); + XrefParser xrefParser = new XrefParser(document, this); + trailer = xrefParser.parseXref(startXRefOffset); + xrefTable.putAll(xrefParser.getXrefTable()); } else { @@ -286,15 +276,10 @@ public class COSParser extends BaseParse } if (rebuildTrailer) { - trailer = getBruteForceParser().rebuildTrailer(xrefTrailerResolver, null); + // reset cross reference table + xrefTable.clear(); + trailer = getBruteForceParser().rebuildTrailer(xrefTable); trailerWasRebuild = true; - // transfer encryption information from BruteForceParser - encryption = getBruteForceParser().getEncryption(); - if (encryption != null) - { - securityHandler = encryption.getSecurityHandler(); - accessPermission = securityHandler.getCurrentAccessPermission(); - } } else { @@ -303,194 +288,13 @@ public class COSParser extends BaseParse // don't use the getter as it creates an instance of BruteForceParser if (bruteForceParser != null && bruteForceParser.bfSearchTriggered()) { - getBruteForceParser().bfSearchForObjStreams(xrefTrailerResolver, securityHandler); + getBruteForceParser().bfSearchForObjStreams(xrefTable); } } - if (resetTrailerResolver()) - { - xrefTrailerResolver.reset(); - xrefTrailerResolver = null; - } return trailer; } /** - * Indicates whether the xref trailer resolver should be reset or not. Should be overwritten if the xref trailer - * resolver is needed after the initial parsing. - * - * @return true if the xref trailer resolver should be reset - */ - protected boolean resetTrailerResolver() - { - return true; - } - - /** - * Parses cross reference tables. - * - * @param startXRefOffset start offset of the first table - * @return the trailer dictionary - * @throws IOException if something went wrong - */ - private COSDictionary parseXref(long startXRefOffset) throws IOException - { - source.seek(startXRefOffset); - long startXrefOffset = Math.max(0, parseStartXref()); - // check the startxref offset - long fixedOffset = checkXRefOffset(startXrefOffset); - if (fixedOffset > -1) - { - startXrefOffset = fixedOffset; - } - document.setStartXref(startXrefOffset); - long prev = startXrefOffset; - // ---- parse whole chain of xref tables/object streams using PREV reference - Set<Long> prevSet = new HashSet<>(); - COSDictionary trailer = null; - while (prev > 0) - { - // save expected position for loop detection - prevSet.add(prev); - // seek to xref table - source.seek(prev); - // skip white spaces - skipSpaces(); - // save current position as well due to skipped spaces - prevSet.add(source.getPosition()); - // -- parse xref - if (source.peek() == X) - { - // xref table and trailer - // use existing parser to parse xref table - if (!parseXrefTable(prev) || !parseTrailer()) - { - throw new IOException("Expected trailer object at offset " - + source.getPosition()); - } - trailer = xrefTrailerResolver.getCurrentTrailer(); - // check for a XRef stream, it may contain some object ids of compressed objects - if(trailer.containsKey(COSName.XREF_STM)) - { - int streamOffset = trailer.getInt(COSName.XREF_STM); - // check the xref stream reference - fixedOffset = checkXRefOffset(streamOffset); - if (fixedOffset > -1 && fixedOffset != streamOffset) - { - LOG.warn("/XRefStm offset {} is incorrect, corrected to {}", streamOffset, - fixedOffset); - streamOffset = (int)fixedOffset; - trailer.setInt(COSName.XREF_STM, streamOffset); - } - if (streamOffset > 0) - { - source.seek(streamOffset); - skipSpaces(); - try - { - parseXrefObjStream(prev, false); - document.setHasHybridXRef(); - } - catch (IOException ex) - { - if (isLenient) - { - LOG.error("Failed to parse /XRefStm at offset {}", streamOffset, - ex); - } - else - { - throw ex; - } - } - } - else - { - if(isLenient) - { - LOG.error("Skipped XRef stream due to a corrupt offset:{}", - streamOffset); - } - else - { - throw new IOException("Skipped XRef stream due to a corrupt offset:"+streamOffset); - } - } - } - prev = trailer.getLong(COSName.PREV); - } - else - { - // parse xref stream - prev = parseXrefObjStream(prev, true); - trailer = xrefTrailerResolver.getCurrentTrailer(); - } - if (prev > 0) - { - // check the xref table reference - fixedOffset = checkXRefOffset(prev); - if (fixedOffset > -1 && fixedOffset != prev) - { - prev = fixedOffset; - trailer.setLong(COSName.PREV, prev); - } - } - if (prevSet.contains(prev)) - { - throw new IOException("/Prev loop at offset " + prev); - } - } - // ---- build valid xrefs out of the xref chain - xrefTrailerResolver.setStartxref(startXrefOffset); - trailer = xrefTrailerResolver.getTrailer(); - document.setTrailer(trailer); - document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType()); - // check the offsets of all referenced objects - if (isLenient) - { - checkXrefOffsets(); - } - // copy xref table - document.addXRefTable(xrefTrailerResolver.getXrefTable()); - - // remember the highest XRef object number to avoid it being reused in incremental saving - Optional<Long> maxValue = document.getXrefTable().keySet().stream() // - .map(COSObjectKey::getNumber) // - .reduce(Long::max); - document.setHighestXRefObjectNumber(maxValue.isPresent() ? maxValue.get() : 0); - - return trailer; - } - - /** - * Parses an xref object stream starting with indirect object id. - * - * @return value of PREV item in dictionary or <code>-1</code> if no such item exists - */ - private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException - { - // ---- parse indirect object head - readObjectNumber(); - readGenerationNumber(); - readExpectedString(OBJ_MARKER, true); - - COSDictionary dict = parseCOSDictionary(false); - try (COSStream xrefStream = parseCOSStream(dict)) - { - // the cross reference stream of a hybrid xref table will be added to the existing one - // and we must not override the offset and the trailer - if ( isStandalone ) - { - xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM ); - xrefTrailerResolver.setTrailer(xrefStream); - } - PDFXrefStreamParser parser = new PDFXrefStreamParser(xrefStream, document); - parser.parse(xrefTrailerResolver); - } - - return dict.getLong(COSName.PREV); - } - - /** * Looks for and parses startxref. We first look for last '%%EOF' marker (within last * {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via {@link #setEOFLookupRange(int)}) and go back to find * <code>startxref</code>. @@ -564,7 +368,7 @@ public class COSParser extends BaseParse * * @return start offset of pattern within buffer or <code>-1</code> if pattern could not be found */ - protected int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) + private int lastIndexOf(final char[] pattern, final byte[] buf, final int endOff) { final int lastPatternChOff = pattern.length - 1; @@ -658,7 +462,7 @@ public class COSParser extends BaseParse * * @throws IOException If an IO error occurs. */ - protected synchronized COSBase parseObjectDynamically(COSObjectKey objKey, + private synchronized COSBase parseObjectDynamically(COSObjectKey objKey, boolean requireExistingNotCompressedObj) throws IOException { COSObject pdfObject = document.getObjectFromPool(objKey); @@ -727,7 +531,7 @@ public class COSParser extends BaseParse // an indirect object starts with the object number/generation number final long readObjNr = readObjectNumber(); final int readObjGen = readGenerationNumber(); - readExpectedString(OBJ_MARKER, true); + readObjectMarker(); // consistency check if (readObjNr != objKey.getNumber() || readObjGen != objKey.getGeneration()) @@ -876,7 +680,7 @@ public class COSParser extends BaseParse } if (COSNull.NULL == length) { - LOG.warn("Length object ({} {}) not found", lengthObj.getKey()); + LOG.warn("Length object ({}) not found", lengthObj.getKey()); return null; } if (length instanceof COSNumber) @@ -961,7 +765,7 @@ public class COSParser extends BaseParse { throw new IOException( "Error reading stream, expected='endstream' actual='" - + endStream + "' at offset " + source.getPosition()); + + endStream + "' at offset " + source.getPosition()); } return document.createCOSStream(dic, streamStartPosition, streamLength); } @@ -1102,287 +906,11 @@ public class COSParser extends BaseParse return streamLengthIsValid; } - /** - * Check if the cross reference table/stream can be found at the current offset. - * - * @param startXRefOffset - * @return the revised offset - * @throws IOException - */ - private long checkXRefOffset(long startXRefOffset) throws IOException - { - // repair mode isn't available in non-lenient mode - if (!isLenient) - { - return startXRefOffset; - } - source.seek(startXRefOffset); - skipSpaces(); - if (isString(XREF_TABLE)) - { - return startXRefOffset; - } - if (startXRefOffset > 0) - { - if (checkXRefStreamOffset(startXRefOffset)) - { - return startXRefOffset; - } - else - { - return calculateXRefFixedOffset(startXRefOffset); - } - } - // can't find a valid offset - return -1; - } - - /** - * Check if the cross reference stream can be found at the current offset. - * - * @param startXRefOffset the expected start offset of the XRef stream - * @return the revised offset - * @throws IOException if something went wrong - */ - private boolean checkXRefStreamOffset(long startXRefOffset) throws IOException - { - // repair mode isn't available in non-lenient mode - if (!isLenient || startXRefOffset == 0) - { - return true; - } - // seek to offset-1 - source.seek(startXRefOffset-1); - int nextValue = source.read(); - // the first character has to be a whitespace, and then a digit - if (isWhitespace(nextValue)) - { - skipSpaces(); - if (isDigit()) - { - try - { - // it's a XRef stream - readObjectNumber(); - readGenerationNumber(); - readExpectedString(OBJ_MARKER, true); - // check the dictionary to avoid false positives - COSDictionary dict = parseCOSDictionary(false); - source.seek(startXRefOffset); - if ("XRef".equals(dict.getNameAsString(COSName.TYPE))) - { - return true; - } - } - catch (IOException exception) - { - // there wasn't an object of a xref stream - LOG.debug("No Xref stream at given location {}", startXRefOffset, exception); - source.seek(startXRefOffset); - } - } - } - return false; - } - - /** - * Try to find a fixed offset for the given xref table/stream. - * - * @param objectOffset the given offset where to look at - * @return the fixed offset - * - * @throws IOException if something went wrong - */ - private long calculateXRefFixedOffset(long objectOffset) throws IOException - { - if (objectOffset < 0) - { - LOG.error("Invalid object offset {} when searching for a xref table/stream", - objectOffset); - return 0; - } - // search for the offset of the given xref table/stream among those found by a brute force search. - long newOffset = getBruteForceParser().bfSearchForXRef(objectOffset); - if (newOffset > -1) - { - LOG.debug("Fixed reference for xref table/stream {} -> {}", objectOffset, newOffset); - return newOffset; - } - LOG.error("Can't find the object xref table/stream at offset {}", objectOffset); - return 0; - } - - private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) throws IOException - { - if (xrefOffset == null) - { - return true; - } - Map<COSObjectKey, COSObjectKey> correctedKeys = new HashMap<>(); - HashSet<COSObjectKey> validKeys = new HashSet<>(); - for (Entry<COSObjectKey, Long> objectEntry : xrefOffset.entrySet()) - { - COSObjectKey objectKey = objectEntry.getKey(); - Long objectOffset = objectEntry.getValue(); - // a negative offset number represents an object number itself - // see type 2 entry in xref stream - if (objectOffset != null && objectOffset >= 0) - { - COSObjectKey foundObjectKey = findObjectKey(objectKey, objectOffset, xrefOffset); - if (foundObjectKey == null) - { - LOG.debug( - "Stop checking xref offsets as at least one ({}) couldn't be dereferenced", - objectKey); - return false; - } - else if (foundObjectKey != objectKey) - { - // Generation was fixed - need to update map later, after iteration - correctedKeys.put(objectKey, foundObjectKey); - } - else - { - validKeys.add(objectKey); - } - } - } - Map<COSObjectKey, Long> correctedPointers = new HashMap<>(); - for (Entry<COSObjectKey, COSObjectKey> correctedKeyEntry : correctedKeys.entrySet()) - { - if (!validKeys.contains(correctedKeyEntry.getValue())) - { - // Only replace entries, if the original entry does not point to a valid object - correctedPointers.put(correctedKeyEntry.getValue(), - xrefOffset.get(correctedKeyEntry.getKey())); - } - } - // remove old invalid, as some might not be replaced - correctedKeys.forEach((key, value) -> xrefOffset.remove(key)); - xrefOffset.putAll(correctedPointers); - return true; - } - - /** - * Check the XRef table by dereferencing all objects and fixing the offset if necessary. - * - * @throws IOException if something went wrong. - */ - private void checkXrefOffsets() throws IOException - { - Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable(); - if (!validateXrefOffsets(xrefOffset)) - { - Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBruteForceParser() - .getBFCOSObjectOffsets(); - if (!bfCOSObjectKeyOffsets.isEmpty()) - { - LOG.debug("Replaced read xref table with the results of a brute force search"); - xrefOffset.clear(); - xrefOffset.putAll(bfCOSObjectKeyOffsets); - } - } - } - - /** - * Check if the given object can be found at the given offset. Returns the provided object key if everything is ok. - * If the generation number differs it will be fixed and a new object key is returned. - * - * @param objectKey the key of object we are looking for - * @param offset the offset where to look - * @param xrefOffset a map with with all known xref entries - * @return returns the found/fixed object key - * - * @throws IOException if something went wrong - */ - private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset, - Map<COSObjectKey, Long> xrefOffset) throws IOException - { - // there can't be any object at the very beginning of a pdf - if (offset < MINIMUM_SEARCH_OFFSET) - { - return null; - } - try - { - source.seek(offset); - skipWhiteSpaces(); - if (source.getPosition() == offset) - { - // ensure that at least one whitespace is skipped in front of the object number - source.seek(offset - 1); - if (source.getPosition() < offset) - { - if (!isDigit()) - { - // anything else but a digit may be some garbage of the previous object -> just ignore it - source.read(); - } - else - { - long current = source.getPosition(); - source.seek(--current); - while (isDigit()) - source.seek(--current); - long newObjNr = readObjectNumber(); - int newGenNr = readGenerationNumber(); - COSObjectKey newObjKey = new COSObjectKey(newObjNr, newGenNr); - Long existingOffset = xrefOffset.get(newObjKey); - // the found object number belongs to another uncompressed object at the same or nearby offset - // something has to be wrong - if (existingOffset != null && existingOffset > 0 - && Math.abs(offset - existingOffset) < 10) - { - LOG.debug("Found the object {} instead of {} at offset {} - ignoring", - newObjKey, objectKey, offset); - return null; - } - // something seems to be wrong but it's hard to determine what exactly -> simply continue - source.seek(offset); - } - } - } - // try to read the given object/generation number - long foundObjectNumber = readObjectNumber(); - if (objectKey.getNumber() != foundObjectNumber) - { - LOG.warn("found wrong object number. expected [{}] found [{}]", - objectKey.getNumber(), foundObjectNumber); - if (!isLenient) - { - return null; - } - else - { - objectKey = new COSObjectKey(foundObjectNumber, objectKey.getGeneration()); - } - } - - int genNumber = readGenerationNumber(); - // finally try to read the object marker - readExpectedString(OBJ_MARKER, true); - if (genNumber == objectKey.getGeneration()) - { - return objectKey; - } - else if (isLenient && genNumber > objectKey.getGeneration()) - { - return new COSObjectKey(objectKey.getNumber(), genNumber); - } - } - catch (IOException exception) - { - // Swallow the exception, obviously there isn't any valid object number - LOG.debug("No valid object at given location {} - ignoring", offset, exception); - } - return null; - } - - private BruteForceParser getBruteForceParser() throws IOException + protected BruteForceParser getBruteForceParser() throws IOException { if (bruteForceParser == null) { - bruteForceParser = new BruteForceParser(source, document); + bruteForceParser = new BruteForceParser(document, this); } return bruteForceParser; } @@ -1457,25 +985,6 @@ public class COSParser extends BaseParse } /** - * This will parse the startxref section from the stream. The startxref value is ignored. - * - * @return the startxref value or -1 on parsing error - * @throws IOException If an IO error occurs. - */ - private long parseStartXref() throws IOException - { - long startXref = -1; - if (isString(STARTXREF)) - { - readString(); - skipSpaces(); - // This integer is the byte offset of the first object referenced by the xref or xref stream - startXref = readLong(); - } - return startXref; - } - - /** * Checks if the given string can be found at the current offset. * * @param string the bytes of the string to look for @@ -1521,67 +1030,9 @@ public class COSParser extends BaseParse return bytesMatching; } - /** - * This will parse the trailer from the stream and add it to the state. - * - * @return false on parsing error - * @throws IOException If an IO error occurs. - */ - private boolean parseTrailer() throws IOException + protected void readObjectMarker() throws IOException { - // parse the last trailer. - long trailerOffset = source.getPosition(); - // PDFBOX-1739 skip extra xref entries in RegisSTAR documents - if (isLenient) - { - int nextCharacter = source.peek(); - while (nextCharacter != 't' && isDigit(nextCharacter)) - { - if (source.getPosition() == trailerOffset) - { - // warn only the first time - LOG.warn("Expected trailer object at offset {}, keep trying", trailerOffset); - } - readLine(); - nextCharacter = source.peek(); - } - } - if(source.peek() != 't') - { - return false; - } - //read "trailer" - long currentOffset = source.getPosition(); - String nextLine = readLine(); - if( !nextLine.trim().equals( "trailer" ) ) - { - // in some cases the EOL is missing and the trailer immediately - // continues with "<<" or with a blank character - // even if this does not comply with PDF reference we want to support as many PDFs as possible - // Acrobat reader can also deal with this. - if (nextLine.startsWith("trailer")) - { - // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes - int len = "trailer".length(); - // jump back right after "trailer" - source.seek(currentOffset + len); - } - else - { - return false; - } - } - - // in some cases the EOL is missing and the trailer continues with " <<" - // even if this does not comply with PDF reference we want to support as many PDFs as possible - // Acrobat reader can also deal with this. - skipSpaces(); - - COSDictionary parsedTrailer = parseCOSDictionary(true); - xrefTrailerResolver.setTrailer( parsedTrailer ); - - skipSpaces(); - return true; + readExpectedString(OBJ_MARKER, true); } /** @@ -1690,128 +1141,6 @@ public class COSParser extends BaseParse } /** - * This will parse the xref table from the stream and add it to the state - * The XrefTable contents are ignored. - * @param startByteOffset the offset to start at - * @return false on parsing error - * @throws IOException If an IO error occurs. - */ - protected boolean parseXrefTable(long startByteOffset) throws IOException - { - if(source.peek() != 'x') - { - return false; - } - String xref = readString(); - if( !xref.trim().equals( "xref" ) ) - { - return false; - } - - // check for trailer after xref - String str = readString(); - byte[] b = str.getBytes(StandardCharsets.ISO_8859_1); - source.rewind(b.length); - - // signal start of new XRef - xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE ); - - if (str.startsWith("trailer")) - { - LOG.warn("skipping empty xref table"); - return false; - } - - // Xref tables can have multiple sections. Each starts with a starting object id and a count. - while(true) - { - String currentLine = readLine(); - String[] splitString = StringUtil.splitOnSpace(currentLine); - if (splitString.length != 2) - { - LOG.warn("Unexpected XRefTable Entry: {}", currentLine); - return false; - } - // first obj id - long currObjID; - try - { - currObjID = Long.parseLong(splitString[0]); - } - catch (NumberFormatException exception) - { - LOG.warn("XRefTable: invalid ID for the first object: {}", currentLine); - return false; - } - - // the number of objects in the xref table - int count = 0; - try - { - count = Integer.parseInt(splitString[1]); - } - catch (NumberFormatException exception) - { - LOG.warn("XRefTable: invalid number of objects: {}", currentLine); - return false; - } - - skipSpaces(); - for(int i = 0; i < count; i++) - { - if (source.isEOF() || isEndOfName(source.peek())) - { - break; - } - if(source.peek() == 't') - { - break; - } - //Ignore table contents - currentLine = readLine(); - splitString = StringUtil.splitOnSpace(currentLine); - if (splitString.length < 3) - { - LOG.warn("invalid xref line: {}", currentLine); - break; - } - /* This supports the corrupt table as reported in - * PDFBOX-474 (XXXX XXX XX n) */ - if(splitString[splitString.length-1].equals("n")) - { - try - { - long currOffset = Long.parseLong(splitString[0]); - // skip 0 offsets - if (currOffset > 0) - { - int currGenID = Integer.parseInt(splitString[1]); - COSObjectKey objKey = new COSObjectKey(currObjID, currGenID); - xrefTrailerResolver.setXRef(objKey, currOffset); - } - } - catch (IllegalArgumentException e) - { - throw new IOException(e); - } - } - else if(!splitString[2].equals("f")) - { - throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID); - } - currObjID++; - skipSpaces(); - } - skipSpaces(); - if (!isDigit()) - { - break; - } - } - return true; - } - - /** * This will get the encryption dictionary. The document must be parsed before this is called. * * @return The encryption dictionary of the document that was parsed. @@ -1901,4 +1230,15 @@ public class COSParser extends BaseParse } } + /** + * This will get the security handler. The document must be parsed before this is called. + * + * @return The security handler of the document that was parsed. + * + * @throws IOException If there is an error getting the document. + */ + protected SecurityHandler<? extends ProtectionPolicy> getSecurityHandler() + { + return securityHandler; + } } Added: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java?rev=1916856&view=auto ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java (added) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java Mon Apr 8 05:54:20 2024 @@ -0,0 +1,694 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.pdfbox.pdfparser; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Optional; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSDocument; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSObjectKey; +import org.apache.pdfbox.cos.COSStream; +import org.apache.pdfbox.io.RandomAccessRead; +import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType; + +/** + * Parser to be used to read the cross reference table of a pdf. It is either a simple table or a stream. + */ +public class XrefParser +{ + private static final Logger LOG = LogManager.getLogger(XrefParser.class); + + private static final int X = 'x'; + private static final char[] XREF_TABLE = { 'x', 'r', 'e', 'f' }; + private static final char[] STARTXREF = { 's', 't', 'a', 'r', 't', 'x', 'r', 'e', 'f' }; + + /** + * Collects all Xref/trailer objects and resolves them into single + * object using startxref reference. + */ + private XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver(); + + private final COSParser parser; + private final COSDocument document; + private final RandomAccessRead source; + + /** + * Default constructor. + * + * @param cosDocument the corresponding COS document of the pdf. + * @param cosParser the parser to be used to read the pdf. + * + */ + public XrefParser(COSDocument cosDocument, COSParser cosParser) + { + document = cosDocument; + parser = cosParser; + source = parser.source; + } + + /** + * Returns the resulting cross reference table. + * + * @return + */ + public Map<COSObjectKey, Long> getXrefTable() + { + return xrefTrailerResolver.getXrefTable(); + } + + /** + * Parses cross reference tables. + * + * @param startXRefOffset start offset of the first table + * @return the trailer dictionary + * @throws IOException if something went wrong + */ + public COSDictionary parseXref(long startXRefOffset) throws IOException + { + source.seek(startXRefOffset); + long startXrefOffset = Math.max(0, parseStartXref()); + // check the startxref offset + long fixedOffset = checkXRefOffset(startXrefOffset); + if (fixedOffset > -1) + { + startXrefOffset = fixedOffset; + } + document.setStartXref(startXrefOffset); + long prev = startXrefOffset; + // ---- parse whole chain of xref tables/object streams using PREV reference + Set<Long> prevSet = new HashSet<>(); + COSDictionary trailer = null; + while (prev > 0) + { + // save expected position for loop detection + prevSet.add(prev); + // seek to xref table + source.seek(prev); + // skip white spaces + parser.skipSpaces(); + // save current position as well due to skipped spaces + prevSet.add(source.getPosition()); + // -- parse xref + if (source.peek() == X) + { + // xref table and trailer + // use existing parser to parse xref table + if (!parseXrefTable(prev) || !parseTrailer()) + { + throw new IOException("Expected trailer object at offset " + + source.getPosition()); + } + trailer = xrefTrailerResolver.getCurrentTrailer(); + // check for a XRef stream, it may contain some object ids of compressed objects + if(trailer.containsKey(COSName.XREF_STM)) + { + int streamOffset = trailer.getInt(COSName.XREF_STM); + // check the xref stream reference + fixedOffset = checkXRefOffset(streamOffset); + if (fixedOffset > -1 && fixedOffset != streamOffset) + { + LOG.warn("/XRefStm offset {} is incorrect, corrected to {}", streamOffset, + fixedOffset); + streamOffset = (int)fixedOffset; + trailer.setInt(COSName.XREF_STM, streamOffset); + } + if (streamOffset > 0) + { + source.seek(streamOffset); + parser.skipSpaces(); + try + { + parseXrefObjStream(prev, false); + document.setHasHybridXRef(); + } + catch (IOException ex) + { + LOG.error("Failed to parse /XRefStm at offset {}", streamOffset, ex); + } + } + else + { + LOG.error("Skipped XRef stream due to a corrupt offset: {}", streamOffset); + } + } + prev = trailer.getLong(COSName.PREV); + } + else + { + // parse xref stream + prev = parseXrefObjStream(prev, true); + trailer = xrefTrailerResolver.getCurrentTrailer(); + } + if (prev > 0) + { + // check the xref table reference + fixedOffset = checkXRefOffset(prev); + if (fixedOffset > -1 && fixedOffset != prev) + { + prev = fixedOffset; + trailer.setLong(COSName.PREV, prev); + } + } + if (prevSet.contains(prev)) + { + throw new IOException("/Prev loop at offset " + prev); + } + } + // ---- build valid xrefs out of the xref chain + xrefTrailerResolver.setStartxref(startXrefOffset); + trailer = xrefTrailerResolver.getTrailer(); + document.setTrailer(trailer); + document.setIsXRefStream(XRefType.STREAM == xrefTrailerResolver.getXrefType()); + // check the offsets of all referenced objects + checkXrefOffsets(); + // copy xref table + document.addXRefTable(xrefTrailerResolver.getXrefTable()); + + // remember the highest XRef object number to avoid it being reused in incremental saving + Optional<Long> maxValue = document.getXrefTable().keySet().stream() // + .map(COSObjectKey::getNumber) // + .reduce(Long::max); + document.setHighestXRefObjectNumber(maxValue.isPresent() ? maxValue.get() : 0); + + return trailer; + } + + /** + * This will parse the trailer from the stream and add it to the state. + * + * @return false on parsing error + * @throws IOException If an IO error occurs. + */ + private boolean parseTrailer() throws IOException + { + // parse the last trailer. + long trailerOffset = source.getPosition(); + // PDFBOX-1739 skip extra xref entries in RegisSTAR documents + int nextCharacter = source.peek(); + while (nextCharacter != 't' && COSParser.isDigit(nextCharacter)) + { + if (source.getPosition() == trailerOffset) + { + // warn only the first time + LOG.warn("Expected trailer object at offset {}, keep trying", trailerOffset); + } + parser.readLine(); + nextCharacter = source.peek(); + } + if (source.peek() != 't') + { + return false; + } + // read "trailer" + long currentOffset = source.getPosition(); + String nextLine = parser.readLine(); + if (!nextLine.trim().equals("trailer")) + { + // in some cases the EOL is missing and the trailer immediately + // continues with "<<" or with a blank character + // even if this does not comply with PDF reference we want to support as many PDFs as possible + // Acrobat reader can also deal with this. + if (nextLine.startsWith("trailer")) + { + // we can't just unread a portion of the read data as we don't know if the EOL consist of 1 or 2 bytes + int len = "trailer".length(); + // jump back right after "trailer" + source.seek(currentOffset + len); + } + else + { + return false; + } + } + + // in some cases the EOL is missing and the trailer continues with " <<" + // even if this does not comply with PDF reference we want to support as many PDFs as possible + // Acrobat reader can also deal with this. + parser.skipSpaces(); + + COSDictionary parsedTrailer = parser.parseCOSDictionary(true); + xrefTrailerResolver.setTrailer(parsedTrailer); + + parser.skipSpaces(); + return true; + } + + /** + * Parses an xref object stream starting with indirect object id. + * + * @return value of PREV item in dictionary or <code>-1</code> if no such item exists + */ + private long parseXrefObjStream(long objByteOffset, boolean isStandalone) throws IOException + { + // ---- parse indirect object head + parser.readObjectNumber(); + parser.readGenerationNumber(); + parser.readObjectMarker(); + + COSDictionary dict = parser.parseCOSDictionary(false); + try (COSStream xrefStream = parser.parseCOSStream(dict)) + { + // the cross reference stream of a hybrid xref table will be added to the existing one + // and we must not override the offset and the trailer + if ( isStandalone ) + { + xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM ); + xrefTrailerResolver.setTrailer(xrefStream); + } + PDFXrefStreamParser parser = new PDFXrefStreamParser(xrefStream, document); + parser.parse(xrefTrailerResolver); + } + + return dict.getLong(COSName.PREV); + } + + /** + * Check if the cross reference table/stream can be found at the current offset. + * + * @param startXRefOffset + * @return the revised offset + * @throws IOException + */ + private long checkXRefOffset(long startXRefOffset) throws IOException + { + source.seek(startXRefOffset); + parser.skipSpaces(); + if (parser.isString(XREF_TABLE)) + { + return startXRefOffset; + } + if (startXRefOffset > 0) + { + if (checkXRefStreamOffset(startXRefOffset)) + { + return startXRefOffset; + } + else + { + return calculateXRefFixedOffset(startXRefOffset); + } + } + // can't find a valid offset + return -1; + } + + /** + * Try to find a fixed offset for the given xref table/stream. + * + * @param objectOffset the given offset where to look at + * @return the fixed offset + * + * @throws IOException if something went wrong + */ + private long calculateXRefFixedOffset(long objectOffset) throws IOException + { + if (objectOffset < 0) + { + LOG.error("Invalid object offset {} when searching for a xref table/stream", + objectOffset); + return 0; + } + // search for the offset of the given xref table/stream among those found by a brute force search. + long newOffset = parser.getBruteForceParser().bfSearchForXRef(objectOffset); + if (newOffset > -1) + { + LOG.debug("Fixed reference for xref table/stream {} -> {}", objectOffset, newOffset); + return newOffset; + } + LOG.error("Can't find the object xref table/stream at offset {}", objectOffset); + return 0; + } + + /** + * Check if the cross reference stream can be found at the current offset. + * + * @param startXRefOffset the expected start offset of the XRef stream + * @return the revised offset + * @throws IOException if something went wrong + */ + private boolean checkXRefStreamOffset(long startXRefOffset) throws IOException + { + if (startXRefOffset == 0) + { + return true; + } + // seek to offset-1 + source.seek(startXRefOffset - 1); + int nextValue = source.read(); + // the first character has to be a whitespace, and then a digit + if (COSParser.isWhitespace(nextValue)) + { + parser.skipSpaces(); + if (parser.isDigit()) + { + try + { + // it's a XRef stream + parser.readObjectNumber(); + parser.readGenerationNumber(); + parser.readObjectMarker(); + // check the dictionary to avoid false positives + COSDictionary dict = parser.parseCOSDictionary(false); + source.seek(startXRefOffset); + if ("XRef".equals(dict.getNameAsString(COSName.TYPE))) + { + return true; + } + } + catch (IOException exception) + { + // there wasn't an object of a xref stream + LOG.debug("No Xref stream at given location {}", startXRefOffset, exception); + source.seek(startXRefOffset); + } + } + } + return false; + } + + private boolean validateXrefOffsets(Map<COSObjectKey, Long> xrefOffset) throws IOException + { + if (xrefOffset == null) + { + return true; + } + Map<COSObjectKey, COSObjectKey> correctedKeys = new HashMap<>(); + HashSet<COSObjectKey> validKeys = new HashSet<>(); + for (Entry<COSObjectKey, Long> objectEntry : xrefOffset.entrySet()) + { + COSObjectKey objectKey = objectEntry.getKey(); + Long objectOffset = objectEntry.getValue(); + // a negative offset number represents an object number itself + // see type 2 entry in xref stream + if (objectOffset != null && objectOffset >= 0) + { + COSObjectKey foundObjectKey = findObjectKey(objectKey, objectOffset, xrefOffset); + if (foundObjectKey == null) + { + LOG.debug( + "Stop checking xref offsets as at least one ({}) couldn't be dereferenced", + objectKey); + return false; + } + else if (foundObjectKey != objectKey) + { + // Generation was fixed - need to update map later, after iteration + correctedKeys.put(objectKey, foundObjectKey); + } + else + { + validKeys.add(objectKey); + } + } + } + Map<COSObjectKey, Long> correctedPointers = new HashMap<>(); + for (Entry<COSObjectKey, COSObjectKey> correctedKeyEntry : correctedKeys.entrySet()) + { + if (!validKeys.contains(correctedKeyEntry.getValue())) + { + // Only replace entries, if the original entry does not point to a valid object + correctedPointers.put(correctedKeyEntry.getValue(), + xrefOffset.get(correctedKeyEntry.getKey())); + } + } + // remove old invalid, as some might not be replaced + correctedKeys.forEach((key, value) -> xrefOffset.remove(key)); + xrefOffset.putAll(correctedPointers); + return true; + } + + /** + * Check the XRef table by dereferencing all objects and fixing the offset if necessary. + * + * @throws IOException if something went wrong. + */ + private void checkXrefOffsets() throws IOException + { + Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable(); + if (!validateXrefOffsets(xrefOffset)) + { + Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = parser.getBruteForceParser() + .getBFCOSObjectOffsets(); + if (!bfCOSObjectKeyOffsets.isEmpty()) + { + LOG.debug("Replaced read xref table with the results of a brute force search"); + xrefOffset.clear(); + xrefOffset.putAll(bfCOSObjectKeyOffsets); + } + } + } + + /** + * Check if the given object can be found at the given offset. Returns the provided object key if everything is ok. + * If the generation number differs it will be fixed and a new object key is returned. + * + * @param objectKey the key of object we are looking for + * @param offset the offset where to look + * @param xrefOffset a map with with all known xref entries + * @return returns the found/fixed object key + * + * @throws IOException if something went wrong + */ + private COSObjectKey findObjectKey(COSObjectKey objectKey, long offset, + Map<COSObjectKey, Long> xrefOffset) throws IOException + { + // there can't be any object at the very beginning of a pdf + if (offset < COSParser.MINIMUM_SEARCH_OFFSET) + { + return null; + } + try + { + source.seek(offset); + parser.skipWhiteSpaces(); + if (source.getPosition() == offset) + { + // ensure that at least one whitespace is skipped in front of the object number + source.seek(offset - 1); + if (source.getPosition() < offset) + { + if (!parser.isDigit()) + { + // anything else but a digit may be some garbage of the previous object -> just ignore it + source.read(); + } + else + { + long current = source.getPosition(); + source.seek(--current); + while (parser.isDigit()) + source.seek(--current); + long newObjNr = parser.readObjectNumber(); + int newGenNr = parser.readGenerationNumber(); + COSObjectKey newObjKey = new COSObjectKey(newObjNr, newGenNr); + Long existingOffset = xrefOffset.get(newObjKey); + // the found object number belongs to another uncompressed object at the same or nearby offset + // something has to be wrong + if (existingOffset != null && existingOffset > 0 + && Math.abs(offset - existingOffset) < 10) + { + LOG.debug("Found the object {} instead of {} at offset {} - ignoring", + newObjKey, objectKey, offset); + return null; + } + // something seems to be wrong but it's hard to determine what exactly -> simply continue + source.seek(offset); + } + } + } + // try to read the given object/generation number + long foundObjectNumber = parser.readObjectNumber(); + if (objectKey.getNumber() != foundObjectNumber) + { + LOG.warn("found wrong object number. expected [{}] found [{}]", + objectKey.getNumber(), foundObjectNumber); + objectKey = new COSObjectKey(foundObjectNumber, objectKey.getGeneration()); + } + + int genNumber = parser.readGenerationNumber(); + // finally try to read the object marker + parser.readObjectMarker(); + if (genNumber == objectKey.getGeneration()) + { + return objectKey; + } + else if (genNumber > objectKey.getGeneration()) + { + return new COSObjectKey(objectKey.getNumber(), genNumber); + } + } + catch (IOException exception) + { + // Swallow the exception, obviously there isn't any valid object number + LOG.debug("No valid object at given location {} - ignoring", offset, exception); + } + return null; + } + + /** + * This will parse the startxref section from the stream. The startxref value is ignored. + * + * @return the startxref value or -1 on parsing error + * @throws IOException If an IO error occurs. + */ + private long parseStartXref() throws IOException + { + long startXref = -1; + if (parser.isString(STARTXREF)) + { + parser.readString(); + parser.skipSpaces(); + // This integer is the byte offset of the first object referenced by the xref or xref stream + startXref = parser.readLong(); + } + return startXref; + } + + /** + * This will parse the xref table from the stream and add it to the state + * The XrefTable contents are ignored. + * @param startByteOffset the offset to start at + * @return false on parsing error + * @throws IOException If an IO error occurs. + */ + private boolean parseXrefTable(long startByteOffset) throws IOException + { + if (source.peek() != 'x') + { + return false; + } + String xref = parser.readString(); + if( !xref.trim().equals( "xref" ) ) + { + return false; + } + + // check for trailer after xref + String str = parser.readString(); + byte[] b = str.getBytes(StandardCharsets.ISO_8859_1); + source.seek(source.getPosition() - b.length); + + // signal start of new XRef + xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE ); + + if (str.startsWith("trailer")) + { + LOG.warn("skipping empty xref table"); + return false; + } + + // Xref tables can have multiple sections. Each starts with a starting object id and a count. + while(true) + { + String currentLine = parser.readLine(); + String[] splitString = currentLine.split("\\s"); + if (splitString.length != 2) + { + LOG.warn("Unexpected XRefTable Entry: {}", currentLine); + return false; + } + // first obj id + long currObjID; + try + { + currObjID = Long.parseLong(splitString[0]); + } + catch (NumberFormatException exception) + { + LOG.warn("XRefTable: invalid ID for the first object: {}", currentLine); + return false; + } + + // the number of objects in the xref table + int count = 0; + try + { + count = Integer.parseInt(splitString[1]); + } + catch (NumberFormatException exception) + { + LOG.warn("XRefTable: invalid number of objects: {}", currentLine); + return false; + } + + parser.skipSpaces(); + for(int i = 0; i < count; i++) + { + if (parser.isEOF() ) + { + break; + } + int nextChar = source.peek(); + if (nextChar == 't' || COSParser.isEndOfName(nextChar)) + { + break; + } + //Ignore table contents + currentLine = parser.readLine(); + splitString = currentLine.split("\\s"); + if (splitString.length < 3) + { + LOG.warn("invalid xref line: {}", currentLine); + break; + } + /* This supports the corrupt table as reported in + * PDFBOX-474 (XXXX XXX XX n) */ + if(splitString[splitString.length-1].equals("n")) + { + try + { + long currOffset = Long.parseLong(splitString[0]); + // skip 0 offsets + if (currOffset > 0) + { + int currGenID = Integer.parseInt(splitString[1]); + COSObjectKey objKey = new COSObjectKey(currObjID, currGenID); + xrefTrailerResolver.setXRef(objKey, currOffset); + } + } + catch (IllegalArgumentException e) + { + throw new IOException(e); + } + } + else if(!splitString[2].equals("f")) + { + throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID); + } + currObjID++; + parser.skipSpaces(); + } + parser.skipSpaces(); + if (!parser.isDigit()) + { + break; + } + } + return true; + } + +} Propchange: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/XrefParser.java ------------------------------------------------------------------------------ svn:eol-style = native