Author: lehmi Date: Sun Oct 22 17:04:37 2017 New Revision: 1812933 URL: http://svn.apache.org/viewvc?rev=1812933&view=rev Log: PDFBOX-3957: search for valid trailer entries when rebuilding the trailer
Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Modified: pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java URL: http://svn.apache.org/viewvc/pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1812933&r1=1812932&r2=1812933&view=diff ============================================================================== --- pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original) +++ pdfbox/branches/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Oct 22 17:04:37 2017 @@ -113,6 +113,11 @@ public class COSParser extends BaseParse protected static final char[] OBJ_MARKER = new char[] { 'o', 'b', 'j' }; /** + * trailer-marker. + */ + private static final char[] TRAILER_MARKER = new char[] { 't', 'r', 'a', 'i', 'l', 'e', 'r' }; + + /** * ObjStream-marker. */ private static final char[] OBJ_STREAM = new char[] { '/', 'O', 'b', 'j', 'S', 't', 'm' }; @@ -1618,6 +1623,75 @@ public class COSParser extends BaseParse } /** + * Brute force search for all trailer marker. + * + * @throws IOException if something went wrong + */ + private List<COSObjectKey[]> bfSearchForTrailer() throws IOException + { + List<COSObjectKey[]> trailerDicts = new ArrayList<COSObjectKey[]>(); + long originOffset = source.getPosition(); + source.seek(MINIMUM_SEARCH_OFFSET); + while (!source.isEOF()) + { + // search for trailer marker + if (isString(TRAILER_MARKER)) + { + source.seek(source.getPosition() + TRAILER_MARKER.length); + try + { + skipSpaces(); + COSDictionary trailerDict = parseCOSDictionary(); + COSObjectKey[] trailerKeys = new COSObjectKey[2]; + if (trailerDict.containsKey(COSName.ROOT)) + { + COSBase rootObj = trailerDict.getItem(COSName.ROOT); + if (rootObj instanceof COSObject) + { + long objNumber = ((COSObject) rootObj).getObjectNumber(); + int genNumber = ((COSObject) rootObj).getGenerationNumber(); + trailerKeys[0] = new COSObjectKey(objNumber, genNumber); + } + } + if (trailerDict.containsKey(COSName.INFO)) + { + COSBase infoObj = trailerDict.getItem(COSName.INFO); + long objNumber = ((COSObject) infoObj).getObjectNumber(); + int genNumber = ((COSObject) infoObj).getGenerationNumber(); + trailerKeys[1] = new COSObjectKey(objNumber, genNumber); + } + if (trailerKeys[0] != null || trailerKeys[1] != null) + { + trailerDicts.add(trailerKeys); + } + } + catch (IOException exception) + { + continue; + } + } + source.read(); + } + source.seek(originOffset); + // eliminate double entries + int trailerdictsSize = trailerDicts.size(); + if (trailerdictsSize > 1) + { + COSObjectKey[] first = trailerDicts.get(0); + for (int i = trailerdictsSize - 1; i > 0; i--) + { + COSObjectKey[] other = trailerDicts.get(i); + if (first[0].equals(other[0]) && first[1].equals(other[1])) + { + trailerDicts.remove(other); + } + } + + } + return trailerDicts; + } + + /** * Brute force search for the last EOF marker. * * @throws IOException if something went wrong @@ -1966,78 +2040,100 @@ public class COSParser extends BaseParse xrefTrailerResolver.setStartxref(0); trailer = xrefTrailerResolver.getTrailer(); getDocument().setTrailer(trailer); - // search for the different parts of the trailer dictionary - for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet()) + List<COSObjectKey[]> trailerObjects = bfSearchForTrailer(); + if (trailerObjects.size() == 1) { - Long offset = entry.getValue(); - COSDictionary dictionary = null; - // handle compressed objects - if (offset < 0) + COSObjectKey[] trailerObj = trailerObjects.get(0); + COSObjectKey rootKey = trailerObj[0]; + Long rootOffset = rootKey != null ? bfSearchCOSObjectKeyOffsets.get(rootKey) : null; + COSObjectKey infoKey = trailerObj[1]; + Long infoOffset = infoKey != null ? bfSearchCOSObjectKeyOffsets.get(infoKey) : null; + if (rootKey != null && rootOffset != null) { - COSObject compressedObject = document.getObjectFromPool(entry.getKey()); - if (compressedObject.getObject() == null) + COSDictionary rootDict = retrieveCOSDictionary(rootKey, rootOffset); + if (rootDict != null && isCatalog(rootDict)) { - parseObjectStream((int) -offset); + trailer.setItem(COSName.ROOT, document.getObjectFromPool(rootKey)); } - COSBase baseObject = compressedObject.getObject(); - if (baseObject instanceof COSDictionary) - { - dictionary = (COSDictionary) baseObject; - } - else + } + if (infoKey != null && infoOffset != null) + { + COSDictionary infoDict = retrieveCOSDictionary(infoKey, infoOffset); + if (infoDict != null && isInfo(infoDict)) { - continue; + trailer.setItem(COSName.INFO, document.getObjectFromPool(infoKey)); } } - else + } + else + { + // search for the different parts of the trailer dictionary + for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet()) { - source.seek(offset); - readObjectNumber(); - readGenerationNumber(); - readExpectedString(OBJ_MARKER, true); - if (source.peek() != '<') + COSDictionary dictionary = retrieveCOSDictionary(entry.getKey(), + entry.getValue()); + if (dictionary == null) { continue; } - try + // document catalog + if (isCatalog(dictionary)) { - dictionary = parseCOSDictionary(); + trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey())); } - catch (IOException exception) + // info dictionary + else if (isInfo(dictionary)) { - LOG.debug("Skipped object " + entry.getKey() - + ", either it's corrupt or not a dictionary"); - continue; + trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey())); } + // encryption dictionary, if existing, is lost + // We can't run "Algorithm 2" from PDF specification because of missing ID } - // document catalog - if (isCatalog(dictionary)) - { - trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey())); - } - // info dictionary - else if (!dictionary.containsKey(COSName.PARENT) - && !dictionary.containsKey(COSName.A) - && !dictionary.containsKey(COSName.DEST) - && (dictionary.containsKey(COSName.MOD_DATE) - || dictionary.containsKey(COSName.TITLE) - || dictionary.containsKey(COSName.AUTHOR) - || dictionary.containsKey(COSName.SUBJECT) - || dictionary.containsKey(COSName.KEYWORDS) - || dictionary.containsKey(COSName.CREATOR) - || dictionary.containsKey(COSName.PRODUCER) - || dictionary.containsKey(COSName.CREATION_DATE))) - { - trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey())); - } - // encryption dictionary, if existing, is lost - // We can't run "Algorithm 2" from PDF specification because of missing ID } } trailerWasRebuild = true; return trailer; } + private COSDictionary retrieveCOSDictionary(COSObjectKey key, Long offset) throws IOException + { + COSDictionary dictionary = null; + // handle compressed objects + if (offset < 0) + { + COSObject compressedObject = document.getObjectFromPool(key); + if (compressedObject.getObject() == null) + { + parseObjectStream((int) -offset); + } + COSBase baseObject = compressedObject.getObject(); + if (baseObject instanceof COSDictionary) + { + dictionary = (COSDictionary) baseObject; + } + } + else + { + source.seek(offset); + readObjectNumber(); + readGenerationNumber(); + readExpectedString(OBJ_MARKER, true); + if (source.peek() != '<') + { + return null; + } + try + { + dictionary = parseCOSDictionary(); + } + catch (IOException exception) + { + LOG.debug("Skipped object " + key + + ", either it's corrupt or not a dictionary"); + } + } + return dictionary; + } /** * Check if all entries of the pages dictionary are present. Those which can't be dereferenced are removed. * @@ -2101,7 +2197,7 @@ public class COSParser extends BaseParse * Tell if the dictionary is a PDF catalog. Override this for an FDF catalog. * * @param dictionary - * @return + * @return true if the given dictionary is a root dictionary */ protected boolean isCatalog(COSDictionary dictionary) { @@ -2109,8 +2205,32 @@ public class COSParser extends BaseParse } /** - * This will parse the startxref section from the stream. - * The startxref value is ignored. + * Tell if the dictionary is an info dictionary. + * + * @param dictionary + * @return true if the given dictionary is an info dictionary + */ + private boolean isInfo(COSDictionary dictionary) + { + if (dictionary.containsKey(COSName.PARENT) || dictionary.containsKey(COSName.A) || dictionary.containsKey(COSName.DEST)) + { + return false; + } + if (!dictionary.containsKey(COSName.MOD_DATE) && !dictionary.containsKey(COSName.TITLE) + && !dictionary.containsKey(COSName.AUTHOR) + && !dictionary.containsKey(COSName.SUBJECT) + && !dictionary.containsKey(COSName.KEYWORDS) + && !dictionary.containsKey(COSName.CREATOR) + && !dictionary.containsKey(COSName.PRODUCER) + && !dictionary.containsKey(COSName.CREATION_DATE)) + { + return false; + } + return true; + } + + /** + * This will parse the startxref section from the stream. The startxref value is ignored. * * @return the startxref value or -1 on parsing error * @throws IOException If an IO error occurs.