Author: lehmi
Date: Sun Aug 6 14:27:31 2017
New Revision: 1804236
URL: http://svn.apache.org/viewvc?rev=1804236&view=rev
Log: PDFBOX-3888: minimize the direct usage of xrefTrailerResolver and bfSearchCOSObjectKeyOffsets
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1804236&r1=1804235&r2=1804236&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Sun Aug 6 14:27:31 2017 @@ -16,6 +16,8 @@ */ package org.apache.pdfbox.pdfparser; +import static org.apache.pdfbox.util.Charsets.ISO_8859_1; + import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; @@ -32,6 +34,7 @@ import java.util.Queue; import java.util.Set; import java.util.TreeMap; import java.util.Vector; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.cos.COSArray; @@ -48,9 +51,6 @@ import org.apache.pdfbox.io.RandomAccess import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType; import org.apache.pdfbox.pdmodel.encryption.SecurityHandler; - -import static org.apache.pdfbox.util.Charsets.ISO_8859_1; - /** * PDF-Parser which first reads startxref and xref tables in order to know valid objects and parse only these objects. 
* @@ -580,9 +580,19 @@ public class COSParser extends BaseParse if (!parsedObjects.contains(objId)) { - Long fileOffset = xrefTrailerResolver.getXrefTable().get(objKey); - // it is allowed that object references point to null, - // thus we have to test + Long fileOffset = document.getXrefTable().get(objKey); + if (fileOffset == null && isLenient) + { + Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets(); + fileOffset = bfCOSObjectKeyOffsets.get(objKey); + if (fileOffset != null) + { + LOG.debug("Set missing " + fileOffset + " for object " + objKey); + document.getXrefTable().put(objKey, fileOffset); + } + } + + // it is allowed that object references point to null, thus we have to test if (fileOffset != null && fileOffset != 0) { if (fileOffset > 0) @@ -594,7 +604,8 @@ public class COSParser extends BaseParse // negative offset means we have a compressed // object within object stream; // get offset of object stream - fileOffset = xrefTrailerResolver.getXrefTable().get( + fileOffset = document.getXrefTable() + .get( new COSObjectKey((int)-fileOffset, 0)); if ((fileOffset == null) || (fileOffset <= 0)) { @@ -700,7 +711,7 @@ public class COSParser extends BaseParse { // not previously parsed // ---- read offset or object stream object number from xref table - Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey); + Long offsetOrObjstmObNr = document.getXrefTable().get(objKey); // sanity test to circumvent loops with broken documents if (requireExistingNotCompressedObj @@ -711,24 +722,14 @@ public class COSParser extends BaseParse } // maybe something is wrong with the xref table -> perform brute force search for all objects - if (offsetOrObjstmObNr == null && isLenient && bfSearchCOSObjectKeyOffsets == null) + if (offsetOrObjstmObNr == null && isLenient) { - bfSearchForObjects(); - if (bfSearchCOSObjectKeyOffsets != null && !bfSearchCOSObjectKeyOffsets.isEmpty()) + Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = 
getBFCOSObjectOffsets(); + offsetOrObjstmObNr = bfCOSObjectKeyOffsets.get(objKey); + if (offsetOrObjstmObNr != null) { - LOG.debug("Add all new read objects from brute force search to the xref table"); - Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable(); - final Set<Map.Entry<COSObjectKey, Long>> entries = bfSearchCOSObjectKeyOffsets.entrySet(); - for (Entry<COSObjectKey, Long> entry : entries) - { - COSObjectKey key = entry.getKey(); - // add all missing objects to the xref table - if (!xrefOffset.containsKey(key)) - { - xrefOffset.put(key, entry.getValue()); - } - } - offsetOrObjstmObNr = xrefOffset.get(objKey); + LOG.debug("Set missing offset " + offsetOrObjstmObNr + " for object " + objKey); + document.getXrefTable().put(objKey, offsetOrObjstmObNr); } } @@ -879,7 +880,7 @@ public class COSParser extends BaseParse for (COSObject next : parser.getObjects()) { COSObjectKey stmObjKey = new COSObjectKey(next); - Long offset = xrefTrailerResolver.getXrefTable().get(stmObjKey); + Long offset = document.getXrefTable().get(stmObjKey); if (offset != null && offset == -objstmObjNr) { COSObject stmObj = document.getObjectFromPool(stmObjKey); @@ -1317,8 +1318,8 @@ public class COSParser extends BaseParse Map<COSObjectKey, Long> xrefOffset = xrefTrailerResolver.getXrefTable(); if (!validateXrefOffsets(xrefOffset)) { - bfSearchForObjects(); - if (bfSearchCOSObjectKeyOffsets != null && !bfSearchCOSObjectKeyOffsets.isEmpty()) + Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets(); + if (!bfCOSObjectKeyOffsets.isEmpty()) { List<COSObjectKey> objStreams = new ArrayList<>(); // find all object streams @@ -1339,7 +1340,7 @@ public class COSParser extends BaseParse { for (COSObjectKey key : objStreams) { - if (bfSearchCOSObjectKeyOffsets.containsKey(key)) + if (bfCOSObjectKeyOffsets.containsKey(key)) { // remove all parsed objects which are part of an object stream Set<Long> objects = xrefTrailerResolver @@ -1347,11 +1348,11 @@ public class 
COSParser extends BaseParse for (Long objNr : objects) { COSObjectKey streamObjectKey = new COSObjectKey(objNr, 0); - Long streamObjectOffset = bfSearchCOSObjectKeyOffsets + Long streamObjectOffset = bfCOSObjectKeyOffsets .get(streamObjectKey); if (streamObjectOffset != null && streamObjectOffset > 0) { - bfSearchCOSObjectKeyOffsets.remove(streamObjectKey); + bfCOSObjectKeyOffsets.remove(streamObjectKey); } } } @@ -1368,7 +1369,7 @@ public class COSParser extends BaseParse } } LOG.debug("Replaced read xref table with the results of a brute force search"); - xrefOffset.putAll(bfSearchCOSObjectKeyOffsets); + xrefOffset.putAll(bfCOSObjectKeyOffsets); } } } @@ -1425,6 +1426,15 @@ public class COSParser extends BaseParse return Long.toString(objectID) + " " + Integer.toString(genID) + " obj"; } + private Map<COSObjectKey, Long> getBFCOSObjectOffsets() throws IOException + { + if (bfSearchCOSObjectKeyOffsets == null) + { + bfSearchForObjects(); + } + return bfSearchCOSObjectKeyOffsets; + } + /** * Brute force search for every object in the pdf. 
* @@ -1432,74 +1442,69 @@ public class COSParser extends BaseParse */ private void bfSearchForObjects() throws IOException { - if (bfSearchCOSObjectKeyOffsets == null) - { - bfSearchForLastEOFMarker(); - bfSearchCOSObjectKeyOffsets = new HashMap<>(); - long originOffset = source.getPosition(); - long currentOffset = MINIMUM_SEARCH_OFFSET; - long lastObjectId = Long.MIN_VALUE; - int lastGenID = Integer.MIN_VALUE; - long lastObjOffset = Long.MIN_VALUE; - String objString = " obj"; - char[] string = objString.toCharArray(); - do - { - source.seek(currentOffset); - if (isString(string)) + bfSearchForLastEOFMarker(); + bfSearchCOSObjectKeyOffsets = new HashMap<>(); + long originOffset = source.getPosition(); + long currentOffset = MINIMUM_SEARCH_OFFSET; + long lastObjectId = Long.MIN_VALUE; + int lastGenID = Integer.MIN_VALUE; + long lastObjOffset = Long.MIN_VALUE; + String objString = " obj"; + char[] string = objString.toCharArray(); + do + { + source.seek(currentOffset); + if (isString(string)) + { + long tempOffset = currentOffset - 1; + source.seek(tempOffset); + int genID = source.peek(); + // is the next char a digit? + if (isDigit(genID)) { - long tempOffset = currentOffset - 1; + genID -= 48; + tempOffset--; source.seek(tempOffset); - int genID = source.peek(); - // is the next char a digit? 
- if (isDigit(genID)) - { - genID -= 48; - tempOffset--; - source.seek(tempOffset); - if (isSpace()) + if (isSpace()) + { + while (tempOffset > MINIMUM_SEARCH_OFFSET && isSpace()) { - while (tempOffset > MINIMUM_SEARCH_OFFSET && isSpace()) - { - source.seek(--tempOffset); - } - boolean objectIDFound = false; - while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) - { - source.seek(--tempOffset); - objectIDFound = true; - } - if (objectIDFound) + source.seek(--tempOffset); + } + boolean objectIDFound = false; + while (tempOffset > MINIMUM_SEARCH_OFFSET && isDigit()) + { + source.seek(--tempOffset); + objectIDFound = true; + } + if (objectIDFound) + { + source.read(); + long objectId = readObjectNumber(); + if (lastObjOffset > 0) { - source.read(); - long objectId = readObjectNumber(); - if (lastObjOffset > 0) - { - // add the former object ID only if there was a subsequent object ID - bfSearchCOSObjectKeyOffsets - .put(new COSObjectKey(lastObjectId, lastGenID), - lastObjOffset); - } - lastObjectId = objectId; - lastGenID = genID; - lastObjOffset = tempOffset + 1; + // add the former object ID only if there was a subsequent object ID + bfSearchCOSObjectKeyOffsets.put( + new COSObjectKey(lastObjectId, lastGenID), lastObjOffset); } + lastObjectId = objectId; + lastGenID = genID; + lastObjOffset = tempOffset + 1; } } } - currentOffset++; } - while (currentOffset < lastEOFMarker && !source.isEOF()); - if (lastEOFMarker < Long.MAX_VALUE && lastObjOffset > 0) - { - // if the pdf wasn't cut off in the middle the last object id has to added here - // so that it can't get lost as there isn't any subsequent object id - bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID), - lastObjOffset); - } - // reestablish origin position - source.seek(originOffset); + currentOffset++; + } while (currentOffset < lastEOFMarker && !source.isEOF()); + if (lastEOFMarker < Long.MAX_VALUE && lastObjOffset > 0) + { + // if the pdf wasn't cut off in the middle the last 
object id has to added here + // so that it can't get lost as there isn't any subsequent object id + bfSearchCOSObjectKeyOffsets.put(new COSObjectKey(lastObjectId, lastGenID), + lastObjOffset); } + // reestablish origin position + source.seek(originOffset); } /** @@ -1775,58 +1780,56 @@ public class COSParser extends BaseParse protected final COSDictionary rebuildTrailer() throws IOException { COSDictionary trailer = null; - bfSearchForObjects(); - if (bfSearchCOSObjectKeyOffsets != null) + Map<COSObjectKey, Long> bfCOSObjectKeyOffsets = getBFCOSObjectOffsets(); + // reset trailer resolver + xrefTrailerResolver.reset(); + // use the found objects to rebuild the trailer resolver + xrefTrailerResolver.nextXrefObj(0, XRefType.TABLE); + for (Entry<COSObjectKey, Long> entry : bfCOSObjectKeyOffsets.entrySet()) + { + xrefTrailerResolver.setXRef(entry.getKey(), entry.getValue()); + } + xrefTrailerResolver.setStartxref(0); + trailer = xrefTrailerResolver.getTrailer(); + getDocument().setTrailer(trailer); + // search for the different parts of the trailer dictionary + for (Entry<COSObjectKey, Long> entry : bfCOSObjectKeyOffsets.entrySet()) { - // reset trailer resolver - xrefTrailerResolver.reset(); - // use the found objects to rebuild the trailer resolver - xrefTrailerResolver.nextXrefObj(0, XRefType.TABLE); - for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet()) - { - xrefTrailerResolver.setXRef(entry.getKey(), entry.getValue()); - } - xrefTrailerResolver.setStartxref(0); - trailer = xrefTrailerResolver.getTrailer(); - getDocument().setTrailer(trailer); - // search for the different parts of the trailer dictionary - for (Entry<COSObjectKey, Long> entry : bfSearchCOSObjectKeyOffsets.entrySet()) - { - Long offset = entry.getValue(); - source.seek(offset); - readObjectNumber(); - readGenerationNumber(); - readExpectedString(OBJ_MARKER, true); - try + Long offset = entry.getValue(); + source.seek(offset); + readObjectNumber(); + 
readGenerationNumber(); + readExpectedString(OBJ_MARKER, true); + try + { + if (source.peek() != '<') { - if (source.peek() != '<') - { - continue; - } - COSDictionary dictionary = parseCOSDictionary(); - // document catalog - if (isCatalog(dictionary)) - { - trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey())); - } - // info dictionary - else if (dictionary.containsKey(COSName.MOD_DATE) && - (dictionary.containsKey(COSName.TITLE) - || dictionary.containsKey(COSName.AUTHOR) - || dictionary.containsKey(COSName.SUBJECT) - || dictionary.containsKey(COSName.KEYWORDS) - || dictionary.containsKey(COSName.CREATOR) - || dictionary.containsKey(COSName.PRODUCER) - || dictionary.containsKey(COSName.CREATION_DATE))) - { - trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey())); - } - // TODO encryption dictionary + continue; } - catch(IOException exception) + COSDictionary dictionary = parseCOSDictionary(); + // document catalog + if (isCatalog(dictionary)) { - LOG.debug("Skipped object " + entry.getKey() + ", either it's corrupt or not a dictionary"); + trailer.setItem(COSName.ROOT, document.getObjectFromPool(entry.getKey())); } + // info dictionary + else if (dictionary.containsKey(COSName.MOD_DATE) + && (dictionary.containsKey(COSName.TITLE) + || dictionary.containsKey(COSName.AUTHOR) + || dictionary.containsKey(COSName.SUBJECT) + || dictionary.containsKey(COSName.KEYWORDS) + || dictionary.containsKey(COSName.CREATOR) + || dictionary.containsKey(COSName.PRODUCER) + || dictionary.containsKey(COSName.CREATION_DATE))) + { + trailer.setItem(COSName.INFO, document.getObjectFromPool(entry.getKey())); + } + // TODO encryption dictionary + } + catch (IOException exception) + { + LOG.debug("Skipped object " + entry.getKey() + + ", either it's corrupt or not a dictionary"); } } return trailer; Modified: pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java URL: 
http://svn.apache.org/viewvc/pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java?rev=1804236&r1=1804235&r2=1804236&view=diff ============================================================================== --- pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java (original) +++ pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java Sun Aug 6 14:27:31 2017 @@ -722,7 +722,7 @@ public class PreflightParser extends PDF { // not previously parsed // ---- read offset or object stream object number from xref table - Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get(objKey); + Long offsetOrObjstmObNr = document.getXrefTable().get(objKey); // sanity test to circumvent loops with broken documents if (requireExistingNotCompressedObj && ((offsetOrObjstmObNr == null))) @@ -879,7 +879,7 @@ public class PreflightParser extends PDF for (COSObject next : parser.getObjects()) { COSObjectKey stmObjKey = new COSObjectKey(next); - Long offset = xrefTrailerResolver.getXrefTable().get(stmObjKey); + Long offset = document.getXrefTable().get(stmObjKey); if (offset != null && offset == -objstmObjNr) { COSObject stmObj = document.getObjectFromPool(stmObjKey);