Author: lehmi Date: Fri Jan 6 13:13:17 2023 New Revision: 1906422 URL: http://svn.apache.org/viewvc?rev=1906422&view=rev Log: PDFBOX-5178: use index value to choose correct object if the object numbers within an object stream are not unique
Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java?rev=1906422&r1=1906421&r2=1906422&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/COSParser.java Fri Jan 6 13:13:17 2023 @@ -129,7 +129,7 @@ public class COSParser extends BaseParse * Intermediate cache. Contains all objects of already read compressed object streams. Objects are removed after * dereferencing them. */ - private final Map<Long, Map<Long, COSBase>> decompressedObjects = new HashMap<>(); + private final Map<Long, Map<COSObjectKey, COSBase>> decompressedObjects = new HashMap<>(); /** * The security handler. @@ -765,11 +765,10 @@ public class COSParser extends BaseParse */ protected COSBase parseObjectStreamObject(long objstmObjNr, COSObjectKey key) throws IOException { - Map<Long, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr, + Map<COSObjectKey, COSBase> streamObjects = decompressedObjects.computeIfAbsent(objstmObjNr, n -> new HashMap<>()); // did we already read the compressed object stream? 
- long keyNumber = key.getNumber(); - COSBase objectStreamObject = streamObjects.remove(keyNumber); + COSBase objectStreamObject = streamObjects.remove(key); if (objectStreamObject != null) { return objectStreamObject; @@ -782,18 +781,10 @@ public class COSParser extends BaseParse { PDFObjectStreamParser parser = new PDFObjectStreamParser((COSStream) objstmBaseObj, document); - for (Entry<Long, COSBase> entry : parser.parseAllObjects().entrySet()) - { - Long stmObjNumber = entry.getKey(); - if (keyNumber == stmObjNumber) - { - objectStreamObject = entry.getValue(); - } - else - { - streamObjects.putIfAbsent(stmObjNumber, entry.getValue()); - } - } + Map<COSObjectKey, COSBase> allStreamObjects = parser.parseAllObjects(); + objectStreamObject = allStreamObjects.remove(key); + allStreamObjects.entrySet().stream() + .forEach(e -> streamObjects.putIfAbsent(e.getKey(), e.getValue())); } catch (IOException ex) { Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1906422&r1=1906421&r2=1906422&view=diff ============================================================================== --- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original) +++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Fri Jan 6 13:13:17 2023 @@ -25,6 +25,7 @@ import java.util.TreeMap; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSObjectKey; import org.apache.pdfbox.cos.COSStream; /** @@ -116,19 +117,36 @@ public class PDFObjectStreamParser exten * @return a map containing all parsed objects using the object number as key * @throws IOException if there is an error while parsing the stream */ - public Map<Long, COSBase> parseAllObjects() throws 
IOException + public Map<COSObjectKey, COSBase> parseAllObjects() throws IOException { - Map<Long, COSBase> allObjects = new HashMap<>(); + Map<COSObjectKey, COSBase> allObjects = new HashMap<>(); try { Map<Integer, Long> objectNumbers = privateReadObjectOffsets(); + // count the number of object numbers eliminating double entries + long numberOfObjNumbers = objectNumbers.values().stream().distinct().count(); + // the usage of the index should be restricted to cases where more than one + // object use the same object number. + // there are malformed pdfs in the wild which would lead to false results if + // pdfbox always relies on the index if available. In most cases the object number + // is sufficient to choose the correct object + boolean indexNeeded = objectNumbers.size() > numberOfObjNumbers; long currentPosition = source.getPosition(); if (firstObject > 0 && currentPosition < firstObject) { source.skip(firstObject - (int) currentPosition); } + int index = 0; for (Entry<Integer, Long> entry : objectNumbers.entrySet()) { + COSObjectKey objectKey = getObjectKey(entry.getValue(), 0); + // skip object if the index doesn't match + if (indexNeeded && objectKey.getStreamIndex() > -1 + && objectKey.getStreamIndex() != index) + { + index++; + continue; + } int finalPosition = firstObject + entry.getKey(); currentPosition = source.getPosition(); if (finalPosition > 0 && currentPosition < finalPosition) @@ -141,7 +159,8 @@ public class PDFObjectStreamParser exten { streamObject.setDirect(false); } - allObjects.put(entry.getValue(), streamObject); + allObjects.put(objectKey, streamObject); + index++; } } finally Modified: pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java?rev=1906422&r1=1906421&r2=1906422&view=diff ============================================================================== --- 
pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java (original) +++ pdfbox/trunk/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java Fri Jan 6 13:13:17 2023 @@ -24,8 +24,10 @@ import java.util.Map; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSBoolean; +import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.cos.COSInteger; import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSObjectKey; import org.apache.pdfbox.cos.COSStream; import org.junit.jupiter.api.Test; @@ -64,10 +66,89 @@ class PDFObjectStreamParserTest outputStream.write("6 0 4 5 true false".getBytes()); outputStream.close(); PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, null); - Map<Long, COSBase> objectNumbers = objectStreamParser.parseAllObjects(); + Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects(); assertEquals(2, objectNumbers.size()); - assertEquals(COSBoolean.TRUE, objectNumbers.get(6L)); - assertEquals(COSBoolean.FALSE, objectNumbers.get(4L)); + assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0))); + assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0))); + } + + @Test + void testParseAllObjectsIndexed() throws IOException + { + COSStream stream = new COSStream(); + stream.setItem(COSName.N, COSInteger.THREE); + stream.setItem(COSName.FIRST, COSInteger.get(13)); + OutputStream outputStream = stream.createOutputStream(); + // use object number 4 for two objects + outputStream.write("6 0 4 5 4 11 true false true".getBytes()); + outputStream.close(); + COSDocument cosDoc = new COSDocument(); + Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable(); + // select the second object from the stream for object number 4 by using 2 as value for the index + xrefTable.put(new COSObjectKey(6, 0, 0), -1L); + xrefTable.put(new COSObjectKey(4, 0, 2), -1L); + PDFObjectStreamParser 
objectStreamParser = new PDFObjectStreamParser(stream, cosDoc); + Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects(); + assertEquals(2, objectNumbers.size()); + assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0))); + assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(4, 0))); + + // select the first object from the stream for object number 4 by using 1 as value for the index + // remove the old entry first to be sure it is replaced + xrefTable.remove(new COSObjectKey(4, 0)); + xrefTable.put(new COSObjectKey(4, 0, 1), -1L); + objectStreamParser = new PDFObjectStreamParser(stream, cosDoc); + objectNumbers = objectStreamParser.parseAllObjects(); + assertEquals(2, objectNumbers.size()); + assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0))); + assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0))); + } + + @Test + void testParseAllObjectsSkipMalformedIndex() throws IOException + { + COSStream stream = new COSStream(); + stream.setItem(COSName.N, COSInteger.THREE); + stream.setItem(COSName.FIRST, COSInteger.get(13)); + OutputStream outputStream = stream.createOutputStream(); + outputStream.write("6 0 4 5 5 11 true false true".getBytes()); + outputStream.close(); + COSDocument cosDoc = new COSDocument(); + Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable(); + // add an index for each object key which doesn't match with the index of the object stream + xrefTable.put(new COSObjectKey(6, 0, 10), -1L); + xrefTable.put(new COSObjectKey(4, 0, 11), -1L); + xrefTable.put(new COSObjectKey(5, 0, 12), -1L); + PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc); + // the index isn't taken into account as all object numbers of the stream are unique + // none of the objects is skipped so that all objects are read and available + Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects(); + assertEquals(3, 
objectNumbers.size()); + assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(6, 0))); + assertEquals(COSBoolean.FALSE, objectNumbers.get(new COSObjectKey(4, 0))); + assertEquals(COSBoolean.TRUE, objectNumbers.get(new COSObjectKey(5, 0))); + } + + @Test + void testParseAllObjectsUseMalformedIndex() throws IOException + { + COSStream stream = new COSStream(); + stream.setItem(COSName.N, COSInteger.THREE); + stream.setItem(COSName.FIRST, COSInteger.get(13)); + OutputStream outputStream = stream.createOutputStream(); + outputStream.write("6 0 4 5 4 11 true false true".getBytes()); + outputStream.close(); + COSDocument cosDoc = new COSDocument(); + Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable(); + // add an index for each object key which doesn't match with the index of the object stream + // add two object keys only as the object stream uses one object number for two objects + xrefTable.put(new COSObjectKey(6, 0, 10), -1L); + xrefTable.put(new COSObjectKey(4, 0, 11), -1L); + PDFObjectStreamParser objectStreamParser = new PDFObjectStreamParser(stream, cosDoc); + // as the used object numbers aren't unique within the object stream the index of the object keys is used + // All objects are dropped as the malformed index values don't match the index of the object within the stream + Map<COSObjectKey, COSBase> objectNumbers = objectStreamParser.parseAllObjects(); + assertEquals(0, objectNumbers.size()); } }