Author: lehmi Date: Sat Jun 27 12:52:39 2020 New Revision: 1879267 URL: http://svn.apache.org/viewvc?rev=1879267&view=rev Log: PDFBOX-4897: backported the optimized behaviour of the object stream parser from 3.0 as proposed by Simon Steiner
Added: pdfbox/branches/issue45/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java - copied unchanged from r1879261, pdfbox/branches/2.0/pdfbox/src/test/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParserTest.java Modified: pdfbox/branches/issue45/ (props changed) pdfbox/branches/issue45/pdfbox/ (props changed) pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Propchange: pdfbox/branches/issue45/ ------------------------------------------------------------------------------ Merged /pdfbox/branches/2.0:r1879261 Propchange: pdfbox/branches/issue45/pdfbox/ ------------------------------------------------------------------------------ Merged /pdfbox/branches/2.0/pdfbox:r1879261 Modified: pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java URL: http://svn.apache.org/viewvc/pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java?rev=1879267&r1=1879266&r2=1879267&view=diff ============================================================================== --- pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java (original) +++ pdfbox/branches/issue45/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFObjectStreamParser.java Sat Jun 27 12:52:39 2020 @@ -18,12 +18,16 @@ package org.apache.pdfbox.pdfparser; import java.io.IOException; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; +import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.cos.COSBase; import org.apache.pdfbox.cos.COSDocument; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSObject; import org.apache.pdfbox.cos.COSStream; @@ -41,7 +45,8 @@ public class PDFObjectStreamParser exten private static final Log LOG = 
LogFactory.getLog(PDFObjectStreamParser.class); private List<COSObject> streamObjects = null; - private final COSStream stream; + private final int numberOfObjects; + private final int firstObject; /** * Constructor. @@ -53,8 +58,19 @@ public class PDFObjectStreamParser exten public PDFObjectStreamParser(COSStream stream, COSDocument document) throws IOException { super(new InputStreamSource(stream.createInputStream())); - this.stream = stream; this.document = document; + // get mandatory number of objects + numberOfObjects = stream.getInt(COSName.N); + if (numberOfObjects == -1) + { + throw new IOException("/N entry missing in object stream"); + } + // get mandatory stream offset of the first object + firstObject = stream.getInt(COSName.FIRST); + if (firstObject == -1) + { + throw new IOException("/First entry missing in object stream"); + } } /** @@ -67,47 +83,19 @@ public class PDFObjectStreamParser exten { try { - //need to first parse the header. - int numberOfObjects = stream.getInt( "N" ); - if (numberOfObjects == -1) - { - throw new IOException("/N entry missing in object stream"); - } - List<Long> objectNumbers = new ArrayList<Long>( numberOfObjects ); + Map<Long, Integer> offsets = readOffsets(); streamObjects = new ArrayList<COSObject>( numberOfObjects ); - for( int i=0; i<numberOfObjects; i++ ) + for (Entry<Long, Integer> offset : offsets.entrySet()) { - long objectNumber = readObjectNumber(); - // skip offset - readLong(); - objectNumbers.add( objectNumber); - } - COSObject object; - COSBase cosObject; - int objectCounter = 0; - while( (cosObject = parseDirObject()) != null ) - { - object = new COSObject(cosObject); + COSBase cosObject = parseObject(offset.getValue()); + COSObject object = new COSObject(cosObject); object.setGenerationNumber(0); - if (objectCounter >= objectNumbers.size()) - { - LOG.error("/ObjStm (object stream) has more objects than /N " + numberOfObjects); - break; - } - object.setObjectNumber( objectNumbers.get( objectCounter) ); - 
streamObjects.add( object ); - if(LOG.isDebugEnabled()) - { - LOG.debug( "parsed=" + object ); - } - // According to the spec objects within an object stream shall not be enclosed - // by obj/endobj tags, but there are some pdfs in the wild using those tags - // skip endobject marker if present - if (!seqSource.isEOF() && seqSource.peek() == 'e') + object.setObjectNumber(offset.getKey()); + streamObjects.add(object); + if (LOG.isDebugEnabled()) { - readLine(); + LOG.debug("parsed=" + object); } - objectCounter++; } } finally @@ -125,4 +113,30 @@ public class PDFObjectStreamParser exten { return streamObjects; } + + private Map<Long, Integer> readOffsets() throws IOException + { + // use LinkedHashMap to preserve order for the sequential parsing + Map<Long, Integer> objectNumbers = new LinkedHashMap<Long, Integer>(numberOfObjects); + for (int i = 0; i < numberOfObjects; i++) + { + long objectNumber = readObjectNumber(); + int offset = (int) readLong(); + objectNumbers.put(objectNumber, offset); + } + return objectNumbers; + } + + private COSBase parseObject(int offset) throws IOException + { + long currentPosition = seqSource.getPosition(); + int finalPosition = firstObject + offset; + if (finalPosition > 0 && currentPosition < finalPosition) + { + // jump to the offset of the object to be parsed + seqSource.readFully(finalPosition - (int) currentPosition); + } + return parseDirObject(); + } + }