Author: tboehme
Date: Sun Apr 8 15:03:51 2012
New Revision: 1311018
URL: http://svn.apache.org/viewvc?rev=1311018&view=rev
Log:
add new NonSequentialPDFParser as proposed in PDFBOX-1199
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
(with props)
Added:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL:
http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1311018&view=auto
==============================================================================
---
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
(added)
+++
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
Sun Apr 8 15:03:51 2012
@@ -0,0 +1,1184 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pdfbox.pdfparser;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.security.KeyStore;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.Map.Entry;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSDocument;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSNull;
+import org.apache.pdfbox.cos.COSNumber;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.cos.COSString;
+import org.apache.pdfbox.exceptions.CryptographyException;
+import org.apache.pdfbox.io.PushBackInputStream;
+import org.apache.pdfbox.io.RandomAccess;
+import org.apache.pdfbox.io.RandomAccessBuffer;
+import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
+import org.apache.pdfbox.pdmodel.encryption.DecryptionMaterial;
+import org.apache.pdfbox.pdmodel.encryption.PDEncryptionDictionary;
+import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
+import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
+import org.apache.pdfbox.pdmodel.encryption.SecurityHandlersManager;
+import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
+import org.apache.pdfbox.persistence.util.COSObjectKey;
+
+/**
+ * PDFParser which first reads startxref and xref tables in order to know valid
+ * objects and parse only these objects. Thus it is closer to a conforming parser
+ * than the sequential reading of {@link PDFParser}.
+ *
+ * This class can be used as a {@link PDFParser} replacement. First {@link #parse()}
+ * must be called before page objects can be retrieved, e.g. {@link #getPDDocument()}.
+ *
+ * This class is a much enhanced version of <code>QuickParser</code> presented in
+ * <a href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a>
+ * by Jeremy Villalobos.
+ */
+public class NonSequentialPDFParser extends PDFParser {
+
+ /** Name of the system property which, when set to "true", enables minimal catalog parsing. */
+ public static final String SYSPROP_PARSEMINIMAL =
+     "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.parseMinimal";
+ /** Name of the system property providing the initial EOF lookup range (parsed as an integer). */
+ public static final String SYSPROP_EOFLOOKUPRANGE =
+     "org.apache.pdfbox.pdfparser.nonSequentialPDFParser.eofLookupRange";
+
+ /** Empty stream handed to the super constructor; the real source is installed by our constructor. */
+ private static final InputStream EMPTY_INPUT_STREAM = new ByteArrayInputStream( new byte[0] );
+
+ /** Default number of trailing file bytes searched for the EOF marker. */
+ private static final int DEFAULT_TRAIL_BYTECOUNT = 2048;
+ private static final char[] EOF_MARKER = "%%EOF".toCharArray();
+ private static final char[] STARTXREF_MARKER = "startxref".toCharArray();
+ private static final char[] OBJ_MARKER = "obj".toCharArray();
+
+ private final File file;
+ private final RandomAccessBufferedFileInputStream raStream;
+
+ private SecurityHandler securityHandler = null;
+
+ private String keyStoreFilename = null;
+ private String alias = null;
+ private String password = "";
+ /** How many trailing bytes to read for EOF marker. */
+ private int readTrailBytes = DEFAULT_TRAIL_BYTECOUNT;
+
+ /**
+  * If <code>true</code> object references in catalog are not followed;
+  * pro: page objects will be only parsed when needed; cons: some information
+  * of catalog might not be available (e.g. outline).
+  * Catalog parsing without pages is not an option since a number of entries
+  * will also refer to page objects (like OpenAction).
+  */
+ private boolean parseMinimalCatalog =
+     "true".equals( System.getProperty( SYSPROP_PARSEMINIMAL ) );
+
+ private boolean initialParseDone = false;
+ private boolean allPagesParsed = false;
+
+ private static final Log LOG = LogFactory.getLog( NonSequentialPDFParser.class );
+
+ //
------------------------------------------------------------------------
+ /**
+  * Constructs parser for given file using memory buffer.
+  * Delegates to {@link #NonSequentialPDFParser(File, RandomAccess)} with no
+  * explicit buffer.
+  *
+  * @param filename path of the PDF file to be parsed
+  *
+  * @throws FileNotFoundException declared by the delegate constructor
+  * @throws IOException declared by the delegate constructor
+  */
+ public NonSequentialPDFParser( String filename )
+     throws FileNotFoundException, IOException
+ {
+     this( new File( filename ), null );
+ }
+
+ /** Constructs parser for given file using given buffer for
temporary storage. */
+ public NonSequentialPDFParser( File pdfFile, RandomAccess raBuf
) throws FileNotFoundException, IOException
+ {
+ super( EMPTY_INPUT_STREAM, null, false );
+
+ String eofLookupRangeStr = System.getProperty(
SYSPROP_EOFLOOKUPRANGE );
+ if ( eofLookupRangeStr != null )
+ {
+ try
+ {
+
setEOFLookupRange( Integer.parseInt( eofLookupRangeStr ) );
+ }
+ catch ( NumberFormatException
nfe )
+ {
+ LOG.warn( "System
property " + SYSPROP_EOFLOOKUPRANGE +
+
" does not contain an integer value, but: '" + eofLookupRangeStr + "'" );
+ }
+ }
+
+ file = pdfFile;
+ raStream = new
RandomAccessBufferedFileInputStream( file );
+
+ setDocument( ( raBuf == null ) ? new
COSDocument( new RandomAccessBuffer(), false ) :
+ new
COSDocument( raBuf, false ) );
+
+ pdfSource = new PushBackInputStream( raStream,
4096 );
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Sets how many trailing bytes of PDF file are searched for
+  * EOF marker and 'startxref' marker.
+  * If not set we use default value {@link #DEFAULT_TRAIL_BYTECOUNT}.
+  *
+  * <p>We check that new value is at least 16; smaller values are ignored.
+  * However for practical use cases this value should not be lower than 1000;
+  * even 2000 was found to not be enough in some cases where some trailing
+  * garbage like HTML snippets followed the EOF marker.</p>
+  *
+  * <p>In case system property {@link #SYSPROP_EOFLOOKUPRANGE} is defined
+  * this value will be set on initialization but can be overwritten later.</p>
+  *
+  * @param byteCount number of trailing bytes to search
+  */
+ public void setEOFLookupRange( int byteCount )
+ {
+     if ( byteCount < 16 )
+     {
+         // too small to be useful; keep current setting
+         return;
+     }
+     readTrailBytes = byteCount;
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * The initial parse will first parse only the trailer, the xrefstart and
+  * all xref tables to have a pointer (offset) to all the pdf's objects.
+  * It can handle linearized pdfs, which will have an xref at the
+  * end pointing to an xref at the beginning of the file.
+  * Afterwards decryption is prepared (if the trailer has an encryption entry)
+  * and last the root object is parsed. Unless minimal catalog parsing is
+  * enabled all objects reachable from the catalog are parsed as well.
+  *
+  * @throws IOException if a needed structure is missing or malformed, if the
+  *                     xref chain contains a loop or if the security handler
+  *                     cannot be set up
+  */
+ private void initialParse() throws IOException
+ {
+     final long startxrefOff = getStartxrefOffset();
+
+     // ---- parse startxref
+     setPdfSource( startxrefOff );
+     parseStartXref();
+
+     final long xrefOffset = document.getStartXref();
+     long prev = xrefOffset;
+
+     // offsets already visited; protects against cyclic PREV references
+     final Set<Long> visitedXrefOffsets = new HashSet<Long>();
+
+     // ---- parse whole chain of xref tables/object streams using PREV reference
+     while ( prev > -1 )
+     {
+         if ( ! visitedXrefOffsets.add( prev ) )
+         {
+             throw new IOException( "Loop detected in xref chain at offset " + prev );
+         }
+
+         // seek to xref table
+         setPdfSource( prev );
+
+         // -- parse xref
+         if ( pdfSource.peek() == 'x' )
+         {
+             // xref table and trailer
+             // use existing parser to parse xref table
+             parseXrefTable( prev );
+
+             // parse the last trailer.
+             parseTrailer();
+             COSDictionary trailer = xrefTrailerResolver.getCurrentTrailer();
+             // offsets are long values; getInt would truncate large offsets
+             prev = trailer.getLong( COSName.PREV );
+         }
+         else
+         {
+             // xref stream
+             prev = parseXrefObjStream( prev );
+         }
+     }
+
+     // ---- build valid xrefs out of the xref chain
+     xrefTrailerResolver.setStartxref( xrefOffset );
+     document.setTrailer( xrefTrailerResolver.getTrailer() );
+
+     // ---- prepare encryption if necessary
+     COSBase trailerEncryptItem = document.getTrailer().getItem( COSName.ENCRYPT );
+     if ( trailerEncryptItem != null )
+     {
+         if ( trailerEncryptItem instanceof COSObject )
+         {
+             COSObject trailerEncryptObj = (COSObject) trailerEncryptItem;
+             parseObjectDynamically( trailerEncryptObj, true );
+         }
+
+         try
+         {
+             PDEncryptionDictionary encParameters =
+                 new PDEncryptionDictionary( document.getEncryptionDictionary() );
+
+             DecryptionMaterial decryptionMaterial = null;
+             if ( keyStoreFilename != null )
+             {
+                 KeyStore ks = KeyStore.getInstance( "PKCS12" );
+                 // close key store stream after loading to not leak the file handle
+                 final InputStream ksIn = new FileInputStream( keyStoreFilename );
+                 try
+                 {
+                     ks.load( ksIn, password.toCharArray() );
+                 }
+                 finally
+                 {
+                     ksIn.close();
+                 }
+
+                 decryptionMaterial = new PublicKeyDecryptionMaterial( ks, alias, password );
+             }
+             else
+             {
+                 decryptionMaterial = new StandardDecryptionMaterial( password );
+             }
+
+             securityHandler =
+                 SecurityHandlersManager.getInstance().getSecurityHandler( encParameters.getFilter() );
+             securityHandler.prepareForDecryption( encParameters, document.getDocumentID(),
+                                                   decryptionMaterial );
+
+             AccessPermission permission = securityHandler.getCurrentAccessPermission();
+             if ( ! permission.canExtractContent() )
+             {
+                 LOG.warn( "PDF file '" + file.getPath() + "' does not allow extracting content." );
+             }
+             else
+             {
+                 LOG.info( "PDF file '" + file.getPath() + "' allows content extraction." );
+             }
+         }
+         catch ( Exception e )
+         {
+             throw new IOException( "Error (" + e.getClass().getSimpleName() +
+                                    ") while creating security handler for decryption: " +
+                                    e.getMessage(), e );
+         }
+     }
+
+     // ---- parse catalog or root object
+     final COSBase rootItem = xrefTrailerResolver.getTrailer().getItem( COSName.ROOT );
+
+     if ( rootItem == null )
+     {
+         throw new IOException( "Missing root object specification in trailer." );
+     }
+     if ( ! ( rootItem instanceof COSObject ) )
+     {
+         throw new IOException( "Root entry in trailer is not an object reference, but: " +
+                                rootItem.getClass().getSimpleName() );
+     }
+
+     parseObjectDynamically( (COSObject) rootItem, false );
+
+     // ---- resolve all objects (including pages)
+     if ( ! parseMinimalCatalog )
+     {
+         COSObject catalogObj = document.getCatalog();
+         if ( catalogObj != null )
+         {
+             if ( catalogObj.getObject() instanceof COSDictionary )
+             {
+                 parseDictObjects( (COSDictionary) catalogObj.getObject(), (COSName[]) null );
+                 allPagesParsed = true;
+                 document.setDecrypted();
+             }
+         }
+     }
+
+     initialParseDone = true;
+ }
+
+ //
------------------------------------------------------------------------
+ /** Parses an xref object stream starting with indirect object
id.
+ *
+ * @return value of PREV item in dictionary or<code>-1</code>
if no such item exists
+ */
+ private long parseXrefObjStream( long objByteOffset ) throws
IOException
+ {
+ // ---- parse indirect object head
+ readInt();
+ readInt();
+ readPattern( OBJ_MARKER );
+
+ COSDictionary dict = parseCOSDictionary();
+ COSStream xrefStream = parseCOSStream(dict,
getDocument().getScratchFile() );
+ parseXrefStream( xrefStream, (int)
objByteOffset );
+
+ return dict.getLong( COSName.PREV );
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Get current offset in file at which next byte would be read.
+  *
+  * @return offset as reported by {@link #pdfSource}
+  */
+ private final long getPdfSourceOffset()
+ {
+     final long currentOffset = pdfSource.getOffset();
+     return currentOffset;
+ }
+
+ /**
+  * Sets {@link #pdfSource} to start next parsing at given file offset.
+  *
+  * @param fileOffset offset within the file to seek to
+  *
+  * @throws IOException if seeking fails
+  */
+ private final void setPdfSource( long fileOffset ) throws IOException
+ {
+     // the source supports seeking, thus no need to re-open the file
+     pdfSource.seek( fileOffset );
+ }
+
+ /**
+  * Enable handling of alternative pdfSource implementation.
+  * Currently a no-op hook.
+  *
+  * @throws IOException declared for alternative implementations; never thrown here
+  */
+ private final void releasePdfSourceInputStream() throws IOException
+ {
+     // intentionally empty: the current source is kept open between objects
+ }
+
+ /**
+  * Closes {@link #pdfSource} if it is set.
+  *
+  * @throws IOException if closing the source fails
+  */
+ private final void closeFileStream() throws IOException
+ {
+     if ( pdfSource == null )
+     {
+         return;
+     }
+     pdfSource.close();
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Looks for the <code>startxref</code> marker. We first look for last '%%EOF'
+  * marker within last {@link #DEFAULT_TRAIL_BYTECOUNT} bytes (or range set via
+  * {@link #setEOFLookupRange(int)}) and go back to find <code>startxref</code>.
+  *
+  * @return file offset at which the <code>startxref</code> marker starts
+  *
+  * @throws IOException if the trailing bytes cannot be read or one of the
+  *                     markers is missing within the lookup range
+  */
+ private final long getStartxrefOffset() throws IOException
+ {
+     // ---- read trailing bytes into buffer
+     final long fileLen = file.length();
+     final int trailByteCount = ( fileLen < readTrailBytes ) ? (int) fileLen : readTrailBytes;
+     final long skipBytes = fileLen - trailByteCount;
+     final byte[] buf = new byte[ trailByteCount ];
+
+     FileInputStream fIn = null;
+     try
+     {
+         fIn = new FileInputStream( file );
+
+         // skip() may skip fewer bytes than requested, thus loop until done
+         long toSkip = skipBytes;
+         while ( toSkip > 0 )
+         {
+             final long skipped = fIn.skip( toSkip );
+             if ( skipped <= 0 )
+             {
+                 throw new IOException( "Unable to skip to trailing bytes; bytes left to skip: " +
+                                        toSkip );
+             }
+             toSkip -= skipped;
+         }
+
+         int off = 0;
+         while ( off < trailByteCount )
+         {
+             final int readBytes = fIn.read( buf, off, trailByteCount - off );
+             // in order to not get stuck in a loop we check readBytes (this should never happen)
+             if ( readBytes < 1 )
+             {
+                 throw new IOException( "No more bytes to read for trailing buffer, but expected: " +
+                                        ( trailByteCount - off ) );
+             }
+             off += readBytes;
+         }
+     }
+     finally
+     {
+         if ( fIn != null )
+         {
+             try
+             {
+                 fIn.close();
+             }
+             catch ( IOException ignored )
+             {
+                 // best effort: stream was only read, nothing can be lost
+             }
+         }
+     }
+
+     // ---- find last '%%EOF'
+     int bufOff = lastIndexOf( EOF_MARKER, buf, buf.length );
+
+     if ( bufOff < 0 )
+     {
+         throw new IOException( "Missing end of file marker '" + ( new String( EOF_MARKER ) ) + "'" );
+     }
+
+     // ---- find last startxref preceding EOF marker
+     bufOff = lastIndexOf( STARTXREF_MARKER, buf, bufOff );
+
+     if ( bufOff < 0 )
+     {
+         throw new IOException( "Missing 'startxref' marker." );
+     }
+
+     return skipBytes + bufOff;
+ }
+
+ //
------------------------------------------------------------------------
+ /** Searches last appearance of pattern within buffer. Lookup
before _lastOff
+ * and goes back until 0.
+ *
+ * @param pattern pattern to search for
+ * @param buf buffer to search pattern in
+ * @param endOff offset (exclusive) where lookup starts at
+ *
+ * @return start offset of pattern within buffer
or<code>-1</code> if pattern could not be found
+ */
+ private final int lastIndexOf( final char[] pattern, final
byte[] buf, final int endOff )
+ {
+ final int lastPatternChOff = pattern.length - 1;
+
+ int bufOff = endOff;
+ int patOff = lastPatternChOff;
+ char lookupCh = pattern[ patOff ];
+
+ while ( --bufOff>= 0 )
+ {
+ if ( buf[ bufOff ] == lookupCh )
+ {
+ if ( --patOff<
0 )
+ //
whole pattern matched
+ return
bufOff;
+ // matched
current char, advance to preceding one
+ lookupCh =
pattern[ patOff ];
+ }
+ else if ( patOff<
lastPatternChOff )
+ {
+ // no char
match but already matched some chars; reset
+ lookupCh =
pattern[ patOff = lastPatternChOff ];
+ }
+ }
+
+ return -1;
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Reads given pattern from {@link #pdfSource}. Skipping whitespace at start and end.
+  *
+  * @param pattern characters expected next in the source
+  *
+  * @throws IOException if pattern could not be read
+  */
+ private final void readPattern( final char[] pattern ) throws IOException
+ {
+     skipSpaces();
+
+     for ( char c : pattern )
+     {
+         // read() result differing from c also covers end of stream
+         if ( pdfSource.read() != c )
+         {
+             throw new IOException( "Expected pattern '" + new String( pattern ) +
+                                    "' but missed at character '" + c + "'" );
+         }
+     }
+
+     skipSpaces();
+ }
+
+ //
------------------------------------------------------------------------
+ private COSDictionary pagesDictionary = null;
+
+ /** Returns PAGES {@link COSDictionary} object or throws {@link
IOException}
+ * if PAGES dictionary does not exist. */
+ private COSDictionary getPagesObject() throws IOException
+ {
+ if ( pagesDictionary != null )
+ return pagesDictionary;
+
+ COSObject pages = (COSObject)
document.getCatalog().getItem( COSName.PAGES );
+
+ if ( pages == null )
+ throw new IOException( "Missing
PAGES entry in document catalog." );
+
+ COSBase object = parseObjectDynamically( pages,
false );
+
+ if ( ! ( object instanceof COSDictionary ) )
+ throw new IOException( "PAGES not a
dictionary object, but: " +
+
object.getClass().getSimpleName() );
+
+ pagesDictionary = (COSDictionary) object;
+
+ return pagesDictionary;
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Parses all objects needed by pages and closes input stream.
+  * Runs the initial parse if not done yet and afterwards loads every page
+  * unless all pages were already parsed. If parsing fails the document is
+  * closed as well.
+  *
+  * @throws IOException if parsing fails
+  */
+ @Override public void parse() throws IOException
+ {
+     boolean success = false;
+
+     try
+     {
+         if ( ! initialParseDone )
+         {
+             initialParse();
+         }
+
+         final int pageCount = getPageNumber();
+
+         if ( ! allPagesParsed )
+         {
+             int pNr = 0;
+             while ( pNr < pageCount )
+             {
+                 getPage( pNr );
+                 pNr++;
+             }
+             allPagesParsed = true;
+             document.setDecrypted();
+         }
+
+         success = true;
+     }
+     finally
+     {
+         // input is not needed any longer, whatever the outcome was
+         try
+         {
+             closeFileStream();
+         }
+         catch ( IOException ioe )
+         {
+         }
+
+         // release document only if parsing did not complete
+         if ( ( ! success ) && ( document != null ) )
+         {
+             try
+             {
+                 document.close();
+             }
+             catch ( IOException ioe )
+             {
+             }
+         }
+     }
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Returns security handler of the document or <code>null</code> if document
+  * is not encrypted or {@link #parse()} wasn't called before.
+  *
+  * @return the security handler set up during initial parsing, may be <code>null</code>
+  */
+ public SecurityHandler getSecurityHandler()
+ {
+     return securityHandler;
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Returns the number of pages in a document as given by the COUNT entry
+  * of the PAGES dictionary.
+  *
+  * @return number of pages
+  *
+  * @throws IOException if PAGES or other needed object is missing
+  */
+ public int getPageNumber() throws IOException
+ {
+     final int declaredCount = getPagesObject().getInt( COSName.COUNT );
+
+     if ( declaredCount >= 0 )
+     {
+         return declaredCount;
+     }
+
+     throw new IOException( "No page number specified." );
+ }
+
+ //
------------------------------------------------------------------------
+ /**
+  * Returns the page requested with all the objects loaded into it.
+  *
+  * <p>If only the minimal catalog was parsed and not all pages were parsed
+  * yet, the objects referenced by the page's resources dictionary are parsed
+  * here. A page without a resources dictionary of its own is returned
+  * without further object parsing.</p>
+  *
+  * @param pageNr number of requested page; numbering starts with 0
+  *
+  * @return the requested page
+  *
+  * @throws IOException if the page tree is incomplete, the page does not
+  *                     exist or a needed object cannot be parsed
+  */
+ public PDPage getPage( int pageNr ) throws IOException
+ {
+     getPagesObject();
+
+     // ---- get list of top level pages
+     COSArray kids = (COSArray) pagesDictionary.getDictionaryObject( COSName.KIDS );
+
+     if ( kids == null )
+     {
+         throw new IOException( "Missing 'Kids' entry in pages dictionary." );
+     }
+
+     // ---- get page we are looking for (possibly going recursively into subpages)
+     COSObject pageObj = getPageObject( pageNr, kids, 0 );
+
+     if ( pageObj == null )
+     {
+         throw new IOException( "Page " + pageNr + " not found." );
+     }
+
+     // ---- parse all objects necessary to load page.
+     COSDictionary pageDict = (COSDictionary) pageObj.getObject();
+
+     if ( parseMinimalCatalog && ( ! allPagesParsed ) )
+     {
+         // parse page resources since we did not do this on start;
+         // first make sure a referenced resources object is loaded itself
+         final COSBase resItem = pageDict.getItem( COSName.RESOURCES );
+         if ( resItem instanceof COSObject )
+         {
+             final COSObject resObj = (COSObject) resItem;
+             if ( resObj.getObject() == null )
+             {
+                 resObj.setObject( parseObjectDynamically( resObj, false ) );
+             }
+         }
+
+         // resources may be missing on the page itself; nothing to parse then
+         final COSBase resBase = pageDict.getDictionaryObject( COSName.RESOURCES );
+         if ( resBase instanceof COSDictionary )
+         {
+             parseDictObjects( (COSDictionary) resBase );
+         }
+     }
+
+     return new PDPage( pageDict );
+ }
+
+ /**
+ * Returns the object for a specific page.
+ * The page tree is made up of kids. The kids have COSArray
with COSObjects
+ * inside of them. The COSObject can be parsed using the
dynamic parsing method
+ * We want to only parse the minimum COSObjects and still
return a complete page.
+ * ready to be used.
+ *
+ * @param num the requested page number; numbering starts with 0
+ * @param startKids Kids array to start with looking up page
number
+ * @param startPageCount
+ *
+ * @return page object or<code>null</code> if no such page
exists
+ *
+ * @throws IOException
+ */
+ private COSObject getPageObject( int num, COSArray startKids,
int startPageCount ) throws IOException{
+
+ int curPageCount = startPageCount;
+ Iterator<COSBase> kidsIter =
startKids.iterator();
+
+ while( kidsIter.hasNext() )
+ {
+ COSObject obj = (COSObject)
kidsIter.next();
+ COSBase base =
obj.getObject();
+ if( base == null )
+ {
+ base =
parseObjectDynamically( obj, false );
+ obj.setObject(
base );
+ }
+
+ COSDictionary dic =
(COSDictionary) base;
+ int count =
dic.getInt( COSName.COUNT );
+ if ( count>= 0 )
+ {
+ // skip this
branch if requested page comes later
+ if( ( curPageCount
+ count )<= num )
+ {
+
curPageCount += count;
+
continue;
+ }
+ }
+
+ COSArray kids = (COSArray)
dic.getDictionaryObject( COSName.KIDS );
+ if( kids != null)
+ {
+ // recursively
scan subpages
+ COSObject ans =
getPageObject( num, kids, curPageCount );
+ // if ans is
not null, we got what we were looking for
+ if( ans != null
)
+ {
+
return ans;
+ }
+ }
+ else
+ {
+ // found page?
+ if(
curPageCount == num )
+ {
+
return obj;
+ }
+ // page has no
kids and it is not the page we are looking for
+ curPageCount++;
+ }
+ }
+ return null;
+ }
+
+ /**
+  * Creates a unique object id using object number and object generation number
+  * (requires object number &lt; 2^31).
+  *
+  * @param obj object to build the id for
+  *
+  * @return object number in the upper 32 bits combined with the generation number
+  */
+ private final long getObjectId( final COSObject obj )
+ {
+     final long objNr = obj.getObjectNumber().longValue();
+     final long genNr = obj.getGenerationNumber().longValue();
+
+     return ( objNr << 32 ) | genNr;
+ }
+
+ /**
+  * Adds all from newObjects to toBeParsedList if it is not an COSObject
+  * or we didn't add this COSObject already (checked via addedObjects).
+  *
+  * @param toBeParsedList queue receiving the objects
+  * @param newObjects candidates to be added
+  * @param addedObjects ids of COSObjects added so far; updated by this call
+  */
+ private final void addNewToList( final Queue<COSBase> toBeParsedList,
+                                  final Collection<COSBase> newObjects,
+                                  final Set<Long> addedObjects )
+ {
+     for ( final COSBase candidate : newObjects )
+     {
+         // same rule for every element as in the single object variant
+         addNewToList( toBeParsedList, candidate, addedObjects );
+     }
+ }
+
+ /**
+  * Adds newObject to toBeParsedList if it is not an COSObject
+  * or we didn't add this COSObject already (checked via addedObjects).
+  *
+  * @param toBeParsedList queue receiving the object
+  * @param newObject candidate to be added
+  * @param addedObjects ids of COSObjects added so far; updated by this call
+  */
+ private final void addNewToList( final Queue<COSBase> toBeParsedList,
+                                  final COSBase newObject,
+                                  final Set<Long> addedObjects )
+ {
+     // Set.add returns false if the id was registered before
+     final boolean alreadyAdded = ( newObject instanceof COSObject ) &&
+                                  ! addedObjects.add( getObjectId( (COSObject) newObject ) );
+
+     if ( ! alreadyAdded )
+     {
+         toBeParsedList.add( newObject );
+     }
+ }
+
+ /**
+  * Will parse every object necessary to load a single page from the pdf document.
+  * We try our best to order objects according to offset in file before reading
+  * to minimize seek operations.
+  *
+  * <p>Objects to be read are collected per file offset; for compressed objects
+  * the offset of the containing object stream is used. Several objects may
+  * share one offset (an object stream and objects contained in it), thus
+  * objects are always appended to the list of an offset.</p>
+  *
+  * @param dict the COSObject from the parent pages.
+  * @param excludeObjects dictionary object reference entries with these names will not be parsed
+  *
+  * @throws IOException if an object cannot be parsed or the xref data is inconsistent
+  */
+ private void parseDictObjects( COSDictionary dict, COSName... excludeObjects ) throws IOException
+ {
+     // ---- create queue for objects waiting for further parsing
+     final Queue<COSBase> toBeParsedList = new LinkedList<COSBase>();
+     // offset ordered object map; in case of compressed objects offset points to stmObj
+     final TreeMap<Long, List<COSObject>> objToBeParsed = new TreeMap<Long, List<COSObject>>();
+     final Set<Long> parsedObjects = new HashSet<Long>();
+     final Set<Long> addedObjects = new HashSet<Long>();
+
+     // ---- add objects not to be parsed to list of already parsed objects
+     if ( excludeObjects != null )
+     {
+         for ( COSName objName : excludeObjects )
+         {
+             COSBase baseObj = dict.getItem( objName );
+             if ( baseObj instanceof COSObject )
+             {
+                 parsedObjects.add( getObjectId( (COSObject) baseObj ) );
+             }
+         }
+     }
+
+     addNewToList( toBeParsedList, dict.getValues(), addedObjects );
+
+     // ---- go through objects to be parsed
+     while ( ! ( toBeParsedList.isEmpty() && objToBeParsed.isEmpty() ) )
+     {
+         // -- first get all COSObject from other kind of objects and
+         //    put them in objToBeParsed; afterwards toBeParsedList is empty
+         COSBase baseObj;
+         while ( ( baseObj = toBeParsedList.poll() ) != null )
+         {
+             if ( baseObj instanceof COSStream )
+             {
+                 addNewToList( toBeParsedList, ( (COSStream) baseObj ).getValues(), addedObjects );
+             }
+             else if ( baseObj instanceof COSDictionary )
+             {
+                 addNewToList( toBeParsedList, ( (COSDictionary) baseObj ).getValues(), addedObjects );
+             }
+             else if ( baseObj instanceof COSArray )
+             {
+                 final Iterator<COSBase> arrIter = ( (COSArray) baseObj ).iterator();
+                 while ( arrIter.hasNext() )
+                 {
+                     addNewToList( toBeParsedList, arrIter.next(), addedObjects );
+                 }
+             }
+             else if ( baseObj instanceof COSObject )
+             {
+                 COSObject obj = (COSObject) baseObj;
+                 long objId = getObjectId( obj );
+                 COSObjectKey objKey = new COSObjectKey( obj.getObjectNumber().intValue(),
+                                                         obj.getGenerationNumber().intValue() );
+
+                 if ( ! parsedObjects.contains( objId ) )
+                 {
+                     Long fileOffset = xrefTrailerResolver.getXrefTable().get( objKey );
+                     // it is allowed that object references point to null, thus we have to test
+                     if ( fileOffset != null )
+                     {
+                         if ( fileOffset <= 0 )
+                         {
+                             // negative offset means we have a compressed object within object stream;
+                             // get offset of object stream
+                             fileOffset = xrefTrailerResolver.getXrefTable().get(
+                                              new COSObjectKey( -fileOffset, 0 ) );
+                             if ( ( fileOffset == null ) || ( fileOffset <= 0 ) )
+                             {
+                                 throw new IOException(
+                                     "Invalid object stream xref object reference: " + fileOffset );
+                             }
+                         }
+
+                         // append to the (mutable) list of this offset; never replace
+                         // a list already registered for the same offset
+                         List<COSObject> objsAtOffset = objToBeParsed.get( fileOffset );
+                         if ( objsAtOffset == null )
+                         {
+                             objsAtOffset = new ArrayList<COSObject>();
+                             objToBeParsed.put( fileOffset, objsAtOffset );
+                         }
+                         objsAtOffset.add( obj );
+                     }
+                     else
+                     {
+                         // NULL object
+                         COSObject pdfObject = document.getObjectFromPool( objKey );
+                         pdfObject.setObject( COSNull.NULL );
+                     }
+                 }
+             }
+         }
+
+         // ---- read first COSObject with smallest offset;
+         //      resulting object will be added to toBeParsedList
+         if ( objToBeParsed.isEmpty() )
+         {
+             break;
+         }
+
+         for ( COSObject obj : objToBeParsed.remove( objToBeParsed.firstKey() ) )
+         {
+             COSBase parsedObj = parseObjectDynamically( obj, false );
+
+             obj.setObject( parsedObj );
+             // a null entry would end draining of the queue too early
+             if ( parsedObj != null )
+             {
+                 addNewToList( toBeParsedList, parsedObj, addedObjects );
+             }
+
+             parsedObjects.add( getObjectId( obj ) );
+         }
+     }
+ }
+
+ /**
+ * This will parse the next object from the stream and add it to
+ * the local state.
+ * This is taken from {@link PDFParser} and reduced to parsing
+ * an indirect object.
+ *
+ * @param obj object to be parsed (we only take object number and
generation number for lookup start offset)
+ * @param requireExistingNotCompressedObj if<code>true</code>
object to be parsed must not be contained within compressed stream
+ * @return the parsed object (which is also added to document object)
+ *
+ * @throws IOException If an IO error occurs.
+ */
+ private COSBase parseObjectDynamically( COSObject obj, boolean
requireExistingNotCompressedObj )
+ throws IOException
+ {
+ return parseObjectDynamically(
obj.getObjectNumber().intValue(),
+
obj.getGenerationNumber().intValue(),
+
requireExistingNotCompressedObj );
+ }
+
+ /**
+  * This will parse the requested indirect object and add it to the local state.
+  * This is taken from {@link PDFParser} and reduced to parsing an indirect object.
+  *
+  * <p>An object which already has a value in the document's object pool is
+  * returned as is. Otherwise the xref entry decides: no entry results in
+  * a NULL object, a positive value is taken as file offset of the object and
+  * any other value as negated object number of the containing object stream.</p>
+  *
+  * @param objNr object number of object to be parsed
+  * @param objGenNr object generation number of object to be parsed
+  * @param requireExistingNotCompressedObj if <code>true</code> the object to be
+  *            parsed must be defined in xref (comment: null objects may be
+  *            missing from xref) and it must not be a compressed object within
+  *            object stream (this is used to circumvent being stuck in a loop
+  *            in a malicious PDF)
+  *
+  * @return the parsed object (which is also added to document object)
+  *
+  * @throws IOException If an IO error occurs.
+  */
+ private COSBase parseObjectDynamically( int objNr, int objGenNr,
+                                         boolean requireExistingNotCompressedObj )
+     throws IOException
+ {
+     // ---- create object key and get object (container) from pool
+     final COSObjectKey objKey = new COSObjectKey( objNr, objGenNr );
+     final COSObject pdfObject = document.getObjectFromPool( objKey );
+
+     if ( pdfObject.getObject() != null )
+     {
+         // previously parsed
+         return pdfObject.getObject();
+     }
+
+     // ---- read offset or object stream object number from xref table
+     final Long offsetOrObjstmObNr = xrefTrailerResolver.getXrefTable().get( objKey );
+
+     // sanity test to circumvent loops with broken documents
+     if ( requireExistingNotCompressedObj &&
+          ( ( offsetOrObjstmObNr == null ) || ( offsetOrObjstmObNr <= 0 ) ) )
+     {
+         throw new IOException( "Object must be defined and must not be compressed object: " +
+                                objKey.getNumber() + ":" + objKey.getGeneration() );
+     }
+
+     if ( offsetOrObjstmObNr == null )
+     {
+         // not defined object -> NULL object (Spec. 1.7, chap. 3.2.9)
+         pdfObject.setObject( COSNull.NULL );
+     }
+     else if ( offsetOrObjstmObNr > 0 )
+     {
+         // offset of indirect object in file
+         // ---- go to object start
+         setPdfSource( offsetOrObjstmObNr );
+
+         // ---- we must have an indirect object
+         final int readObjNr = readInt();
+         final int readObjGen = readInt();
+         readPattern( OBJ_MARKER );
+
+         // ---- consistency check
+         final boolean sameObject = ( readObjNr == objKey.getNumber() ) &&
+                                    ( readObjGen == objKey.getGeneration() );
+         if ( ! sameObject )
+         {
+             throw new IOException( "XREF for " + objKey.getNumber() + ":" + objKey.getGeneration() +
+                                    " points to wrong object: " + readObjNr + ":" + readObjGen );
+         }
+
+         skipSpaces();
+         COSBase pb = parseDirObject();
+         String endObjectKey = readString();
+
+         if ( endObjectKey.equals( "stream" ) )
+         {
+             // push keyword back, stream parsing expects to read it itself
+             pdfSource.unread( endObjectKey.getBytes("ISO-8859-1") );
+             pdfSource.unread( ' ' );
+
+             if ( ! ( pb instanceof COSDictionary ) )
+             {
+                 // this is not legal
+                 // the combination of a dict and the stream/endstream forms a complete stream object
+                 throw new IOException( "Stream not preceded by dictionary (offset: " +
+                                        offsetOrObjstmObNr + ")." );
+             }
+
+             final COSStream stream = parseCOSStream( (COSDictionary) pb,
+                                                      getDocument().getScratchFile() );
+             if ( securityHandler != null )
+             {
+                 try
+                 {
+                     securityHandler.decryptStream( stream, objNr, objGenNr );
+                 }
+                 catch ( CryptographyException ce )
+                 {
+                     throw new IOException( "Error decrypting stream object " + objNr + ": " +
+                                            ce.getMessage(), ce );
+                 }
+             }
+             pb = stream;
+
+             skipSpaces();
+             endObjectKey = readLine();
+
+             // we have case with a second 'endstream' before endobj
+             if ( ( ! endObjectKey.startsWith( "endobj" ) ) &&
+                  endObjectKey.startsWith( "endstream" ) )
+             {
+                 endObjectKey = endObjectKey.substring( 9 ).trim();
+                 if ( endObjectKey.length() == 0 )
+                 {
+                     // no other characters in extra endstream line
+                     endObjectKey = readLine(); // read next line
+                 }
+             }
+         }
+         else if ( securityHandler != null )
+         {
+             // decrypt strings held directly by the parsed object
+             if ( pb instanceof COSString )
+             {
+                 decrypt( (COSString) pb, objNr, objGenNr );
+             }
+             else if ( pb instanceof COSDictionary )
+             {
+                 for ( Entry<COSName,COSBase> entry : ( (COSDictionary) pb ).entrySet() )
+                 {
+                     // TODO: specially handle 'Contents' entry of signature dictionary
+                     //       like in SecurityHandler#decryptDictionary
+                     if ( entry.getValue() instanceof COSString )
+                     {
+                         decrypt( (COSString) entry.getValue(), objNr, objGenNr );
+                     }
+                 }
+             }
+             else if ( pb instanceof COSArray )
+             {
+                 final COSArray array = (COSArray) pb;
+                 for ( int aIdx = 0, len = array.size(); aIdx < len; aIdx++ )
+                 {
+                     if ( array.get( aIdx ) instanceof COSString )
+                     {
+                         decrypt( (COSString) array.get( aIdx ), objNr, objGenNr );
+                     }
+                 }
+             }
+         }
+
+         pdfObject.setObject( pb );
+
+         if ( ! endObjectKey.startsWith( "endobj" ) )
+         {
+             throw new IOException( "Object (" + readObjNr + ":" + readObjGen +
+                                    ") at offset " + offsetOrObjstmObNr +
+                                    " does not end with 'endobj'." );
+         }
+
+         releasePdfSourceInputStream();
+     }
+     else
+     {
+         // xref value is object nr of object stream containing object to be parsed;
+         // since our object was not found it means object stream was not parsed so far
+         final int objstmObjNr = (int) ( - offsetOrObjstmObNr );
+         final COSBase objstmBaseObj = parseObjectDynamically( objstmObjNr, 0, true );
+
+         if ( objstmBaseObj instanceof COSStream )
+         {
+             // parse object stream
+             final PDFObjectStreamParser parser =
+                 new PDFObjectStreamParser( (COSStream) objstmBaseObj, document, forceParsing );
+             parser.parse();
+
+             // get set of object numbers referenced for this object stream
+             final Set<Long> refObjNrs =
+                 xrefTrailerResolver.getContainedObjectNumbers( objstmObjNr );
+
+             // register all objects which are referenced to be contained in object stream
+             for ( COSObject next : parser.getObjects() )
+             {
+                 final COSObjectKey stmObjKey = new COSObjectKey( next );
+                 if ( refObjNrs.contains( stmObjKey.getNumber() ) )
+                 {
+                     final COSObject stmObj = document.getObjectFromPool( stmObjKey );
+                     stmObj.setObject( next.getObject() );
+                 }
+             }
+         }
+     }
+
+     return pdfObject.getObject();
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+  * Hands the given string to the security handler for decryption.
+  *
+  * @param str      string object to decrypt
+  * @param objNr    object number passed on to the security handler
+  * @param objGenNr generation number passed on to the security handler
+  *
+  * @throws IOException if the security handler fails with a
+  *         {@link CryptographyException}; that exception is attached as cause
+  */
+ private final void decrypt( COSString str, long objNr, long objGenNr )
+     throws IOException
+ {
+     try
+     {
+         securityHandler.decryptString( str, objNr, objGenNr );
+     }
+     catch ( CryptographyException decryptFailure )
+     {
+         // callers only deal with IOException, so wrap and keep the cause
+         final String reason = "Error decrypting string: " + decryptFailure.getMessage();
+         throw new IOException( reason, decryptFailure );
+     }
+ }
+
+ // ------------------------------------------------------------------------
+ /**
+  * Re-entrancy guard for {@code getLength}: set while a length lookup is in
+  * progress so that a nested lookup is reported as a loop.
+  */
+ private boolean inGetLength = false;
+
+ /**
+  * Returns the length value that is either given directly by the passed
+  * object or stored in the object it references.
+  * <p>
+  * A referenced object that has not been read yet is parsed on demand; the
+  * current position in the PDF source is remembered before and re-established
+  * after that parse.
+  *
+  * @param lengthBaseObj length entry as found in the dictionary; may be
+  *        {@code null}
+  *
+  * @return the length as number, or {@code null} if {@code lengthBaseObj}
+  *         is {@code null}
+  *
+  * @throws IOException if this method is entered again while a lookup is
+  *         still running, if the referenced object could not be read, or if
+  *         the given or the referenced object is not a number
+  */
+ private COSNumber getLength( final COSBase lengthBaseObj ) throws IOException
+ {
+     if ( lengthBaseObj == null )
+     {
+         return null;
+     }
+     if ( inGetLength )
+     {
+         throw new IOException( "Loop while reading length from " + lengthBaseObj );
+     }
+
+     inGetLength = true;
+     try
+     {
+         // ---- length given directly
+         if ( lengthBaseObj instanceof COSNumber )
+         {
+             return (COSNumber) lengthBaseObj;
+         }
+
+         // ---- anything but a reference is not acceptable from here on
+         if ( !( lengthBaseObj instanceof COSObject ) )
+         {
+             throw new IOException( "Wrong type of length object: " + lengthBaseObj.getClass().getSimpleName() );
+         }
+
+         // ---- length stored in referenced object
+         final COSObject lengthRef = (COSObject) lengthBaseObj;
+         COSBase resolved = lengthRef.getObject();
+
+         if ( resolved == null )
+         {
+             // not read so far: remember where we are, parse the referenced
+             // object, then return to the remembered position
+             final long savedOffset = getPdfSourceOffset();
+             releasePdfSourceInputStream();
+
+             parseObjectDynamically( lengthRef, true );
+
+             setPdfSource( savedOffset );
+
+             resolved = lengthRef.getObject();
+             if ( resolved == null )
+             {
+                 throw new IOException( "Length object content was not read." );
+             }
+         }
+
+         if ( resolved instanceof COSNumber )
+         {
+             return (COSNumber) resolved;
+         }
+
+         throw new IOException( "Wrong type of referenced length object " + lengthRef + ": " +
+                                resolved.getClass().getSimpleName() );
+     }
+     finally
+     {
+         inGetLength = false;
+     }
+ }
+
+ // ------------------------------------------------------------------------
+ /** Size in bytes of {@link #streamCopyBuf}. */
+ private final int streamCopyBufLen = 8192;
+ /** Buffer used by {@code parseCOSStream} to copy stream data chunk by chunk. */
+ private final byte[] streamCopyBuf = new byte[ streamCopyBufLen ];
+
+ /**
+  * This will read a COSStream from the input stream using length attribute
+  * within dictionary.
+  * If length attribute is an indirect reference it is first resolved to get
+  * the stream length. This means we copy stream data without testing for
+  * 'endstream' or 'endobj' and thus it is no problem if these keywords
+  * occur within stream.
+  * We require 'endstream' to be found after stream data is read.
+  *
+  * @param dic dictionary that goes with this stream.
+  * @param file file to write the stream to when reading.
+  *
+  * @return parsed pdf stream.
+  *
+  * @throws IOException if an error occurred reading the stream, like problems
+  *         with reading length attribute, stream does not end with 'endstream'
+  *         after data read, stream too short etc.
+  */
+ @Override
+ protected COSStream parseCOSStream( COSDictionary dic, RandomAccess file ) throws IOException
+ {
+     final COSStream stream = new COSStream( dic, file );
+     OutputStream out = null;
+     try
+     {
+         readString(); // read 'stream'; this was already tested in parseObjectDynamically()
+
+         // ---- skip whitespaces before start of data
+         // PDF Ref 1.7, chap. 3.2.7:
+         // 'stream' should be followed by either a CRLF (0x0d 0x0a) or LF but nothing else.
+         {
+             int whitespace = pdfSource.read();
+
+             // see brother_scan_cover.pdf, it adds whitespaces
+             // after the stream but before the start of the
+             // data, so just read those first
+             while ( whitespace == 0x20 )
+             {
+                 whitespace = pdfSource.read();
+             }
+
+             if ( whitespace == 0x0D )
+             {
+                 whitespace = pdfSource.read();
+                 // a lone CR: the spec says this is invalid but it happens in
+                 // the real world so we must support it;
+                 // -1 signals end of input and must not be pushed back since
+                 // it would show up again as a data byte (0xFF)
+                 if ( whitespace != 0x0A && whitespace != -1 )
+                 {
+                     pdfSource.unread( whitespace );
+                 }
+             }
+             else if ( whitespace != 0x0A && whitespace != -1 )
+             {
+                 // no whitespace after 'stream'; PDF ref. says 'should' so that is ok;
+                 // again, never push back the end-of-input marker
+                 pdfSource.unread( whitespace );
+             }
+         }
+
+         /* This needs to be dic.getItem because when we are parsing, the underlying object
+          * might still be null.
+          */
+         COSNumber streamLengthObj = getLength( dic.getItem( COSName.LENGTH ) );
+         if ( streamLengthObj == null )
+         {
+             throw new IOException( "Missing length for stream." );
+         }
+
+         // ---- get output stream to copy data to
+         out = stream.createFilteredStream( streamLengthObj );
+
+         long remainBytes = streamLengthObj.longValue();
+
+         while ( remainBytes > 0 )
+         {
+             // never request more than the buffer holds or than is still missing
+             final int chunkLen = ( remainBytes > streamCopyBufLen ) ? streamCopyBufLen : (int) remainBytes;
+             final int readBytes = pdfSource.read( streamCopyBuf, 0, chunkLen );
+             if ( readBytes <= 0 )
+             {
+                 throw new IOException( "No more bytes from stream but expected: " + remainBytes );
+             }
+
+             out.write( streamCopyBuf, 0, readBytes );
+
+             remainBytes -= readBytes;
+         }
+
+         String endStream = readString();
+
+         if ( ! endStream.equals( "endstream" ) )
+         {
+             throw new IOException( "Error reading stream using length value. Expected='endstream' actual='" + endStream + "' " );
+         }
+     }
+     finally
+     {
+         if ( out != null )
+         {
+             out.close();
+         }
+     }
+     return stream;
+ }
+}
Propchange:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange:
pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
------------------------------------------------------------------------------
svn:mime-type = text/plain