svn commit: r1652599 - in /pdfbox/trunk: pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ preflight/src/main/java/org/apache/pdfbox/preflight/parser/

lehmi Sat, 17 Jan 2015 04:57:53 -0800

Author: lehmi
Date: Sat Jan 17 12:57:29 2015
New Revision: 1652599

URL: http://svn.apache.org/r1652599
Log:
PDFBOX-2600: merged PDFParser into NonSequentialPDFParser


Removed:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/PDFParser.java
Modified:
    pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
    pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java

Modified: pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java?rev=1652599&r1=1652598&r2=1652599&view=diff
==============================================================================
--- pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java (original)
+++ pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/pdfparser/NonSequentialPDFParser.java Sat Jan 17 12:57:29 2015
@@ -64,20 +64,27 @@ import org.apache.pdfbox.pdmodel.encrypt
 import org.apache.pdfbox.pdmodel.encryption.PublicKeyDecryptionMaterial;
 import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
+import org.apache.pdfbox.pdmodel.fdf.FDFDocument;
 import org.apache.pdfbox.persistence.util.COSObjectKey;
 
 /**
- * PDFParser which first reads startxref and xref tables in order to know valid objects and parse only these objects.
+ * PDF-Parser which first reads startxref and xref tables in order to know valid objects and parse only these objects.
  * Thus it is closer to a conforming parser than the sequential reading of {@link PDFParser}.
  * 
- * This class can be used as a {@link PDFParser} replacement. First {@link #parse()} must be called before page objects
+ * First {@link #parse()} must be called before page objects
  * can be retrieved, e.g. {@link #getPDDocument()}.
  * 
  * This class is a much enhanced version of <code>QuickParser</code> presented in <a
  * href="https://issues.apache.org/jira/browse/PDFBOX-1104">PDFBOX-1104</a> by Jeremy Villalobos.
  */
-public class NonSequentialPDFParser extends PDFParser
+public class NonSequentialPDFParser extends BaseParser
 {
+    private static final String PDF_HEADER = "%PDF-";
+    private static final String FDF_HEADER = "%FDF-";
+    
+    private static final String PDF_DEFAULT_VERSION = "1.4";
+    private static final String FDF_DEFAULT_VERSION = "1.0";
+
     private static final byte[] XREF_TABLE = new byte[] { 'x', 'r', 'e', 'f' };
     private static final byte[] XREF_STREAM = new byte[] { '/', 'X', 'R', 'e', 'f' };
     private static final long MINIMUM_SEARCH_OFFSET = 6;
@@ -164,6 +171,15 @@ public class NonSequentialPDFParser exte
 
     private static final Log LOG = LogFactory.getLog(NonSequentialPDFParser.class);
 
+    private boolean isFDFDocment = false;
+
+    /** 
+     * Collects all Xref/trailer objects and resolves them into single
+     * object using startxref reference. 
+     */
+    protected XrefTrailerResolver xrefTrailerResolver = new XrefTrailerResolver();
+
+
     /**
      * <code>true</code> if the NonSequentialPDFParser is initialized by a InputStream, in this case a temporary file is
      * created. At the end of the {@linkplain #parse()} method,the temporary file will be deleted.
@@ -431,7 +447,14 @@ public class NonSequentialPDFParser exte
         }
     }
 
-    @Override
+    /**
+     * This will get the PD document that was parsed.  When you are done with
+     * this document you must call close() on it to release resources.
+     *
+     * @return The document at the PD layer.
+     *
+     * @throws IOException If there is an error getting the document.
+     */
     public PDDocument getPDDocument() throws IOException
     {
         return new PDDocument( getDocument(), this, accessPermission );
@@ -2241,4 +2264,357 @@ public class NonSequentialPDFParser exte
             pdfSource.seek(originOffset);
         }
     }
+
+    /**
+     * This will parse the startxref section from the stream.
+     * The startxref value is ignored.
+     *
+     * @return false on parsing error
+     * @throws IOException If an IO error occurs.
+     */
+    private boolean parseStartXref() throws IOException
+    {
+        if(pdfSource.peek() != 's')
+        {
+            return false;
+        }
+        String startXRef = readString();
+        if( !startXRef.trim().equals( "startxref" ) )
+        {
+            return false;
+        }
+        skipSpaces();
+        /* This integer is the byte offset of the first object referenced by the xref or xref stream
+         * Needed for the incremental update (PREV)
+         */
+        getDocument().setStartXref(readLong());
+        return true;
+    }
+
+    /**
+     * This will parse the trailer from the stream and add it to the state.
+     *
+     * @return false on parsing error
+     * @throws IOException If an IO error occurs.
+     */
+    private boolean parseTrailer() throws IOException
+    {
+        if(pdfSource.peek() != 't')
+        {
+            return false;
+        }
+        //read "trailer"
+        String nextLine = readLine();
+        if( !nextLine.trim().equals( "trailer" ) )
+        {
+            // in some cases the EOL is missing and the trailer immediately
+            // continues with "<<" or with a blank character
+            // even if this does not comply with PDF reference we want to support as many PDFs as possible
+            // Acrobat reader can also deal with this.
+            if (nextLine.startsWith("trailer"))
+            {
+                byte[] b = nextLine.getBytes(ISO_8859_1);
+                int len = "trailer".length();
+                pdfSource.unread('\n');
+                pdfSource.unread(b, len, b.length-len);
+            }
+            else
+            {
+                return false;
+            }
+        }
+    
+        // in some cases the EOL is missing and the trailer continues with " <<"
+        // even if this does not comply with PDF reference we want to support as many PDFs as possible
+        // Acrobat reader can also deal with this.
+        skipSpaces();
+    
+        COSDictionary parsedTrailer = parseCOSDictionary();
+        xrefTrailerResolver.setTrailer( parsedTrailer );
+    
+        // The version can also be specified within the document /Catalog
+        readVersionInTrailer(parsedTrailer);
+    
+        skipSpaces();
+        return true;
+    }
+
+    private void parseHeader() throws IOException
+    {
+        // read first line
+        String header = readLine();
+        // some pdf-documents are broken and the pdf-version is in one of the following lines
+        if (!header.contains(PDF_HEADER) && !header.contains(FDF_HEADER))
+        {
+            header = readLine();
+            while (!header.contains(PDF_HEADER) && !header.contains(FDF_HEADER))
+            {
+                // if a line starts with a digit, it has to be the first one with data in it
+                if ((header.length() > 0) && (Character.isDigit(header.charAt(0))))
+                {
+                    break;
+                }
+                header = readLine();
+            }
+        }
+    
+        // nothing found
+        if (!header.contains(PDF_HEADER) && !header.contains(FDF_HEADER))
+        {
+            throw new IOException( "Error: Header doesn't contain versioninfo" );
+        }
+    
+        //sometimes there is some garbage in the header before the header
+        //actually starts, so lets try to find the header first.
+        int headerStart = header.indexOf( PDF_HEADER );
+        if (headerStart == -1)
+        {
+            headerStart = header.indexOf(FDF_HEADER);
+        }
+    
+        //greater than zero because if it is zero then
+        //there is no point of trimming
+        if ( headerStart > 0 )
+        {
+            //trim off any leading characters
+            header = header.substring( headerStart, header.length() );
+        }
+    
+        /*
+         * This is used if there is garbage after the header on the same line
+         */
+        if (header.startsWith(PDF_HEADER))
+        {
+            if (!header.matches(PDF_HEADER + "\\d.\\d"))
+            {
+    
+                if (header.length() < PDF_HEADER.length() + 3)
+                {
+                    // No version number at all, set to 1.4 as default
+                    header = PDF_HEADER + PDF_DEFAULT_VERSION;
+                    LOG.debug("No pdf version found, set to " + PDF_DEFAULT_VERSION + " as default.");
+                }
+                else
+                {
+                    String headerGarbage = header.substring(PDF_HEADER.length() + 3, header.length()) + "\n";
+                    header = header.substring(0, PDF_HEADER.length() + 3);
+                    pdfSource.unread(headerGarbage.getBytes(ISO_8859_1));
+                }
+            }
+        }
+        else
+        {
+            isFDFDocment = true;
+            if (!header.matches(FDF_HEADER + "\\d.\\d"))
+            {
+                if (header.length() < FDF_HEADER.length() + 3)
+                {
+                    // No version number at all, set to 1.0 as default
+                    header = FDF_HEADER + FDF_DEFAULT_VERSION;
+                    LOG.debug("No fdf version found, set to " + FDF_DEFAULT_VERSION + " as default.");
+                }
+                else
+                {
+                    String headerGarbage = header.substring(FDF_HEADER.length() + 3, header.length()) + "\n";
+                    header = header.substring(0, FDF_HEADER.length() + 3);
+                    pdfSource.unread(headerGarbage.getBytes(ISO_8859_1));
+                }
+            }
+        }
+        document.setHeaderString(header);
+    
+        try
+        {
+            if (header.startsWith( PDF_HEADER ))
+            {
+                float pdfVersion = Float. parseFloat(
+                        header.substring( PDF_HEADER.length(), Math.min( header.length(), PDF_HEADER .length()+3) ) );
+                document.setVersion( pdfVersion );
+            }
+            else
+            {
+                float pdfVersion = Float. parseFloat(
+                        header.substring( FDF_HEADER.length(), Math.min( header.length(), FDF_HEADER.length()+3) ) );
+                document.setVersion( pdfVersion );
+            }
+        }
+        catch ( NumberFormatException e )
+        {
+            throw new IOException( "Error getting pdf version: " + e.getMessage(), e );
+        }
+    }
+
+    /**
+     * The document catalog can also have a /Version parameter which overrides the version specified
+     * in the header if, and only if it is greater.
+     *
+     * @param parsedTrailer the parsed catalog in the trailer
+     */
+    private void readVersionInTrailer(COSDictionary parsedTrailer)
+    {
+        COSObject root = (COSObject) parsedTrailer.getItem(COSName.ROOT);
+        if (root != null)
+        {
+            COSBase item = root.getItem(COSName.VERSION);
+            if (item instanceof COSName)
+            {
+                COSName version = (COSName) item;
+                float trailerVersion = Float.valueOf(version.getName());
+                if (trailerVersion > document.getVersion())
+                {
+                    document.setVersion(trailerVersion);
+                }
+            }
+            else if (item != null)
+            {
+                LOG.warn("Incorrect /Version entry is ignored: " + item);
+            }
+        }
+    }
+
+    /**
+     * This will parse the xref table from the stream and add it to the state
+     * The XrefTable contents are ignored.
+     * @param startByteOffset the offset to start at
+     * @return false on parsing error
+     * @throws IOException If an IO error occurs.
+     */
+    protected boolean parseXrefTable(long startByteOffset) throws IOException
+    {
+        if(pdfSource.peek() != 'x')
+        {
+            return false;
+        }
+        String xref = readString();
+        if( !xref.trim().equals( "xref" ) )
+        {
+            return false;
+        }
+        
+        // check for trailer after xref
+        String str = readString();
+        byte[] b = str.getBytes(ISO_8859_1);
+        pdfSource.unread(b, 0, b.length);
+        
+        // signal start of new XRef
+        xrefTrailerResolver.nextXrefObj( startByteOffset, XRefType.TABLE );
+    
+        if (str.startsWith("trailer"))
+        {
+            LOG.warn("skipping empty xref table");
+            return false;
+        }
+        
+        /*
+         * Xref tables can have multiple sections.
+         * Each starts with a starting object id and a count.
+         */
+        while(true)
+        {
+            long currObjID = readObjectNumber(); // first obj id
+            long count = readLong(); // the number of objects in the xref table
+            skipSpaces();
+            for(int i = 0; i < count; i++)
+            {
+                if(pdfSource.isEOF() || isEndOfName((char)pdfSource.peek()))
+                {
+                    break;
+                }
+                if(pdfSource.peek() == 't')
+                {
+                    break;
+                }
+                //Ignore table contents
+                String currentLine = readLine();
+                String[] splitString = currentLine.split("\\s");
+                if (splitString.length < 3)
+                {
+                    LOG.warn("invalid xref line: " + currentLine);
+                    break;
+                }
+                /* This supports the corrupt table as reported in
+                 * PDFBOX-474 (XXXX XXX XX n) */
+                if(splitString[splitString.length-1].equals("n"))
+                {
+                    try
+                    {
+                        long currOffset = Long.parseLong(splitString[0]);
+                        int currGenID = Integer.parseInt(splitString[1]);
+                        COSObjectKey objKey = new COSObjectKey(currObjID, currGenID);
+                        xrefTrailerResolver.setXRef(objKey, currOffset);
+                    }
+                    catch(NumberFormatException e)
+                    {
+                        throw new IOException(e);
+                    }
+                }
+                else if(!splitString[2].equals("f"))
+                {
+                    throw new IOException("Corrupt XRefTable Entry - ObjID:" + currObjID);
+                }
+                currObjID++;
+                skipSpaces();
+            }
+            skipSpaces();
+            char c = (char)pdfSource.peek();
+            if(c < '0' || c > '9')
+            {
+                break;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Fills XRefTrailerResolver with data of given stream.
+     * Stream must be of type XRef.
+     * @param stream the stream to be read
+     * @param objByteOffset the offset to start at
+     * @param isStandalone should be set to true if the stream is not part of a hybrid xref table
+     * @throws IOException if there is an error parsing the stream
+     */
+    private void parseXrefStream(COSStream stream, long objByteOffset, boolean isStandalone) throws IOException
+    {
+        // the cross reference stream of a hybrid xref table will be added to the existing one
+        // and we must not override the offset and the trailer
+        if ( isStandalone )
+        {
+            xrefTrailerResolver.nextXrefObj( objByteOffset, XRefType.STREAM );
+            xrefTrailerResolver.setTrailer( stream );
+        }        
+        PDFXrefStreamParser parser =
+                new PDFXrefStreamParser( stream, document, xrefTrailerResolver );
+        parser.parse();
+    }
+
+    /**
+     * This will get the document that was parsed.  parse() must be called before this is called.
+     * When you are done with this document you must call close() on it to release
+     * resources.
+     *
+     * @return The document that was parsed.
+     *
+     * @throws IOException If there is an error getting the document.
+     */
+    public COSDocument getDocument() throws IOException
+    {
+        if( document == null )
+        {
+            throw new IOException( "You must call parse() before calling getDocument()" );
+        }
+        return document;
+    }
+
+    /**
+     * This will get the FDF document that was parsed.  When you are done with
+     * this document you must call close() on it to release resources.
+     *
+     * @return The document at the PD layer.
+     *
+     * @throws IOException If there is an error getting the document.
+     */
+    public FDFDocument getFDFDocument() throws IOException
+    {
+        return new FDFDocument( getDocument() );
+    }
 }

Modified: pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java
URL: http://svn.apache.org/viewvc/pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java?rev=1652599&r1=1652598&r2=1652599&view=diff
==============================================================================
--- pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java (original)
+++ pdfbox/trunk/preflight/src/main/java/org/apache/pdfbox/preflight/parser/PreflightParser.java Sat Jan 17 12:57:29 2015
@@ -73,7 +73,6 @@ import org.apache.pdfbox.io.IOUtils;
 import org.apache.pdfbox.pdfparser.BaseParser;
 import org.apache.pdfbox.pdfparser.NonSequentialPDFParser;
 import org.apache.pdfbox.pdfparser.PDFObjectStreamParser;
-import org.apache.pdfbox.pdfparser.PDFParser;
 import org.apache.pdfbox.pdfparser.XrefTrailerResolver.XRefType;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.persistence.util.COSObjectKey;

svn commit: r1652599 - in /pdfbox/trunk: pdfbox/src/main/java/org/apache/pdfbox/pdfparser/ preflight/src/main/java/org/apache/pdfbox/preflight/parser/

Reply via email to